liblloyal 1.0.0
Branched Inference for llama.cpp
Loading...
Searching...
No Matches
chat_in.hpp
Go to the documentation of this file.
1#pragma once
2
3// SPDX-License-Identifier: Apache-2.0
4// Copyright 2026 Lloyal Labs
5
39#include "common.hpp"
40#include "tokenizer.hpp"
41#include <llama/llama.h>
42#include <chat.h> // llama.cpp common library: common_chat_templates_*
43#include <nlohmann/json.hpp>
44#include <algorithm>
45#include <exception>
46#include <string>
47#include <vector>
48
56namespace lloyal::chat_in {
57
/**
 * @brief Input parameters for chat formatting.
 *
 * Plain aggregate consumed by format(). Only `messages_json` is required;
 * every other member has a usable default.
 */
struct FormatInputs {
  std::string messages_json;              ///< JSON array of OpenAI-format messages (required)
  std::string template_override;          ///< Optional Jinja2 template override (empty = use the model's template)
  // NOTE(review): the default below is inferred from this member's
  // documentation ("set false for partial formatting"); confirm against the
  // upstream header.
  bool add_generation_prompt = true;      ///< Append assistant prompt prefix (set false for partial formatting)
  std::string tools_json;                 ///< JSON array of OpenAI-format tool definitions
  std::string tool_choice = "auto";       ///< "auto" | "required" | "none"
  bool parallel_tool_calls = false;       ///< Allow parallel tool calls
  std::string reasoning_format = "none";  ///< "none" | "auto" | "deepseek" | "deepseek_legacy"
  bool enable_thinking = true;            ///< Enable <think> blocks (pairs with reasoning_format)
  std::string json_schema;                ///< JSON schema for structured output
  std::string grammar;                    ///< Explicit GBNF grammar string
};
76
84 // Core output
85 std::string prompt;
86 std::vector<std::string> additional_stops;
87
88 // Format awareness (all fields from common_chat_params)
89 common_chat_format format = COMMON_CHAT_FORMAT_CONTENT_ONLY;
90 std::string grammar;
91 bool grammar_lazy = false;
92 bool thinking_forced_open = false;
93 std::vector<common_grammar_trigger> grammar_triggers;
94 std::vector<std::string> preserved_tokens;
95 std::string parser;
96
97 // Carried through for chat_out pairing
98 common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_NONE;
99};
100
135inline FormatResult format(const llama_model *model, const FormatInputs& inputs) {
136 FormatResult result;
137
138 try {
139 using json = nlohmann::ordered_json;
140 json messages_array = json::parse(inputs.messages_json);
141
142 // Initialize templates from model (or override)
143 common_chat_templates_ptr tmpls = common_chat_templates_init(model, inputs.template_override);
144 if (!tmpls) {
145 LLOYAL_LOG_DEBUG("[chat_in::format] Template init failed, using fallback");
146 goto fallback;
147 }
148
149 {
150 // Parse messages
151 std::vector<common_chat_msg> messages = common_chat_msgs_parse_oaicompat(messages_array);
152
153 // Build full template inputs
154 common_chat_templates_inputs tmpl_inputs;
155 tmpl_inputs.messages = messages;
156 tmpl_inputs.add_generation_prompt = inputs.add_generation_prompt;
157 tmpl_inputs.use_jinja = true;
158
159 // Tools
160 if (!inputs.tools_json.empty()) {
161 json tools_array = json::parse(inputs.tools_json);
162 tmpl_inputs.tools = common_chat_tools_parse_oaicompat(tools_array);
163 tmpl_inputs.tool_choice = common_chat_tool_choice_parse_oaicompat(inputs.tool_choice);
164 tmpl_inputs.parallel_tool_calls = inputs.parallel_tool_calls;
165 }
166
167 // Reasoning
168 tmpl_inputs.reasoning_format = common_reasoning_format_from_name(inputs.reasoning_format);
169 tmpl_inputs.enable_thinking = inputs.enable_thinking;
170
171 // Structured output
172 tmpl_inputs.json_schema = inputs.json_schema;
173 tmpl_inputs.grammar = inputs.grammar;
174
175 // Apply template
176 common_chat_params params = common_chat_templates_apply(tmpls.get(), tmpl_inputs);
177
178 // Implicit empty system prompt stripping: if messages[0] is {system, ""},
179 // strip the resulting empty system block from the output. This lets callers
180 // suppress template auto-injection (e.g. SmolLM2/ChatML) by prepending an
181 // empty system message — the library completes the intent by removing the
182 // rendered empty block, leaving only the user+assistant portion.
183 if (!messages.empty() && messages[0].role == "system" && messages[0].content.empty()) {
184 common_chat_msg sys_msg;
185 sys_msg.role = "system";
186 sys_msg.content = "";
187
188 common_chat_templates_inputs sys_inputs;
189 sys_inputs.messages = {sys_msg};
190 sys_inputs.add_generation_prompt = false;
191 sys_inputs.use_jinja = true;
192 auto sys_params = common_chat_templates_apply(tmpls.get(), sys_inputs);
193
194 const auto& sys_prefix = sys_params.prompt;
195 if (!sys_prefix.empty() &&
196 params.prompt.size() >= sys_prefix.size() &&
197 params.prompt.substr(0, sys_prefix.size()) == sys_prefix) {
198 params.prompt = params.prompt.substr(sys_prefix.size());
199 LLOYAL_LOG_DEBUG("[chat_in::format] Stripped empty system prefix (%zu bytes)", sys_prefix.size());
200 }
201 }
202
203 // Populate ALL result fields from common_chat_params
204 result.prompt = params.prompt;
205 result.additional_stops = params.additional_stops;
206 result.format = params.format;
207 result.grammar = params.grammar;
208 result.grammar_lazy = params.grammar_lazy;
209 result.thinking_forced_open = params.thinking_forced_open;
210 result.grammar_triggers = params.grammar_triggers;
211 result.preserved_tokens = params.preserved_tokens;
212 result.parser = params.parser;
213
214 // Carry reasoning_format through for chat_out pairing
215 result.reasoning_format = tmpl_inputs.reasoning_format;
216
218 "[chat_in::format] Successfully formatted with format=%d, %zu stop tokens, grammar=%zu bytes",
219 static_cast<int>(result.format),
220 result.additional_stops.size(),
221 result.grammar.size());
222 return result;
223 }
224
225 } catch (const std::exception &e) {
226 LLOYAL_LOG_DEBUG("[chat_in::format] Template processing failed: %s", e.what());
227 }
228
229fallback:
230 // Fallback to simple "role: content" format
231 try {
232 using json = nlohmann::ordered_json;
233 json messages = json::parse(inputs.messages_json);
234 std::string fallback_prompt;
235 for (const auto &msg : messages) {
236 if (msg.contains("role") && msg.contains("content")) {
237 std::string role = msg["role"].get<std::string>();
238 std::string content;
239 const auto& c = msg["content"];
240 if (c.is_null()) {
241 content = "";
242 } else if (c.is_string()) {
243 content = c.get<std::string>();
244 } else {
245 content = c.dump();
246 }
247 fallback_prompt += role + ": " + content + "\n";
248 }
249 }
250
251 result.prompt = fallback_prompt;
252 result.additional_stops = {};
253
255 "[chat_in::format] Using fallback format (%zu bytes)",
256 fallback_prompt.size());
257 return result;
258
259 } catch (const std::exception &e) {
261 "[chat_in::format] ERROR: Failed to parse messages JSON: %s",
262 e.what());
263 result.prompt = "";
264 result.additional_stops = {};
265 return result;
266 }
267}
268
280inline bool validate(const std::string &template_str) {
281 try {
282 bool isValid = common_chat_verify_template(template_str, /* use_jinja */ true);
283 LLOYAL_LOG_DEBUG("[chat_in::validate] Template validation: %s",
284 isValid ? "valid" : "invalid");
285 return isValid;
286 } catch (const std::exception &e) {
287 LLOYAL_LOG_DEBUG("[chat_in::validate] ERROR: %s", e.what());
288 return false;
289 }
290}
291
301inline std::vector<llama_token> fallback_to_eog(const llama_model* model) {
302 if (model == nullptr) {
303 return {};
304 }
305 const llama_vocab* vocab = llama_model_get_vocab(model);
306 llama_token eot = llama_vocab_eot(vocab);
307 if (eot == LLAMA_TOKEN_NULL) {
308 eot = llama_vocab_eos(vocab);
309 }
310 if (eot != LLAMA_TOKEN_NULL) {
311 return {eot};
312 }
313 return {};
314}
315
323inline std::string get_token_safe(const llama_model *model, llama_token token) {
324 if (!model || token == LLAMA_TOKEN_NULL) {
325 return "";
326 }
327 return lloyal::tokenizer::detokenize(model, token);
328}
329
370inline std::vector<llama_token> get_turn_separator(const llama_model* model) {
371 using json = nlohmann::ordered_json;
372
373 if (!model) return {};
374
375 // Collision-resistant sentinels
376 const std::string SENTINEL = "\x1F__LLOYAL_SEP__\x1F";
377 const std::string SENTINEL2 = "\x1F__LLOYAL_SEP2__\x1F";
378
379 try {
380 // Initialize templates from model
381 common_chat_templates_ptr tmpls = common_chat_templates_init(model, "");
382 if (!tmpls) {
383 return fallback_to_eog(model);
384 }
385
386 // 3-message probe: captures REAL assistant→user boundary
387 std::vector<common_chat_msg> messages = {
388 {.role = "user", .content = "X"},
389 {.role = "assistant", .content = SENTINEL},
390 {.role = "user", .content = SENTINEL2}
391 };
392
393 common_chat_templates_inputs inputs;
394 inputs.messages = messages;
395 inputs.add_generation_prompt = false; // Don't add assistant prompt at end
396 inputs.use_jinja = true;
397
398 auto params = common_chat_templates_apply(tmpls.get(), inputs);
399 const std::string& formatted = params.prompt;
400
401 // Extract substring between sentinels
402 size_t sep_start = formatted.rfind(SENTINEL);
403 if (sep_start == std::string::npos) {
404 return fallback_to_eog(model);
405 }
406 sep_start += SENTINEL.length();
407
408 size_t sep_end = formatted.find(SENTINEL2, sep_start);
409 if (sep_end == std::string::npos) {
410 return fallback_to_eog(model);
411 }
412
413 std::string between = formatted.substr(sep_start, sep_end - sep_start);
414 if (between.empty()) {
415 return fallback_to_eog(model);
416 }
417
418 // Tokenize with parse_special=true
419 const auto* vocab = llama_model_get_vocab(model);
420 std::vector<llama_token> tokens = lloyal::tokenizer::tokenize(vocab, between, false, true);
421 if (tokens.empty()) {
422 return fallback_to_eog(model);
423 }
424
425 // Extract: everything up to and including EOG + trailing whitespace
426 std::vector<llama_token> separator;
427 bool found_eog = false;
428
429 for (auto tok : tokens) {
430 if (!found_eog) {
431 separator.push_back(tok);
432 if (lloyal::tokenizer::is_eog(model, tok)) {
433 found_eog = true;
434 }
435 } else {
436 // After EOG, only keep whitespace tokens
437 std::string text = lloyal::tokenizer::detokenize(model, tok);
438 bool is_whitespace = !text.empty() && std::all_of(text.begin(), text.end(),
439 [](unsigned char c) { return c == ' ' || c == '\n' || c == '\r' || c == '\t'; });
440 if (is_whitespace) {
441 separator.push_back(tok);
442 } else {
443 break; // Non-whitespace = next message opener, stop
444 }
445 }
446 }
447
448 if (separator.empty() || !found_eog) {
449 return fallback_to_eog(model);
450 }
451
452 return separator;
453
454 } catch (const std::exception& e) {
455 LLOYAL_LOG_DEBUG("[chat_in::get_turn_separator] Error: %s", e.what());
456 return fallback_to_eog(model);
457 }
458}
459
460} // namespace lloyal::chat_in
#define LLOYAL_LOG_DEBUG(...)
liblloyal - Common definitions and logging
Definition common.hpp:47
Chat input formatting with full format awareness.
Definition chat_in.hpp:56
std::string get_token_safe(const llama_model *model, llama_token token)
Get token text safely.
Definition chat_in.hpp:323
std::vector< llama_token > get_turn_separator(const llama_model *model)
Get turn separator tokens for the model's chat template.
Definition chat_in.hpp:370
FormatResult format(const llama_model *model, const FormatInputs &inputs)
Format chat messages using model's chat template with full format awareness.
Definition chat_in.hpp:135
std::vector< llama_token > fallback_to_eog(const llama_model *model)
Get EOG token as fallback when template parsing fails.
Definition chat_in.hpp:301
bool validate(const std::string &template_str)
Validate chat template syntax.
Definition chat_in.hpp:280
std::vector< llama_token > tokenize(const llama_vocab *vocab, const std::string &text, bool add_special, bool parse_special)
Tokenize text to token array.
Definition tokenizer.hpp:38
std::string detokenize(const llama_vocab *vocab, llama_token token, bool special)
Detokenize SINGLE token to text (streaming use case)
Definition tokenizer.hpp:91
bool is_eog(const llama_vocab *vocab, llama_token token)
Check if token is end-of-generation marker.
Input parameters for chat formatting.
Definition chat_in.hpp:64
std::string messages_json
JSON array of OpenAI-format messages (required)
Definition chat_in.hpp:65
bool add_generation_prompt
Append assistant prompt prefix (set false for partial formatting)
Definition chat_in.hpp:67
std::string tools_json
JSON array of OpenAI-format tool definitions.
Definition chat_in.hpp:68
std::string json_schema
JSON schema for structured output.
Definition chat_in.hpp:73
std::string grammar
Explicit GBNF grammar string.
Definition chat_in.hpp:74
std::string template_override
Optional Jinja2 template override.
Definition chat_in.hpp:66
bool enable_thinking
Enable <think> blocks (pairs with reasoning_format)
Definition chat_in.hpp:72
std::string reasoning_format
"none" | "auto" | "deepseek" | "deepseek_legacy"
Definition chat_in.hpp:71
std::string tool_choice
"auto" | "required" | "none"
Definition chat_in.hpp:69
bool parallel_tool_calls
Allow parallel tool calls.
Definition chat_in.hpp:70
Result from chat template formatting with full format awareness.
Definition chat_in.hpp:83
std::vector< common_grammar_trigger > grammar_triggers
Triggers for lazy grammar activation.
Definition chat_in.hpp:93
common_reasoning_format reasoning_format
Reasoning format for output parsing.
Definition chat_in.hpp:98
std::vector< std::string > additional_stops
Stop tokens extracted from template.
Definition chat_in.hpp:86
bool thinking_forced_open
Whether thinking tag is forced open.
Definition chat_in.hpp:92
std::string prompt
Formatted prompt text ready for tokenization.
Definition chat_in.hpp:85
bool grammar_lazy
Whether grammar should use lazy compilation.
Definition chat_in.hpp:91
std::string grammar
GBNF grammar for constrained sampling.
Definition chat_in.hpp:90
std::vector< std::string > preserved_tokens
Tokens to preserve during grammar constraining.
Definition chat_in.hpp:94
std::string parser
PEG parser definition (for PEG formats)
Definition chat_in.hpp:95
common_chat_format format
Detected chat format.
Definition chat_in.hpp:89
Text Tokenization Operations.