liblloyal 1.0.0
Branched Inference for llama.cpp
chat_in.hpp
1#pragma once
2
3// SPDX-License-Identifier: Apache-2.0
4// Copyright 2026 Lloyal Labs
5
6
40#include "common.hpp"
41#include "tokenizer.hpp"
42#include <llama/llama.h>
43#include <chat.h> // llama.cpp common library: common_chat_templates_*
44#include <nlohmann/json.hpp>
45#include <algorithm>
46#include <exception>
47#include <string>
48#include <vector>
49
57namespace lloyal::chat_in {
58
65struct FormatInputs {
 66 std::string messages_json;
67 std::string template_override = "";
 68 bool add_generation_prompt = true;
 69 std::string tools_json = "";
70 std::string tool_choice = "auto";
71 bool parallel_tool_calls = false;
72 std::string reasoning_format = "none";
73 bool enable_thinking = true;
74 std::string json_schema = "";
75 std::string grammar = "";
76};
77
84struct FormatResult {
 85 // Core output
86 std::string prompt;
87 std::vector<std::string> additional_stops;
88
89 // Format awareness (all fields from common_chat_params)
90 common_chat_format format = COMMON_CHAT_FORMAT_CONTENT_ONLY;
91 std::string grammar;
92 bool grammar_lazy = false;
93 std::string generation_prompt;
94 std::vector<common_grammar_trigger> grammar_triggers;
95 std::vector<std::string> preserved_tokens;
96 std::string parser;
97
98 // Carried through for chat_out pairing
99 common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_NONE;
100};
101
136inline FormatResult format(const llama_model *model, const FormatInputs& inputs) {
137 FormatResult result;
138
139 try {
140 using json = nlohmann::ordered_json;
141 json messages_array = json::parse(inputs.messages_json);
142
143 // Initialize templates from model (or override)
144 common_chat_templates_ptr tmpls = common_chat_templates_init(model, inputs.template_override);
145 if (!tmpls) {
146 LLOYAL_LOG_DEBUG("[chat_in::format] Template init failed, using fallback");
147 goto fallback;
148 }
149
150 {
151 // Parse messages
152 std::vector<common_chat_msg> messages = common_chat_msgs_parse_oaicompat(messages_array);
153
154 // Build full template inputs
155 common_chat_templates_inputs tmpl_inputs;
156 tmpl_inputs.messages = messages;
157 tmpl_inputs.add_generation_prompt = inputs.add_generation_prompt;
158 tmpl_inputs.use_jinja = true;
159
160 // Tools
161 if (!inputs.tools_json.empty()) {
162 json tools_array = json::parse(inputs.tools_json);
163 tmpl_inputs.tools = common_chat_tools_parse_oaicompat(tools_array);
164 tmpl_inputs.tool_choice = common_chat_tool_choice_parse_oaicompat(inputs.tool_choice);
165 tmpl_inputs.parallel_tool_calls = inputs.parallel_tool_calls;
166 }
167
168 // Reasoning
169 tmpl_inputs.reasoning_format = common_reasoning_format_from_name(inputs.reasoning_format);
170 tmpl_inputs.enable_thinking = inputs.enable_thinking;
171
172 // Structured output
173 tmpl_inputs.json_schema = inputs.json_schema;
174 tmpl_inputs.grammar = inputs.grammar;
175
176 // Apply template
177 common_chat_params params = common_chat_templates_apply(tmpls.get(), tmpl_inputs);
178
179 // Implicit empty system prompt stripping: if messages[0] is {system, ""},
180 // strip the resulting empty system block from the output. This lets callers
181 // suppress template auto-injection (e.g. SmolLM2/ChatML) by prepending an
182 // empty system message — the library completes the intent by removing the
183 // rendered empty block, leaving only the user+assistant portion.
184 if (!messages.empty() && messages[0].role == "system" && messages[0].content.empty()) {
185 bool stripped = false;
186
187 // Primary: format [{system:""}] to learn the empty system prefix
188 try {
189 common_chat_msg sys_msg;
190 sys_msg.role = "system";
191 sys_msg.content = "";
192
193 common_chat_templates_inputs sys_inputs;
194 sys_inputs.messages = {sys_msg};
195 sys_inputs.add_generation_prompt = false;
196 sys_inputs.use_jinja = true;
197 auto sys_params = common_chat_templates_apply(tmpls.get(), sys_inputs);
198
199 const auto& sys_prefix = sys_params.prompt;
200 if (!sys_prefix.empty() &&
201 params.prompt.size() >= sys_prefix.size() &&
202 params.prompt.substr(0, sys_prefix.size()) == sys_prefix) {
203 params.prompt = params.prompt.substr(sys_prefix.size());
204 stripped = true;
205 LLOYAL_LOG_DEBUG("[chat_in::format] Stripped empty system prefix (%zu bytes)", sys_prefix.size());
206 }
207 } catch (const std::exception &e) {
208 LLOYAL_LOG_DEBUG("[chat_in::format] Primary stripping failed: %s", e.what());
209 }
210
211 // Sentinel fallback: template requires a user message (e.g. Qwen 3.5).
212 // Format [{system:""}, {user:SENTINEL}] and [{user:SENTINEL}], subtract
213 // to learn the empty system prefix.
214 if (!stripped) {
215 try {
216 static const std::string SENTINEL = "\x1F__LLOYAL_SYS_STRIP__\x1F";
217
218 common_chat_msg sys_msg;
219 sys_msg.role = "system";
220 sys_msg.content = "";
221 common_chat_msg user_msg;
222 user_msg.role = "user";
223 user_msg.content = SENTINEL;
224
225 common_chat_templates_inputs with_sys;
226 with_sys.messages = {sys_msg, user_msg};
227 with_sys.add_generation_prompt = false;
228 with_sys.use_jinja = true;
229 auto with_sys_params = common_chat_templates_apply(tmpls.get(), with_sys);
230
231 common_chat_templates_inputs without_sys;
232 without_sys.messages = {user_msg};
233 without_sys.add_generation_prompt = false;
234 without_sys.use_jinja = true;
235 auto without_sys_params = common_chat_templates_apply(tmpls.get(), without_sys);
236
237 const auto& with_prompt = with_sys_params.prompt;
238 const auto& without_prompt = without_sys_params.prompt;
239
240 // If with_sys ends with without_sys, the prefix is the difference
241 if (with_prompt.size() > without_prompt.size() &&
242 with_prompt.substr(with_prompt.size() - without_prompt.size()) == without_prompt) {
243 std::string sys_prefix = with_prompt.substr(0, with_prompt.size() - without_prompt.size());
244 if (!sys_prefix.empty() &&
245 params.prompt.size() >= sys_prefix.size() &&
246 params.prompt.substr(0, sys_prefix.size()) == sys_prefix) {
247 params.prompt = params.prompt.substr(sys_prefix.size());
248 LLOYAL_LOG_DEBUG("[chat_in::format] Stripped empty system prefix via sentinel (%zu bytes)", sys_prefix.size());
249 }
250 } else {
251 LLOYAL_LOG_DEBUG("[chat_in::format] Sentinel subtraction failed, skipping strip");
252 }
253 } catch (const std::exception &e) {
254 LLOYAL_LOG_DEBUG("[chat_in::format] Sentinel stripping also failed: %s", e.what());
255 }
256 }
257 }
258
259 // Populate ALL result fields from common_chat_params
260 result.prompt = params.prompt;
261 result.additional_stops = params.additional_stops;
262 result.format = params.format;
263 result.grammar = params.grammar;
264 result.grammar_lazy = params.grammar_lazy;
265 result.generation_prompt = params.generation_prompt;
266 result.grammar_triggers = params.grammar_triggers;
267 result.preserved_tokens = params.preserved_tokens;
268 result.parser = params.parser;
269
270 // Carry reasoning_format through for chat_out pairing
271 result.reasoning_format = tmpl_inputs.reasoning_format;
272
274 "[chat_in::format] Successfully formatted with format=%d, %zu stop tokens, grammar=%zu bytes",
275 static_cast<int>(result.format),
276 result.additional_stops.size(),
277 result.grammar.size());
278 return result;
279 }
280
281 } catch (const std::exception &e) {
282 LLOYAL_LOG_DEBUG("[chat_in::format] Template processing failed: %s", e.what());
283
284 // Retry with synthetic user: templates like Qwen 3.5 require a user message.
285 // Inject a sentinel user message, re-apply the template, then strip the
286 // sentinel user turn from the output to recover just the system/tool portion.
287 try {
288 using json = nlohmann::ordered_json;
289 json messages_array = json::parse(inputs.messages_json);
290
291 common_chat_templates_ptr tmpls = common_chat_templates_init(model, inputs.template_override);
292 if (tmpls) {
293 static const std::string SENTINEL = "\x1F__LLOYAL_RETRY__\x1F";
294
295 std::vector<common_chat_msg> messages = common_chat_msgs_parse_oaicompat(messages_array);
296
297 // Check that no user role exists (otherwise the failure was something else)
298 bool has_user = false;
299 for (const auto& m : messages) {
300 if (m.role == "user") { has_user = true; break; }
301 }
302
303 if (!has_user) {
304 // Build augmented messages: original + synthetic user
305 std::vector<common_chat_msg> augmented = messages;
306 common_chat_msg sentinel_user;
307 sentinel_user.role = "user";
308 sentinel_user.content = SENTINEL;
309 augmented.push_back(sentinel_user);
310
311 common_chat_templates_inputs tmpl_inputs;
312 tmpl_inputs.messages = augmented;
313 tmpl_inputs.add_generation_prompt = false;
314 tmpl_inputs.use_jinja = true;
315
316 // Carry over tools so the system block includes tool definitions
317 if (!inputs.tools_json.empty()) {
318 json tools_array = json::parse(inputs.tools_json);
319 tmpl_inputs.tools = common_chat_tools_parse_oaicompat(tools_array);
320 tmpl_inputs.tool_choice = common_chat_tool_choice_parse_oaicompat(inputs.tool_choice);
321 tmpl_inputs.parallel_tool_calls = inputs.parallel_tool_calls;
322 }
323 tmpl_inputs.reasoning_format = common_reasoning_format_from_name(inputs.reasoning_format);
324 tmpl_inputs.enable_thinking = inputs.enable_thinking;
325 tmpl_inputs.json_schema = inputs.json_schema;
326 tmpl_inputs.grammar = inputs.grammar;
327
328 common_chat_params params = common_chat_templates_apply(tmpls.get(), tmpl_inputs);
329
330 // Also format just [{user:SENTINEL}] to learn its rendered form
331 common_chat_msg user_only_msg;
332 user_only_msg.role = "user";
333 user_only_msg.content = SENTINEL;
334
335 common_chat_templates_inputs user_only_inputs;
336 user_only_inputs.messages = {user_only_msg};
337 user_only_inputs.add_generation_prompt = false;
338 user_only_inputs.use_jinja = true;
339 auto user_only_params = common_chat_templates_apply(tmpls.get(), user_only_inputs);
340
341 const auto& full = params.prompt;
342 const auto& user_suffix = user_only_params.prompt;
343
344 // Strip the sentinel user turn from the end
345 if (full.size() > user_suffix.size() &&
346 full.substr(full.size() - user_suffix.size()) == user_suffix) {
347 params.prompt = full.substr(0, full.size() - user_suffix.size());
348
349 // Strip empty system block if messages[0] is {system, ""}
350 if (!messages.empty() && messages[0].role == "system" && messages[0].content.empty()) {
351 // Use sentinel subtraction: [{system:""}, {user:S}] minus [{user:S}]
352 common_chat_msg sys_msg; sys_msg.role = "system"; sys_msg.content = "";
353 common_chat_msg usr_msg; usr_msg.role = "user"; usr_msg.content = SENTINEL;
354
355 common_chat_templates_inputs with_sys_inputs;
356 with_sys_inputs.messages = {sys_msg, usr_msg};
357 with_sys_inputs.add_generation_prompt = false;
358 with_sys_inputs.use_jinja = true;
359
360 common_chat_templates_inputs without_sys_inputs;
361 without_sys_inputs.messages = {usr_msg};
362 without_sys_inputs.add_generation_prompt = false;
363 without_sys_inputs.use_jinja = true;
364
365 try {
366 auto with_sys = common_chat_templates_apply(tmpls.get(), with_sys_inputs);
367 auto without_sys = common_chat_templates_apply(tmpls.get(), without_sys_inputs);
368 if (with_sys.prompt.size() > without_sys.prompt.size() &&
369 with_sys.prompt.substr(with_sys.prompt.size() - without_sys.prompt.size()) == without_sys.prompt) {
370 std::string sys_prefix = with_sys.prompt.substr(0, with_sys.prompt.size() - without_sys.prompt.size());
371 if (!sys_prefix.empty() &&
372 params.prompt.size() >= sys_prefix.size() &&
373 params.prompt.substr(0, sys_prefix.size()) == sys_prefix) {
374 params.prompt = params.prompt.substr(sys_prefix.size());
375 LLOYAL_LOG_DEBUG("[chat_in::format] Retry: stripped empty system prefix (%zu bytes)", sys_prefix.size());
376 }
377 }
378 } catch (...) {
379 // Stripping failed — proceed without it
380 }
381 }
382
383 result.prompt = params.prompt;
384 result.additional_stops = params.additional_stops;
385 result.format = params.format;
386 result.grammar = params.grammar;
387 result.grammar_lazy = params.grammar_lazy;
388 result.generation_prompt = params.generation_prompt;
389 result.grammar_triggers = params.grammar_triggers;
390 result.preserved_tokens = params.preserved_tokens;
391 result.parser = params.parser;
392 result.reasoning_format = tmpl_inputs.reasoning_format;
393
395 "[chat_in::format] Retry with synthetic user succeeded, format=%d (%zu bytes)",
396 static_cast<int>(result.format), result.prompt.size());
397 return result;
398 } else {
399 LLOYAL_LOG_DEBUG("[chat_in::format] Retry sentinel subtraction failed");
400 }
401 }
402 }
403 } catch (const std::exception &e2) {
404 LLOYAL_LOG_DEBUG("[chat_in::format] Retry also failed: %s", e2.what());
405 }
406 }
407
408fallback:
409 // Fallback to simple "role: content" format
410 try {
411 using json = nlohmann::ordered_json;
412 json messages = json::parse(inputs.messages_json);
413 std::string fallback_prompt;
414 for (const auto &msg : messages) {
415 if (msg.contains("role") && msg.contains("content")) {
416 std::string role = msg["role"].get<std::string>();
417 std::string content;
418 const auto& c = msg["content"];
419 if (c.is_null()) {
420 content = "";
421 } else if (c.is_string()) {
422 content = c.get<std::string>();
423 } else {
424 content = c.dump();
425 }
426 fallback_prompt += role + ": " + content + "\n";
427 }
428 }
429
430 result.prompt = fallback_prompt;
431 result.additional_stops = {};
432
434 "[chat_in::format] Using fallback format (%zu bytes)",
435 fallback_prompt.size());
436 return result;
437
438 } catch (const std::exception &e) {
440 "[chat_in::format] ERROR: Failed to parse messages JSON: %s",
441 e.what());
442 result.prompt = "";
443 result.additional_stops = {};
444 return result;
445 }
446}
447
459inline bool validate(const std::string &template_str) {
460 try {
461 bool isValid = common_chat_verify_template(template_str, /* use_jinja */ true);
462 LLOYAL_LOG_DEBUG("[chat_in::validate] Template validation: %s",
463 isValid ? "valid" : "invalid");
464 return isValid;
465 } catch (const std::exception &e) {
466 LLOYAL_LOG_DEBUG("[chat_in::validate] ERROR: %s", e.what());
467 return false;
468 }
469}
470
480inline std::vector<llama_token> fallback_to_eog(const llama_model* model) {
481 if (model == nullptr) {
482 return {};
483 }
484 const llama_vocab* vocab = llama_model_get_vocab(model);
485 llama_token eot = llama_vocab_eot(vocab);
486 if (eot == LLAMA_TOKEN_NULL) {
487 eot = llama_vocab_eos(vocab);
488 }
489 if (eot != LLAMA_TOKEN_NULL) {
490 return {eot};
491 }
492 return {};
493}
494
502inline std::string get_token_safe(const llama_model *model, llama_token token) {
503 if (!model || token == LLAMA_TOKEN_NULL) {
504 return "";
505 }
506 return lloyal::tokenizer::detokenize(model, token);
507}
508
549inline std::vector<llama_token> get_turn_separator(const llama_model* model) {
550 using json = nlohmann::ordered_json;
551
552 if (!model) return {};
553
554 // Collision-resistant sentinels
555 const std::string SENTINEL = "\x1F__LLOYAL_SEP__\x1F";
556 const std::string SENTINEL2 = "\x1F__LLOYAL_SEP2__\x1F";
557
558 try {
559 // Initialize templates from model
560 common_chat_templates_ptr tmpls = common_chat_templates_init(model, "");
561 if (!tmpls) {
562 return fallback_to_eog(model);
563 }
564
565 // 3-message probe: captures REAL assistant→user boundary
566 std::vector<common_chat_msg> messages = {
567 {.role = "user", .content = "X"},
568 {.role = "assistant", .content = SENTINEL},
569 {.role = "user", .content = SENTINEL2}
570 };
571
572 common_chat_templates_inputs inputs;
573 inputs.messages = messages;
574 inputs.add_generation_prompt = false; // Don't add assistant prompt at end
575 inputs.use_jinja = true;
576
577 auto params = common_chat_templates_apply(tmpls.get(), inputs);
578 const std::string& formatted = params.prompt;
579
580 // Extract substring between sentinels
581 size_t sep_start = formatted.rfind(SENTINEL);
582 if (sep_start == std::string::npos) {
583 return fallback_to_eog(model);
584 }
585 sep_start += SENTINEL.length();
586
587 size_t sep_end = formatted.find(SENTINEL2, sep_start);
588 if (sep_end == std::string::npos) {
589 return fallback_to_eog(model);
590 }
591
592 std::string between = formatted.substr(sep_start, sep_end - sep_start);
593 if (between.empty()) {
594 return fallback_to_eog(model);
595 }
596
597 // Tokenize with parse_special=true
598 const auto* vocab = llama_model_get_vocab(model);
599 std::vector<llama_token> tokens = lloyal::tokenizer::tokenize(vocab, between, false, true);
600 if (tokens.empty()) {
601 return fallback_to_eog(model);
602 }
603
604 // Extract: everything up to and including EOG + trailing whitespace
605 std::vector<llama_token> separator;
606 bool found_eog = false;
607
608 for (auto tok : tokens) {
609 if (!found_eog) {
610 separator.push_back(tok);
611 if (lloyal::tokenizer::is_eog(model, tok)) {
612 found_eog = true;
613 }
614 } else {
615 // After EOG, only keep whitespace tokens
616 std::string text = lloyal::tokenizer::detokenize(model, tok);
617 bool is_whitespace = !text.empty() && std::all_of(text.begin(), text.end(),
618 [](unsigned char c) { return c == ' ' || c == '\n' || c == '\r' || c == '\t'; });
619 if (is_whitespace) {
620 separator.push_back(tok);
621 } else {
622 break; // Non-whitespace = next message opener, stop
623 }
624 }
625 }
626
627 if (separator.empty() || !found_eog) {
628 return fallback_to_eog(model);
629 }
630
631 return separator;
632
633 } catch (const std::exception& e) {
634 LLOYAL_LOG_DEBUG("[chat_in::get_turn_separator] Error: %s", e.what());
635 return fallback_to_eog(model);
636 }
637}
638
639} // namespace lloyal::chat_in
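A minimal usage sketch follows (illustrative only, not part of this header): it assumes a llama_model* already loaded by the caller, and the message contents, function name, and the add_special choice are placeholders; everything else uses only the declarations above.

#include "chat_in.hpp"
#include "tokenizer.hpp"

void example_chat_format(const llama_model* model) {
    lloyal::chat_in::FormatInputs in;
    in.messages_json = R"([
        {"role": "system", "content": "You are concise."},
        {"role": "user",   "content": "Name one prime number."}
    ])";
    in.add_generation_prompt = true;   // append the assistant prefix for generation

    lloyal::chat_in::FormatResult out = lloyal::chat_in::format(model, in);

    // out.prompt is the rendered template; out.additional_stops and out.grammar carry
    // the template's stop strings and any constrained-sampling grammar for the sampler.
    // Tokenize the rendered prompt (whether to add BOS here depends on the template).
    const llama_vocab* vocab = llama_model_get_vocab(model);
    std::vector<llama_token> tokens = lloyal::tokenizer::tokenize(
        vocab, out.prompt, /*add_special=*/true, /*parse_special=*/true);
    (void)tokens; // feed into llama_decode on a branch as usual

    // To suppress the template's auto-injected default system block, prepend
    // {"role": "system", "content": ""} to messages_json; format() strips the
    // rendered empty system prefix (see the stripping logic above).
}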
#define LLOYAL_LOG_DEBUG(...)
liblloyal - Common definitions and logging
Definition common.hpp:48
Chat input formatting with full format awareness.
Definition chat_in.hpp:57
std::string get_token_safe(const llama_model *model, llama_token token)
Get token text safely.
Definition chat_in.hpp:502
std::vector< llama_token > get_turn_separator(const llama_model *model)
Get turn separator tokens for the model's chat template.
Definition chat_in.hpp:549
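A hedged sketch of where this fits in branched inference: after an assistant turn finishes on a branch, the separator (typically the EOT token plus trailing whitespace) is appended before the next turn's tokens. The append_to_branch helper named below is hypothetical; only get_turn_separator comes from this header.

void example_turn_boundary(const llama_model* model) {
    std::vector<llama_token> sep = lloyal::chat_in::get_turn_separator(model);
    if (sep.empty()) {
        return; // no usable template and no EOG token: nothing to append
    }
    // append_to_branch(branch, sep);  // hypothetical helper: closes the assistant
    //                                 // turn in the KV cache before the next turn
    (void)sep;
}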
FormatResult format(const llama_model *model, const FormatInputs &inputs)
Format chat messages using model's chat template with full format awareness.
Definition chat_in.hpp:136
std::vector< llama_token > fallback_to_eog(const llama_model *model)
Get EOG token as fallback when template parsing fails.
Definition chat_in.hpp:480
bool validate(const std::string &template_str)
Validate chat template syntax.
Definition chat_in.hpp:459
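For illustration, a caller might check a custom Jinja template before passing it as FormatInputs::template_override; the ChatML-style template string below is a made-up example, not one shipped with the library.

const std::string custom_tmpl =
    "{% for message in messages %}"
    "<|im_start|>{{ message.role }}\n{{ message.content }}<|im_end|>\n"
    "{% endfor %}"
    "{% if add_generation_prompt %}<|im_start|>assistant\n{% endif %}";

if (lloyal::chat_in::validate(custom_tmpl)) {
    lloyal::chat_in::FormatInputs in;
    in.messages_json     = R"([{"role": "user", "content": "hi"}])";
    in.template_override = custom_tmpl;  // used instead of the model's built-in template
    // auto out = lloyal::chat_in::format(model, in);
}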
std::vector< llama_token > tokenize(const llama_vocab *vocab, const std::string &text, bool add_special, bool parse_special)
Tokenize text to token array.
Definition tokenizer.hpp:39
std::string detokenize(const llama_vocab *vocab, llama_token token, bool special)
Detokenize SINGLE token to text (streaming use case)
Definition tokenizer.hpp:92
bool is_eog(const llama_vocab *vocab, llama_token token)
Check if token is end-of-generation marker.
Input parameters for chat formatting.
Definition chat_in.hpp:65
std::string messages_json
JSON array of OpenAI-format messages (required)
Definition chat_in.hpp:66
bool add_generation_prompt
Append assistant prompt prefix (set false for partial formatting)
Definition chat_in.hpp:68
std::string tools_json
JSON array of OpenAI-format tool definitions.
Definition chat_in.hpp:69
std::string json_schema
JSON schema for structured output.
Definition chat_in.hpp:74
std::string grammar
Explicit GBNF grammar string.
Definition chat_in.hpp:75
std::string template_override
Optional Jinja2 template override.
Definition chat_in.hpp:67
bool enable_thinking
Enable <think> blocks (pairs with reasoning_format)
Definition chat_in.hpp:73
std::string reasoning_format
"none" | "auto" | "deepseek" | "deepseek_legacy"
Definition chat_in.hpp:72
std::string tool_choice
"auto" | "required" | "none"
Definition chat_in.hpp:70
bool parallel_tool_calls
Allow parallel tool calls.
Definition chat_in.hpp:71
Result from chat template formatting with full format awareness.
Definition chat_in.hpp:84
std::vector< common_grammar_trigger > grammar_triggers
Triggers for lazy grammar activation.
Definition chat_in.hpp:94
common_reasoning_format reasoning_format
Reasoning format for output parsing.
Definition chat_in.hpp:99
std::vector< std::string > additional_stops
Stop tokens extracted from template.
Definition chat_in.hpp:87
std::string prompt
Formatted prompt text ready for tokenization.
Definition chat_in.hpp:86
std::string generation_prompt
Generation prompt prefill (e.g. "<think>")
Definition chat_in.hpp:93
bool grammar_lazy
Whether grammar should use lazy compilation.
Definition chat_in.hpp:92
std::string grammar
GBNF grammar for constrained sampling.
Definition chat_in.hpp:91
std::vector< std::string > preserved_tokens
Tokens to preserve during grammar constraining.
Definition chat_in.hpp:95
std::string parser
PEG parser definition (for PEG formats)
Definition chat_in.hpp:96
common_chat_format format
Detected chat format.
Definition chat_in.hpp:90
Text Tokenization Operations.