liblloyal 1.0.0
Branched Inference for llama.cpp
chat_in.hpp
1#pragma once
2
3// SPDX-License-Identifier: Apache-2.0
4// Copyright 2026 Lloyal Labs
5
6
40#include "common.hpp"
41#include "tokenizer.hpp"
42#include <llama/llama.h>
43#include <chat.h> // llama.cpp common library: common_chat_templates_*
44#include <nlohmann/json.hpp>
45#include <algorithm>
46#include <exception>
47#include <string>
48#include <vector>
49
57namespace lloyal::chat_in {
58
65struct FormatInputs {
 66 std::string messages_json;
67 std::string template_override = "";
 68 bool add_generation_prompt = true;
 69 std::string tools_json = "";
70 std::string tool_choice = "auto";
71 bool parallel_tool_calls = false;
72 std::string reasoning_format = "none";
73 bool enable_thinking = true;
74 std::string json_schema = "";
75 std::string grammar = "";
76};
77
84struct FormatResult {
 85 // Core output
86 std::string prompt;
87 std::vector<std::string> additional_stops;
88
89 // Format awareness (all fields from common_chat_params)
90 common_chat_format format = COMMON_CHAT_FORMAT_CONTENT_ONLY;
91 std::string grammar;
92 bool grammar_lazy = false;
93 std::string generation_prompt;
94 std::vector<common_grammar_trigger> grammar_triggers;
95 std::vector<std::string> preserved_tokens;
96 std::string parser;
97
98 // Carried through for chat_out pairing
99 common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_NONE;
100};
101
136inline FormatResult format(const llama_model *model, const FormatInputs& inputs) {
137 FormatResult result;
138
139 try {
140 using json = nlohmann::ordered_json;
141 json messages_array = json::parse(inputs.messages_json);
142
143 // Initialize templates from model (or override)
144 common_chat_templates_ptr tmpls = common_chat_templates_init(model, inputs.template_override);
145 if (!tmpls) {
146 LLOYAL_LOG_DEBUG("[chat_in::format] Template init failed, using fallback");
147 goto fallback;
148 }
149
150 {
151 // Parse messages
152 std::vector<common_chat_msg> messages = common_chat_msgs_parse_oaicompat(messages_array);
153
154 // Build full template inputs
155 common_chat_templates_inputs tmpl_inputs;
156 tmpl_inputs.messages = messages;
157 tmpl_inputs.add_generation_prompt = inputs.add_generation_prompt;
158 tmpl_inputs.use_jinja = true;
159
160 // Tools
161 if (!inputs.tools_json.empty()) {
162 json tools_array = json::parse(inputs.tools_json);
163 tmpl_inputs.tools = common_chat_tools_parse_oaicompat(tools_array);
164 tmpl_inputs.tool_choice = common_chat_tool_choice_parse_oaicompat(inputs.tool_choice);
165 tmpl_inputs.parallel_tool_calls = inputs.parallel_tool_calls;
166 }
167
168 // Reasoning
169 tmpl_inputs.reasoning_format = common_reasoning_format_from_name(inputs.reasoning_format);
170 tmpl_inputs.enable_thinking = inputs.enable_thinking;
171
172 // Structured output
173 tmpl_inputs.json_schema = inputs.json_schema;
174 tmpl_inputs.grammar = inputs.grammar;
175
176 // Apply template
177 common_chat_params params = common_chat_templates_apply(tmpls.get(), tmpl_inputs);
178
179 // Implicit empty system prompt stripping: if messages[0] is {system, ""},
180 // strip the resulting empty system block from the output. This lets callers
181 // suppress template auto-injection (e.g. SmolLM2/ChatML) by prepending an
182 // empty system message — the library completes the intent by removing the
183 // rendered empty block, leaving only the user+assistant portion.
184 if (!messages.empty() && messages[0].role == "system" && messages[0].content.empty()) {
185 bool stripped = false;
186
187 // Primary: format [{system:""}] to learn the empty system prefix
188 try {
189 common_chat_msg sys_msg;
190 sys_msg.role = "system";
191 sys_msg.content = "";
192
193 common_chat_templates_inputs sys_inputs;
194 sys_inputs.messages = {sys_msg};
195 sys_inputs.add_generation_prompt = false;
196 sys_inputs.use_jinja = true;
197 auto sys_params = common_chat_templates_apply(tmpls.get(), sys_inputs);
198
199 const auto& sys_prefix = sys_params.prompt;
200 if (!sys_prefix.empty() &&
201 params.prompt.size() >= sys_prefix.size() &&
202 params.prompt.substr(0, sys_prefix.size()) == sys_prefix) {
203 params.prompt = params.prompt.substr(sys_prefix.size());
204 stripped = true;
205 LLOYAL_LOG_DEBUG("[chat_in::format] Stripped empty system prefix (%zu bytes)", sys_prefix.size());
206 }
207 } catch (const std::exception &e) {
208 LLOYAL_LOG_DEBUG("[chat_in::format] Primary stripping failed: %s", e.what());
209 }
210
211 // Sentinel fallback: template requires a user message (e.g. Qwen 3.5).
212 // Format [{system:""}, {user:SENTINEL}] and [{user:SENTINEL}], subtract
213 // to learn the empty system prefix.
214 if (!stripped) {
215 try {
216 static const std::string SENTINEL = "\x1F__LLOYAL_SYS_STRIP__\x1F";
217
218 common_chat_msg sys_msg;
219 sys_msg.role = "system";
220 sys_msg.content = "";
221 common_chat_msg user_msg;
222 user_msg.role = "user";
223 user_msg.content = SENTINEL;
224
225 common_chat_templates_inputs with_sys;
226 with_sys.messages = {sys_msg, user_msg};
227 with_sys.add_generation_prompt = false;
228 with_sys.use_jinja = true;
229 auto with_sys_params = common_chat_templates_apply(tmpls.get(), with_sys);
230
231 common_chat_templates_inputs without_sys;
232 without_sys.messages = {user_msg};
233 without_sys.add_generation_prompt = false;
234 without_sys.use_jinja = true;
235 auto without_sys_params = common_chat_templates_apply(tmpls.get(), without_sys);
236
237 const auto& with_prompt = with_sys_params.prompt;
238 const auto& without_prompt = without_sys_params.prompt;
239
240 // If with_sys ends with without_sys, the prefix is the difference
241 if (with_prompt.size() > without_prompt.size() &&
242 with_prompt.substr(with_prompt.size() - without_prompt.size()) == without_prompt) {
243 std::string sys_prefix = with_prompt.substr(0, with_prompt.size() - without_prompt.size());
244 if (!sys_prefix.empty() &&
245 params.prompt.size() >= sys_prefix.size() &&
246 params.prompt.substr(0, sys_prefix.size()) == sys_prefix) {
247 params.prompt = params.prompt.substr(sys_prefix.size());
248 LLOYAL_LOG_DEBUG("[chat_in::format] Stripped empty system prefix via sentinel (%zu bytes)", sys_prefix.size());
249 }
250 } else {
251 LLOYAL_LOG_DEBUG("[chat_in::format] Sentinel subtraction failed, skipping strip");
252 }
253 } catch (const std::exception &e) {
254 LLOYAL_LOG_DEBUG("[chat_in::format] Sentinel stripping also failed: %s", e.what());
255 }
256 }
257 }
258
259 // Populate ALL result fields from common_chat_params
260 result.prompt = params.prompt;
261 result.additional_stops = params.additional_stops;
262 result.format = params.format;
263 result.grammar = params.grammar;
264 result.grammar_lazy = params.grammar_lazy;
265 result.generation_prompt = params.generation_prompt;
266 result.grammar_triggers = params.grammar_triggers;
267 result.preserved_tokens = params.preserved_tokens;
268 result.parser = params.parser;
269
270 // Carry reasoning_format through for chat_out pairing
271 result.reasoning_format = tmpl_inputs.reasoning_format;
272
274 "[chat_in::format] Successfully formatted with format=%d, %zu stop tokens, grammar=%zu bytes",
275 static_cast<int>(result.format),
276 result.additional_stops.size(),
277 result.grammar.size());
278 return result;
279 }
280
281 } catch (const std::exception &e) {
282 LLOYAL_LOG_DEBUG("[chat_in::format] Template processing failed: %s", e.what());
283
284 // Retry with synthetic user: templates like Qwen 3.5 require a user message.
285 // Inject a sentinel user message, re-apply the template, then strip the
286 // sentinel user turn from the output to recover just the system/tool portion.
287 try {
288 using json = nlohmann::ordered_json;
289 json messages_array = json::parse(inputs.messages_json);
290
291 common_chat_templates_ptr tmpls = common_chat_templates_init(model, inputs.template_override);
292 if (tmpls) {
293 static const std::string SENTINEL = "\x1F__LLOYAL_RETRY__\x1F";
294
295 std::vector<common_chat_msg> messages = common_chat_msgs_parse_oaicompat(messages_array);
296
297 // Check that no user role exists (otherwise the failure was something else)
298 bool has_user = false;
299 for (const auto& m : messages) {
300 if (m.role == "user") { has_user = true; break; }
301 }
302
303 if (!has_user) {
304 // Build augmented messages: original + synthetic user
305 std::vector<common_chat_msg> augmented = messages;
306 common_chat_msg sentinel_user;
307 sentinel_user.role = "user";
308 sentinel_user.content = SENTINEL;
309 augmented.push_back(sentinel_user);
310
311 common_chat_templates_inputs tmpl_inputs;
312 tmpl_inputs.messages = augmented;
313 tmpl_inputs.add_generation_prompt = false;
314 tmpl_inputs.use_jinja = true;
315
316 // Carry over tools so the system block includes tool definitions
317 if (!inputs.tools_json.empty()) {
318 json tools_array = json::parse(inputs.tools_json);
319 tmpl_inputs.tools = common_chat_tools_parse_oaicompat(tools_array);
320 tmpl_inputs.tool_choice = common_chat_tool_choice_parse_oaicompat(inputs.tool_choice);
321 tmpl_inputs.parallel_tool_calls = inputs.parallel_tool_calls;
322 }
323 tmpl_inputs.reasoning_format = common_reasoning_format_from_name(inputs.reasoning_format);
324 tmpl_inputs.enable_thinking = inputs.enable_thinking;
325 tmpl_inputs.json_schema = inputs.json_schema;
326 tmpl_inputs.grammar = inputs.grammar;
327
328 common_chat_params params = common_chat_templates_apply(tmpls.get(), tmpl_inputs);
329
330 // Also format just [{user:SENTINEL}] to learn its rendered form
331 common_chat_msg user_only_msg;
332 user_only_msg.role = "user";
333 user_only_msg.content = SENTINEL;
334
335 common_chat_templates_inputs user_only_inputs;
336 user_only_inputs.messages = {user_only_msg};
337 user_only_inputs.add_generation_prompt = false;
338 user_only_inputs.use_jinja = true;
339 auto user_only_params = common_chat_templates_apply(tmpls.get(), user_only_inputs);
340
341 const auto& full = params.prompt;
342 const auto& user_suffix = user_only_params.prompt;
343
344 // Strip the sentinel user turn from the end
345 if (full.size() > user_suffix.size() &&
346 full.substr(full.size() - user_suffix.size()) == user_suffix) {
347 params.prompt = full.substr(0, full.size() - user_suffix.size());
348
349 // Strip empty system block if messages[0] is {system, ""}
350 if (!messages.empty() && messages[0].role == "system" && messages[0].content.empty()) {
351 // Use sentinel subtraction: [{system:""}, {user:S}] minus [{user:S}]
352 common_chat_msg sys_msg; sys_msg.role = "system"; sys_msg.content = "";
353 common_chat_msg usr_msg; usr_msg.role = "user"; usr_msg.content = SENTINEL;
354
355 common_chat_templates_inputs with_sys_inputs;
356 with_sys_inputs.messages = {sys_msg, usr_msg};
357 with_sys_inputs.add_generation_prompt = false;
358 with_sys_inputs.use_jinja = true;
359
360 common_chat_templates_inputs without_sys_inputs;
361 without_sys_inputs.messages = {usr_msg};
362 without_sys_inputs.add_generation_prompt = false;
363 without_sys_inputs.use_jinja = true;
364
365 try {
366 auto with_sys = common_chat_templates_apply(tmpls.get(), with_sys_inputs);
367 auto without_sys = common_chat_templates_apply(tmpls.get(), without_sys_inputs);
368 if (with_sys.prompt.size() > without_sys.prompt.size() &&
369 with_sys.prompt.substr(with_sys.prompt.size() - without_sys.prompt.size()) == without_sys.prompt) {
370 std::string sys_prefix = with_sys.prompt.substr(0, with_sys.prompt.size() - without_sys.prompt.size());
371 if (!sys_prefix.empty() &&
372 params.prompt.size() >= sys_prefix.size() &&
373 params.prompt.substr(0, sys_prefix.size()) == sys_prefix) {
374 params.prompt = params.prompt.substr(sys_prefix.size());
375 LLOYAL_LOG_DEBUG("[chat_in::format] Retry: stripped empty system prefix (%zu bytes)", sys_prefix.size());
376 }
377 }
378 } catch (...) {
379 // Stripping failed — proceed without it
380 }
381 }
382
383 result.prompt = params.prompt;
384 result.additional_stops = params.additional_stops;
385 result.format = params.format;
386 result.grammar = params.grammar;
387 result.grammar_lazy = params.grammar_lazy;
388 result.generation_prompt = params.generation_prompt;
389 result.grammar_triggers = params.grammar_triggers;
390 result.preserved_tokens = params.preserved_tokens;
391 result.parser = params.parser;
392 result.reasoning_format = tmpl_inputs.reasoning_format;
393
395 "[chat_in::format] Retry with synthetic user succeeded, format=%d (%zu bytes)",
396 static_cast<int>(result.format), result.prompt.size());
397 return result;
398 } else {
399 LLOYAL_LOG_DEBUG("[chat_in::format] Retry sentinel subtraction failed");
400 }
401 }
402 }
403 } catch (const std::exception &e2) {
404 LLOYAL_LOG_DEBUG("[chat_in::format] Retry also failed: %s", e2.what());
405 }
406 }
407
408fallback:
409 // Fallback to simple "role: content" format
410 try {
411 using json = nlohmann::ordered_json;
412 json messages = json::parse(inputs.messages_json);
413 std::string fallback_prompt;
414 for (const auto &msg : messages) {
415 if (msg.contains("role") && msg.contains("content")) {
416 std::string role = msg["role"].get<std::string>();
417 std::string content;
418 const auto& c = msg["content"];
419 if (c.is_null()) {
420 content = "";
421 } else if (c.is_string()) {
422 content = c.get<std::string>();
423 } else {
424 content = c.dump();
425 }
426 fallback_prompt += role + ": " + content + "\n";
427 }
428 }
429
430 result.prompt = fallback_prompt;
431 result.additional_stops = {};
432
434 "[chat_in::format] Using fallback format (%zu bytes)",
435 fallback_prompt.size());
436 return result;
437
438 } catch (const std::exception &e) {
440 "[chat_in::format] ERROR: Failed to parse messages JSON: %s",
441 e.what());
442 result.prompt = "";
443 result.additional_stops = {};
444 return result;
445 }
446}
447
459inline bool validate(const std::string &template_str) {
460 try {
461 bool isValid = common_chat_verify_template(template_str, /* use_jinja */ true);
462 LLOYAL_LOG_DEBUG("[chat_in::validate] Template validation: %s",
463 isValid ? "valid" : "invalid");
464 return isValid;
465 } catch (const std::exception &e) {
466 LLOYAL_LOG_DEBUG("[chat_in::validate] ERROR: %s", e.what());
467 return false;
468 }
469}
470
480inline std::vector<llama_token> fallback_to_eog(const llama_model* model) {
481 if (model == nullptr) {
482 return {};
483 }
484 const llama_vocab* vocab = llama_model_get_vocab(model);
485 llama_token eot = llama_vocab_eot(vocab);
486 if (eot == LLAMA_TOKEN_NULL) {
487 eot = llama_vocab_eos(vocab);
488 }
489 if (eot != LLAMA_TOKEN_NULL) {
490 return {eot};
491 }
492 return {};
493}
494
502inline std::string get_token_safe(const llama_model *model, llama_token token) {
503 if (!model || token == LLAMA_TOKEN_NULL) {
504 return "";
505 }
506 return lloyal::tokenizer::detokenize(model, token);
507}
508
549inline std::vector<llama_token> get_turn_separator(const llama_model* model) {
550 using json = nlohmann::ordered_json;
551
552 if (!model) return {};
553
554 // Collision-resistant sentinels
555 const std::string SENTINEL = "\x1F__LLOYAL_SEP__\x1F";
556 const std::string SENTINEL2 = "\x1F__LLOYAL_SEP2__\x1F";
557
558 try {
559 // Initialize templates from model
560 common_chat_templates_ptr tmpls = common_chat_templates_init(model, "");
561 if (!tmpls) {
562 return fallback_to_eog(model);
563 }
564
565 // 3-message probe: captures REAL assistant→user boundary
566 std::vector<common_chat_msg> messages = {
567 {.role = "user", .content = "X"},
568 {.role = "assistant", .content = SENTINEL},
569 {.role = "user", .content = SENTINEL2}
570 };
571
572 common_chat_templates_inputs inputs;
573 inputs.messages = messages;
574 inputs.add_generation_prompt = false; // Don't add assistant prompt at end
575 inputs.use_jinja = true;
576
577 auto params = common_chat_templates_apply(tmpls.get(), inputs);
578 const std::string& formatted = params.prompt;
579
580 // Extract substring between sentinels
581 size_t sep_start = formatted.rfind(SENTINEL);
582 if (sep_start == std::string::npos) {
583 return fallback_to_eog(model);
584 }
585 sep_start += SENTINEL.length();
586
587 size_t sep_end = formatted.find(SENTINEL2, sep_start);
588 if (sep_end == std::string::npos) {
589 return fallback_to_eog(model);
590 }
591
592 std::string between = formatted.substr(sep_start, sep_end - sep_start);
593 if (between.empty()) {
594 return fallback_to_eog(model);
595 }
596
597 // Tokenize with parse_special=true
598 const auto* vocab = llama_model_get_vocab(model);
599 std::vector<llama_token> tokens = lloyal::tokenizer::tokenize(vocab, between, false, true);
600 if (tokens.empty()) {
601 return fallback_to_eog(model);
602 }
603
604 // Extract: everything up to and including EOG + trailing whitespace
605 std::vector<llama_token> separator;
606 bool found_eog = false;
607
608 for (auto tok : tokens) {
609 if (!found_eog) {
610 separator.push_back(tok);
611 if (lloyal::tokenizer::is_eog(model, tok)) {
612 found_eog = true;
613 }
614 } else {
615 // After EOG, only keep whitespace tokens
616 std::string text = lloyal::tokenizer::detokenize(model, tok);
617 bool is_whitespace = !text.empty() && std::all_of(text.begin(), text.end(),
618 [](unsigned char c) { return c == ' ' || c == '\n' || c == '\r' || c == '\t'; });
619 if (is_whitespace) {
620 separator.push_back(tok);
621 } else {
622 break; // Non-whitespace = next message opener, stop
623 }
624 }
625 }
626
627 if (separator.empty() || !found_eog) {
628 return fallback_to_eog(model);
629 }
630
631 return separator;
632
633 } catch (const std::exception& e) {
634 LLOYAL_LOG_DEBUG("[chat_in::get_turn_separator] Error: %s", e.what());
635 return fallback_to_eog(model);
636 }
637}
638
639} // namespace lloyal::chat_in
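A minimal usage sketch follows (illustrative only, not part of this header): it assumes a llama_model* already loaded by the caller, and the message contents, function name, and the add_special choice are placeholders; everything else uses only the declarations above.

#include "chat_in.hpp"
#include "tokenizer.hpp"

void example_chat_format(const llama_model* model) {
    lloyal::chat_in::FormatInputs in;
    in.messages_json = R"([
        {"role": "system", "content": "You are concise."},
        {"role": "user",   "content": "Name one prime number."}
    ])";
    in.add_generation_prompt = true;   // append the assistant prefix for generation

    lloyal::chat_in::FormatResult out = lloyal::chat_in::format(model, in);

    // out.prompt is the rendered template; out.additional_stops and out.grammar carry
    // the template's stop strings and any constrained-sampling grammar for the sampler.
    // Tokenize the rendered prompt (whether to add BOS here depends on the template).
    const llama_vocab* vocab = llama_model_get_vocab(model);
    std::vector<llama_token> tokens = lloyal::tokenizer::tokenize(
        vocab, out.prompt, /*add_special=*/true, /*parse_special=*/true);
    (void)tokens; // feed into llama_decode on a branch as usual

    // To suppress the template's auto-injected default system block, prepend
    // {"role": "system", "content": ""} to messages_json; format() strips the
    // rendered empty system prefix (see the stripping logic above).
}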
#define LLOYAL_LOG_DEBUG(...)
liblloyal - Common definitions and logging
Definition common.hpp:48
Chat input formatting with full format awareness.
Definition chat_in.hpp:57
std::string get_token_safe(const llama_model *model, llama_token token)
Get token text safely.
Definition chat_in.hpp:502
std::vector< llama_token > get_turn_separator(const llama_model *model)
Get turn separator tokens for the model's chat template.
Definition chat_in.hpp:549
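A hedged sketch of where this fits in branched inference: after an assistant turn finishes on a branch, the separator (typically the EOT token plus trailing whitespace) is appended before the next turn's tokens. The append_to_branch helper named below is hypothetical; only get_turn_separator comes from this header.

void example_turn_boundary(const llama_model* model) {
    std::vector<llama_token> sep = lloyal::chat_in::get_turn_separator(model);
    if (sep.empty()) {
        return; // no usable template and no EOG token: nothing to append
    }
    // append_to_branch(branch, sep);  // hypothetical helper: closes the assistant
    //                                 // turn in the KV cache before the next turn
    (void)sep;
}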
FormatResult format(const llama_model *model, const FormatInputs &inputs)
Format chat messages using model's chat template with full format awareness.
Definition chat_in.hpp:136
std::vector< llama_token > fallback_to_eog(const llama_model *model)
Get EOG token as fallback when template parsing fails.
Definition chat_in.hpp:480
bool validate(const std::string &template_str)
Validate chat template syntax.
Definition chat_in.hpp:459
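For illustration, a caller might check a custom Jinja template before passing it as FormatInputs::template_override; the ChatML-style template string below is a made-up example, not one shipped with the library.

const std::string custom_tmpl =
    "{% for message in messages %}"
    "<|im_start|>{{ message.role }}\n{{ message.content }}<|im_end|>\n"
    "{% endfor %}"
    "{% if add_generation_prompt %}<|im_start|>assistant\n{% endif %}";

if (lloyal::chat_in::validate(custom_tmpl)) {
    lloyal::chat_in::FormatInputs in;
    in.messages_json     = R"([{"role": "user", "content": "hi"}])";
    in.template_override = custom_tmpl;  // used instead of the model's built-in template
    // auto out = lloyal::chat_in::format(model, in);
}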
std::vector< llama_token > tokenize(const llama_vocab *vocab, const std::string &text, bool add_special, bool parse_special)
Tokenize text to token array.
Definition tokenizer.hpp:39
std::string detokenize(const llama_vocab *vocab, llama_token token, bool special)
Detokenize SINGLE token to text (streaming use case)
Definition tokenizer.hpp:92
bool is_eog(const llama_vocab *vocab, llama_token token)
Check if token is end-of-generation marker.
Input parameters for chat formatting.
Definition chat_in.hpp:65
std::string messages_json
JSON array of OpenAI-format messages (required)
Definition chat_in.hpp:66
bool add_generation_prompt
Append assistant prompt prefix (set false for partial formatting)
Definition chat_in.hpp:68
std::string tools_json
JSON array of OpenAI-format tool definitions.
Definition chat_in.hpp:69
std::string json_schema
JSON schema for structured output.
Definition chat_in.hpp:74
std::string grammar
Explicit GBNF grammar string.
Definition chat_in.hpp:75
std::string template_override
Optional Jinja2 template override.
Definition chat_in.hpp:67
bool enable_thinking
Enable <think> blocks (pairs with reasoning_format)
Definition chat_in.hpp:73
std::string reasoning_format
"none" | "auto" | "deepseek" | "deepseek_legacy"
Definition chat_in.hpp:72
std::string tool_choice
"auto" | "required" | "none"
Definition chat_in.hpp:70
bool parallel_tool_calls
Allow parallel tool calls.
Definition chat_in.hpp:71
Result from chat template formatting with full format awareness.
Definition chat_in.hpp:84
std::vector< common_grammar_trigger > grammar_triggers
Triggers for lazy grammar activation.
Definition chat_in.hpp:94
common_reasoning_format reasoning_format
Reasoning format for output parsing.
Definition chat_in.hpp:99
std::vector< std::string > additional_stops
Stop tokens extracted from template.
Definition chat_in.hpp:87
std::string prompt
Formatted prompt text ready for tokenization.
Definition chat_in.hpp:86
std::string generation_prompt
Generation prompt prefill (e.g. "<think>")
Definition chat_in.hpp:93
bool grammar_lazy
Whether grammar should use lazy compilation.
Definition chat_in.hpp:92
std::string grammar
GBNF grammar for constrained sampling.
Definition chat_in.hpp:91
std::vector< std::string > preserved_tokens
Tokens to preserve during grammar constraining.
Definition chat_in.hpp:95
std::string parser
PEG parser definition (for PEG formats)
Definition chat_in.hpp:96
common_chat_format format
Detected chat format.
Definition chat_in.hpp:90
Text Tokenization Operations.