#include <llama/ggml.h>
#include <llama/llama.h>
#include <lloyal/nlohmann/json.hpp>
// ... (standard library includes and other declarations elided from this listing)
std::string common_token_to_piece(const struct llama_vocab *vocab,
                                  llama_token token, bool special);

std::string get_token_safe(const llama_model *model, llama_token token);

std::string apply_chat_template_helper(const std::string &template_str,
                                       const nlohmann::ordered_json &messages,
                                       const std::string &bos_token,
                                       const std::string &eos_token,
                                       bool add_generation_prompt,
                                       bool add_bos,
                                       bool add_eos);
using json = nlohmann::ordered_json;

inline void batch_clear(llama_batch &batch) { batch.n_tokens = 0; }
inline void batch_add(llama_batch &batch, llama_token id, int32_t pos,
                      const std::vector<llama_seq_id> &seq_ids, bool logits,
                      int32_t capacity = -1) {
  // Enforce the capacity check only when the caller supplies a capacity.
  if (capacity >= 0) {
    assert(batch.n_tokens < capacity && "batch_add: token capacity exceeded");
  }

  const auto i = batch.n_tokens;
  batch.token[i] = id;
  batch.pos[i] = pos;
  batch.n_seq_id[i] = static_cast<int32_t>(seq_ids.size());
  for (size_t j = 0; j < seq_ids.size(); ++j) {
    batch.seq_id[i][j] = seq_ids[j];
  }
  batch.logits[i] = logits ? 1 : 0;

  batch.n_tokens++;
}
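A minimal usage sketch for the two batch helpers. The `ctx` and `prompt_tokens` names are placeholders (an initialized llama_context and an already-tokenized prompt), not part of this header; llama_batch_init, llama_decode and llama_batch_free are llama.h APIs.

// Fill one batch for a single sequence, requesting logits only for the last token.
std::vector<llama_token> prompt_tokens = /* tokenized prompt */ {};
const int32_t capacity = 512;
llama_batch batch = llama_batch_init(capacity, /*embd=*/0, /*n_seq_max=*/1);

batch_clear(batch);
for (size_t i = 0; i < prompt_tokens.size(); ++i) {
  const bool want_logits = (i + 1 == prompt_tokens.size());
  batch_add(batch, prompt_tokens[i], static_cast<int32_t>(i), {0}, want_logits, capacity);
}
llama_decode(ctx, batch);
llama_batch_free(batch);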
inline std::string
format_chat_template_from_model(const llama_model *model,
                                const std::string &messages_json,
                                const std::string &template_override = "") {
  try {
    json messages = json::parse(messages_json);

    // Resolve the template: an explicit override wins, otherwise use the
    // template embedded in the model.
    std::string template_str;
    if (!template_override.empty()) {
      template_str = template_override;
    } else {
      const char *model_template = llama_model_chat_template(model, nullptr);
      if (model_template && strlen(model_template) > 0) {
        template_str = model_template;
      }
    }

    if (template_str.empty()) {
      // ... fall back (e.g. to the built-in ChatML template) ...
    }

    std::string bos_token, eos_token;
    bool add_bos = false, add_eos = false;

    const auto *vocab = llama_model_get_vocab(model);
    // ... look up the BOS/EOS token text (elided) ...
    add_bos = llama_vocab_get_add_bos(vocab);
    add_eos = llama_vocab_get_add_eos(vocab);

    return apply_chat_template_helper(template_str, messages, bos_token,
                                      eos_token, true, add_bos, add_eos);
  } catch (const std::exception &e) {
    // ... error handling (elided) ...
  }
}
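A usage sketch, assuming `model` is an already-loaded llama_model pointer; the payload is a JSON array of role/content objects, the shape the ChatML fallback template iterates over.

const std::string messages_json = R"([
  {"role": "system", "content": "You are a helpful assistant."},
  {"role": "user",   "content": "Hello!"}
])";
std::string prompt = format_chat_template_from_model(model, messages_json);
// Pass a Jinja template string as the third argument to bypass the model's template.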
inline std::vector<std::string>
extract_template_stop_tokens(const llama_model *model,
                             const std::string &template_str) {
  std::vector<std::string> stops;

  const auto *vocab = llama_model_get_vocab(model);

  // Returns the token text if it maps to exactly one vocab token,
  // otherwise an empty string.
  const auto get_token_if_exists =
      [&](const std::string &token_text) -> std::string {
    std::vector<llama_token> tokens(1);
    int n_tokens = llama_tokenize(vocab, token_text.c_str(),
                                  static_cast<int32_t>(token_text.length()),
                                  tokens.data(), 1, false, true);
    if (n_tokens == 1) {
      return token_text;
    }
    return "";
  };

  // ChatML-style templates.
  if (template_str.find("im_start") != std::string::npos) {
    auto token = get_token_if_exists("<|im_end|>");
    if (!token.empty()) {
      stops.push_back(token);
    }
    token = get_token_if_exists("<|endoftext|>");
    if (!token.empty()) {
      stops.push_back(token);
    }
  }

  // Llama 3 style templates.
  if (template_str.find("eom_id") != std::string::npos ||
      template_str.find("eot_id") != std::string::npos) {
    auto token = get_token_if_exists("<|eom_id|>");
    if (!token.empty()) {
      stops.push_back(token);
    }
    token = get_token_if_exists("<|eot_id|>");
    if (!token.empty()) {
      stops.push_back(token);
    }
  }

  // Always consider the vocab's end-of-turn token, if it defines one.
  auto eot_token = llama_vocab_eot(vocab);
  if (eot_token != LLAMA_TOKEN_NULL) {
    std::string eot_text = common_token_to_piece(vocab, eot_token, true);
    if (!eot_text.empty() &&
        std::find(stops.begin(), stops.end(), eot_text) == stops.end()) {
      stops.push_back(eot_text);
    }
  }

  return stops;
}
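A quick sketch of calling it with the model's own template string (again assuming a loaded `model`); the exact strings returned depend on the template and vocab.

const char *tmpl = llama_model_chat_template(model, nullptr);
std::vector<std::string> stops =
    extract_template_stop_tokens(model, tmpl ? tmpl : get_chatml_template());
// For a ChatML-style model this typically yields {"<|im_end|>", ...}.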
inline ChatTemplateResult
format_chat_template_complete(const llama_model *model,
                              const std::string &messages_json,
                              const std::string &template_override = "") {
  try {
    json messages = json::parse(messages_json);

    std::string template_str;
    if (!template_override.empty()) {
      template_str = template_override;
    } else {
      const char *model_template = llama_model_chat_template(model, nullptr);
      if (model_template && strlen(model_template) > 0) {
        template_str = model_template;
      }
    }

    if (template_str.empty()) {
      // ... fall back (e.g. to the built-in ChatML template) ...
    }

    std::string bos_token, eos_token;
    bool add_bos = false, add_eos = false;

    const auto *vocab = llama_model_get_vocab(model);
    // ... look up the BOS/EOS token text (elided) ...
    add_bos = llama_vocab_get_add_bos(vocab);
    add_eos = llama_vocab_get_add_eos(vocab);

    ChatTemplateResult result;
    result.prompt = apply_chat_template_helper(template_str, messages, bos_token,
                                               eos_token, true, add_bos, add_eos);
    // ... collect template-specific stop tokens into result.additional_stops ...
    return result;
  } catch (const std::exception &e) {
    // ... error handling (elided) ...
  }
}
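Usage sketch: one call that produces both the rendered prompt and the stop strings a streaming loop should watch for (the `messages_json` payload has the same shape as in the earlier sketch).

ChatTemplateResult res = format_chat_template_complete(model, messages_json);
// res.prompt           -> formatted text ready for tokenization
// res.additional_stops -> e.g. {"<|im_end|>"} for ChatML-style templates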
// ... (additional try/catch-wrapped helper definitions elided from this listing) ...
inline const std::vector<ggml_type> &get_kv_cache_types() {
  static const std::vector<ggml_type> types = {
      GGML_TYPE_F32,    GGML_TYPE_F16,  GGML_TYPE_BF16,
      GGML_TYPE_Q8_0,   GGML_TYPE_Q4_0, GGML_TYPE_Q4_1,
      GGML_TYPE_IQ4_NL, GGML_TYPE_Q5_0, GGML_TYPE_Q5_1};
  return types;
}

inline ggml_type kv_cache_type_from_str(const std::string &s) {
  const auto &kv_cache_types = get_kv_cache_types();
  for (const auto &type : kv_cache_types) {
    if (ggml_type_name(type) == s) {
      return type;
    }
  }
  throw std::runtime_error("Unsupported cache type: " + s);
}
inline bool is_truthy(const std::string &value) {
  return value == "on" || value == "enabled" || value == "1" || value == "true";
}

inline bool is_falsey(const std::string &value) {
  return value == "off" || value == "disabled" || value == "0" ||
         value == "false";
}

inline bool is_autoy(const std::string &value) {
  return value == "auto" || value == "-1";
}
inline std::string string_repeat(const std::string &str, size_t n) {
  std::string result;
  result.reserve(str.length() * n);
  for (size_t i = 0; i < n; ++i) {
    result += str;
  }
  return result;
}

inline std::string string_join(const std::vector<std::string> &values,
                               const std::string &separator) {
  std::ostringstream result;
  for (size_t i = 0; i < values.size(); ++i) {
    if (i > 0) {
      result << separator;
    }
    result << values[i];
  }
  return result.str();
}

inline std::vector<std::string> string_split(const std::string &str,
                                             const std::string &delimiter) {
  std::vector<std::string> parts;
  size_t start = 0;
  size_t end = str.find(delimiter);

  while (end != std::string::npos) {
    parts.push_back(str.substr(start, end - start));
    start = end + delimiter.length();
    end = str.find(delimiter, start);
  }
  parts.push_back(str.substr(start));
  return parts;
}
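For reference, the expected behavior of the three string helpers (results shown as comments).

std::vector<std::string> parts = string_split("a,b,c", ",");  // {"a", "b", "c"}
std::string joined = string_join(parts, " | ");               // "a | b | c"
std::string rule = string_repeat("-", 8);                     // "--------"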
inline std::string common_token_to_piece(const struct llama_vocab *vocab,
                                         llama_token token, bool special) {
  std::string piece;
  piece.resize(piece.capacity());
  const int n_chars =
      llama_token_to_piece(vocab, token, &piece[0], piece.size(), 0, special);
  if (n_chars < 0) {
    // A negative return means the buffer was too small; -n_chars is the required size.
    piece.resize(-n_chars);
    const int check =
        llama_token_to_piece(vocab, token, &piece[0], piece.size(), 0, special);
    assert(check == -n_chars);
  } else {
    piece.resize(n_chars);
  }
  return piece;
}

inline std::string get_token_safe(const llama_model *model, llama_token token) {
  if (!model || token == LLAMA_TOKEN_NULL) {
    return "";
  }
  const auto *vocab = llama_model_get_vocab(model);
  return common_token_to_piece(vocab, token, true);
}
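A sketch: rendering two well-known tokens as readable text. Assumes a loaded `model`; llama_vocab_bos and llama_vocab_eos are llama.h lookups.

const auto *vocab = llama_model_get_vocab(model);
std::string bos_text = get_token_safe(model, llama_vocab_bos(vocab));
std::string eos_text = common_token_to_piece(vocab, llama_vocab_eos(vocab), /*special=*/true);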
inline const char *get_chatml_template() {
  return "{% for message in messages %}"
         "{{'<|im_start|>' + message['role'] + '\\n' + message['content'] + "
         "'<|im_end|>' + '\\n'}}"
         "{% endfor %}"
         "{% if add_generation_prompt %}{{'<|im_start|>assistant\\n'}}{% endif "
         "%}";
}
inline std::string apply_chat_template_helper(
    const std::string &template_str, const json &messages,
    const std::string &bos_token = "", const std::string &eos_token = "",
    bool add_generation_prompt = true, bool add_bos = false,
    bool add_eos = false) {
  try {
    // ... construct the template object (tmpl), its inputs and options (elided) ...
    inputs.tools = json::array();
    // ...
    inputs.now = std::chrono::system_clock::now();
    // ...
    auto result = tmpl.apply(inputs, opts);

    // If the vocab adds BOS/EOS at tokenization time, strip any copies the
    // template itself already rendered so they are not duplicated.
    if (add_bos && !bos_token.empty() && result.starts_with(bos_token)) {
      result = result.substr(bos_token.length());
    }
    if (add_eos && !eos_token.empty() && result.ends_with(eos_token)) {
      result = result.substr(0, result.length() - eos_token.length());
    }
    return result;
  } catch (const std::exception &e) {
    // ... error handling (elided) ...
  }
}
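Usage sketch, pairing the helper with the built-in ChatML fallback; the commented output is derived from that template's text above.

json msgs = json::parse(R"([{"role": "user", "content": "Hi"}])");
std::string prompt = apply_chat_template_helper(get_chatml_template(), msgs);
// Roughly: "<|im_start|>user\nHi<|im_end|>\n<|im_start|>assistant\n"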
Member reference:

  void batch_clear(llama_batch &batch)
      Clear batch to empty state.

  void batch_add(llama_batch &batch, llama_token id, int32_t pos, const std::vector<llama_seq_id> &seq_ids, bool logits, int32_t capacity = -1)
      Add single token to batch with position and sequence info.

  using json = nlohmann::ordered_json

  const char *get_chatml_template()

  std::string apply_chat_template_helper(const std::string &template_str, const nlohmann::ordered_json &messages, const std::string &bos_token, const std::string &eos_token, bool add_generation_prompt, bool add_bos, bool add_eos)

  std::string format_chat_template_from_model(const llama_model *model, const std::string &messages_json, const std::string &template_override = "")
      Format chat messages using the model's built-in template.

  ChatTemplateResult format_chat_template_complete(const llama_model *model, const std::string &messages_json, const std::string &template_override = "")
      Complete chat template processing with stop token detection.

  std::vector<std::string> extract_template_stop_tokens(const llama_model *model, const std::string &template_str)
      Dynamically detect stop tokens from the chat template.

  bool validate_chat_template_helper(const std::string &template_str)
      Validate chat template syntax.

  std::string get_token_safe(const llama_model *model, llama_token token)

  std::string common_token_to_piece(const struct llama_vocab *vocab, llama_token token, bool special)

  ggml_type kv_cache_type_from_str(const std::string &s)
      Convert a cache type string to its ggml_type enum.

  const std::vector<ggml_type> &get_kv_cache_types()
      Get the list of supported KV cache types.

  bool is_truthy(const std::string &value)
      Check if a string represents a truthy value.

  bool is_falsey(const std::string &value)
      Check if a string represents a falsey value.

  bool is_autoy(const std::string &value)
      Check if a string represents an auto value.

  std::string string_repeat(const std::string &str, size_t n)

  std::string string_join(const std::vector<std::string> &values, const std::string &separator)

  std::vector<std::string> string_split(const std::string &str, const std::string &delimiter)

  struct ChatTemplateResult
      Result from complete chat template processing.

      std::string prompt
          Formatted chat prompt ready for tokenization.

      std::vector<std::string> additional_stops
          Template-specific stop tokens (e.g., "<|im_end|>", "<|eot_id|>").

Also referenced by this page:

  std::string apply(const nlohmann::ordered_json &messages, const nlohmann::ordered_json &tools, bool add_generation_prompt, const nlohmann::ordered_json &extra_context = nlohmann::ordered_json(), bool apply_polyfills = true)
      (member of the chat-template engine behind the tmpl.apply call above)

  JSON Schema to Grammar Converter (Header-Only)
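Putting the pieces together, a minimal end-to-end sketch. Assumptions: `model` and `ctx` come from the usual llama.cpp loading calls; sampling and detokenization of the reply are elided.

// 1. Render the prompt and collect the stop strings.
ChatTemplateResult chat = format_chat_template_complete(
    model, R"([{"role": "user", "content": "Hello!"}])");

// 2. Tokenize the rendered prompt (llama.h API).
const auto *vocab = llama_model_get_vocab(model);
std::vector<llama_token> tokens(chat.prompt.size() + 16);
int n = llama_tokenize(vocab, chat.prompt.c_str(),
                       static_cast<int32_t>(chat.prompt.size()),
                       tokens.data(), static_cast<int32_t>(tokens.size()),
                       /*add_special=*/true, /*parse_special=*/true);
tokens.resize(n > 0 ? n : 0);

// 3. Feed the tokens through batch_clear/batch_add as in the earlier sketch,
//    call llama_decode, then sample. During generation, stop when the output
//    ends with any entry of chat.additional_stops.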