#include <llama/ggml.h>
#include <llama/llama.h>
#include <lloyal/nlohmann/json.hpp>
// ... (standard library includes and other declarations elided from this listing)
std::string common_token_to_piece(const struct llama_vocab *vocab,
                                  llama_token token, bool special);

std::string get_token_safe(const llama_model *model, llama_token token);

std::string apply_chat_template_helper(const std::string &template_str,
                                       const nlohmann::ordered_json &messages,
                                       const std::string &bos_token,
                                       const std::string &eos_token,
                                       bool add_generation_prompt,
                                       bool add_bos,
                                       bool add_eos);
using json = nlohmann::ordered_json;

inline void batch_clear(llama_batch &batch) { batch.n_tokens = 0; }
inline void batch_add(llama_batch &batch, llama_token id, int32_t pos,
                      const std::vector<llama_seq_id> &seq_ids, bool logits,
                      int32_t capacity = -1) {
  // Enforce the capacity check only when the caller supplies a capacity.
  if (capacity >= 0) {
    assert(batch.n_tokens < capacity && "batch_add: token capacity exceeded");
  }

  const auto i = batch.n_tokens;
  batch.token[i] = id;
  batch.pos[i] = pos;
  batch.n_seq_id[i] = static_cast<int32_t>(seq_ids.size());
  for (size_t j = 0; j < seq_ids.size(); ++j) {
    batch.seq_id[i][j] = seq_ids[j];
  }
  batch.logits[i] = logits ? 1 : 0;

  batch.n_tokens++;
}
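A minimal usage sketch for the two batch helpers. The `ctx` and `prompt_tokens` names are placeholders (an initialized llama_context and an already-tokenized prompt), not part of this header; llama_batch_init, llama_decode and llama_batch_free are llama.h APIs.

// Fill one batch for a single sequence, requesting logits only for the last token.
std::vector<llama_token> prompt_tokens = /* tokenized prompt */ {};
const int32_t capacity = 512;
llama_batch batch = llama_batch_init(capacity, /*embd=*/0, /*n_seq_max=*/1);

batch_clear(batch);
for (size_t i = 0; i < prompt_tokens.size(); ++i) {
  const bool want_logits = (i + 1 == prompt_tokens.size());
  batch_add(batch, prompt_tokens[i], static_cast<int32_t>(i), {0}, want_logits, capacity);
}
llama_decode(ctx, batch);
llama_batch_free(batch);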
inline std::string
format_chat_template_from_model(const llama_model *model,
                                const std::string &messages_json,
                                const std::string &template_override = "") {
  try {
    json messages = json::parse(messages_json);

    // Resolve the template: an explicit override wins, otherwise use the
    // template embedded in the model.
    std::string template_str;
    if (!template_override.empty()) {
      template_str = template_override;
    } else {
      const char *model_template = llama_model_chat_template(model, nullptr);
      if (model_template && strlen(model_template) > 0) {
        template_str = model_template;
      }
    }

    if (template_str.empty()) {
      // ... fall back (e.g. to the built-in ChatML template) ...
    }

    std::string bos_token, eos_token;
    bool add_bos = false, add_eos = false;

    const auto *vocab = llama_model_get_vocab(model);
    // ... look up the BOS/EOS token text (elided) ...
    add_bos = llama_vocab_get_add_bos(vocab);
    add_eos = llama_vocab_get_add_eos(vocab);

    return apply_chat_template_helper(template_str, messages, bos_token,
                                      eos_token, true, add_bos, add_eos);
  } catch (const std::exception &e) {
    // ... error handling (elided) ...
  }
}
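A usage sketch, assuming `model` is an already-loaded llama_model pointer; the payload is a JSON array of role/content objects, the shape the ChatML fallback template iterates over.

const std::string messages_json = R"([
  {"role": "system", "content": "You are a helpful assistant."},
  {"role": "user",   "content": "Hello!"}
])";
std::string prompt = format_chat_template_from_model(model, messages_json);
// Pass a Jinja template string as the third argument to bypass the model's template.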
inline std::vector<std::string>
extract_template_stop_tokens(const llama_model *model,
                             const std::string &template_str) {
  std::vector<std::string> stops;

  const auto *vocab = llama_model_get_vocab(model);

  // Returns the token text if it maps to exactly one vocab token,
  // otherwise an empty string.
  const auto get_token_if_exists =
      [&](const std::string &token_text) -> std::string {
    std::vector<llama_token> tokens(1);
    int n_tokens = llama_tokenize(vocab, token_text.c_str(),
                                  static_cast<int32_t>(token_text.length()),
                                  tokens.data(), 1, false, true);
    if (n_tokens == 1) {
      return token_text;
    }
    return "";
  };

  // ChatML-style templates.
  if (template_str.find("im_start") != std::string::npos) {
    auto token = get_token_if_exists("<|im_end|>");
    if (!token.empty()) {
      stops.push_back(token);
    }
    token = get_token_if_exists("<|endoftext|>");
    if (!token.empty()) {
      stops.push_back(token);
    }
  }

  // Llama 3 style templates.
  if (template_str.find("eom_id") != std::string::npos ||
      template_str.find("eot_id") != std::string::npos) {
    auto token = get_token_if_exists("<|eom_id|>");
    if (!token.empty()) {
      stops.push_back(token);
    }
    token = get_token_if_exists("<|eot_id|>");
    if (!token.empty()) {
      stops.push_back(token);
    }
  }

  // Always consider the vocab's end-of-turn token, if it defines one.
  auto eot_token = llama_vocab_eot(vocab);
  if (eot_token != LLAMA_TOKEN_NULL) {
    std::string eot_text = common_token_to_piece(vocab, eot_token, true);
    if (!eot_text.empty() &&
        std::find(stops.begin(), stops.end(), eot_text) == stops.end()) {
      stops.push_back(eot_text);
    }
  }

  return stops;
}
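A quick sketch of calling it with the model's own template string (again assuming a loaded `model`); the exact strings returned depend on the template and vocab.

const char *tmpl = llama_model_chat_template(model, nullptr);
std::vector<std::string> stops =
    extract_template_stop_tokens(model, tmpl ? tmpl : get_chatml_template());
// For a ChatML-style model this typically yields {"<|im_end|>", ...}.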
inline ChatTemplateResult
format_chat_template_complete(const llama_model *model,
                              const std::string &messages_json,
                              const std::string &template_override = "") {
  try {
    json messages = json::parse(messages_json);

    std::string template_str;
    if (!template_override.empty()) {
      template_str = template_override;
    } else {
      const char *model_template = llama_model_chat_template(model, nullptr);
      if (model_template && strlen(model_template) > 0) {
        template_str = model_template;
      }
    }

    if (template_str.empty()) {
      // ... fall back (e.g. to the built-in ChatML template) ...
    }

    std::string bos_token, eos_token;
    bool add_bos = false, add_eos = false;

    const auto *vocab = llama_model_get_vocab(model);
    // ... look up the BOS/EOS token text (elided) ...
    add_bos = llama_vocab_get_add_bos(vocab);
    add_eos = llama_vocab_get_add_eos(vocab);

    ChatTemplateResult result;
    result.prompt = apply_chat_template_helper(template_str, messages, bos_token,
                                               eos_token, true, add_bos, add_eos);
    // ... collect template-specific stop tokens into result.additional_stops ...
    return result;
  } catch (const std::exception &e) {
    // ... error handling (elided) ...
  }
}
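Usage sketch: one call that produces both the rendered prompt and the stop strings a streaming loop should watch for (the `messages_json` payload has the same shape as in the earlier sketch).

ChatTemplateResult res = format_chat_template_complete(model, messages_json);
// res.prompt           -> formatted text ready for tokenization
// res.additional_stops -> e.g. {"<|im_end|>"} for ChatML-style templates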
// ... (additional try/catch-wrapped helper definitions elided from this listing) ...
inline const std::vector<ggml_type> &get_kv_cache_types() {
  static const std::vector<ggml_type> types = {
      GGML_TYPE_F32,    GGML_TYPE_F16,  GGML_TYPE_BF16,
      GGML_TYPE_Q8_0,   GGML_TYPE_Q4_0, GGML_TYPE_Q4_1,
      GGML_TYPE_IQ4_NL, GGML_TYPE_Q5_0, GGML_TYPE_Q5_1};
  return types;
}

inline ggml_type kv_cache_type_from_str(const std::string &s) {
  const auto &kv_cache_types = get_kv_cache_types();
  for (const auto &type : kv_cache_types) {
    if (ggml_type_name(type) == s) {
      return type;
    }
  }
  throw std::runtime_error("Unsupported cache type: " + s);
}
inline bool is_truthy(const std::string &value) {
  return value == "on" || value == "enabled" || value == "1" || value == "true";
}

inline bool is_falsey(const std::string &value) {
  return value == "off" || value == "disabled" || value == "0" ||
         value == "false";
}

inline bool is_autoy(const std::string &value) {
  return value == "auto" || value == "-1";
}
inline std::string string_repeat(const std::string &str, size_t n) {
  std::string result;
  result.reserve(str.length() * n);
  for (size_t i = 0; i < n; ++i) {
    result += str;
  }
  return result;
}

inline std::string string_join(const std::vector<std::string> &values,
                               const std::string &separator) {
  std::ostringstream result;
  for (size_t i = 0; i < values.size(); ++i) {
    if (i > 0) {
      result << separator;
    }
    result << values[i];
  }
  return result.str();
}

inline std::vector<std::string> string_split(const std::string &str,
                                             const std::string &delimiter) {
  std::vector<std::string> parts;
  size_t start = 0;
  size_t end = str.find(delimiter);

  while (end != std::string::npos) {
    parts.push_back(str.substr(start, end - start));
    start = end + delimiter.length();
    end = str.find(delimiter, start);
  }
  parts.push_back(str.substr(start));
  return parts;
}
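For reference, the expected behavior of the three string helpers (results shown as comments).

std::vector<std::string> parts = string_split("a,b,c", ",");  // {"a", "b", "c"}
std::string joined = string_join(parts, " | ");               // "a | b | c"
std::string rule = string_repeat("-", 8);                     // "--------"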
inline std::string common_token_to_piece(const struct llama_vocab *vocab,
                                         llama_token token, bool special) {
  std::string piece;
  piece.resize(piece.capacity());
  const int n_chars =
      llama_token_to_piece(vocab, token, &piece[0], piece.size(), 0, special);
  if (n_chars < 0) {
    // A negative return means the buffer was too small; -n_chars is the required size.
    piece.resize(-n_chars);
    const int check =
        llama_token_to_piece(vocab, token, &piece[0], piece.size(), 0, special);
    assert(check == -n_chars);
  } else {
    piece.resize(n_chars);
  }
  return piece;
}

inline std::string get_token_safe(const llama_model *model, llama_token token) {
  if (!model || token == LLAMA_TOKEN_NULL) {
    return "";
  }
  const auto *vocab = llama_model_get_vocab(model);
  return common_token_to_piece(vocab, token, true);
}
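A sketch: rendering two well-known tokens as readable text. Assumes a loaded `model`; llama_vocab_bos and llama_vocab_eos are llama.h lookups.

const auto *vocab = llama_model_get_vocab(model);
std::string bos_text = get_token_safe(model, llama_vocab_bos(vocab));
std::string eos_text = common_token_to_piece(vocab, llama_vocab_eos(vocab), /*special=*/true);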
inline const char *get_chatml_template() {
  return "{% for message in messages %}"
         "{{'<|im_start|>' + message['role'] + '\\n' + message['content'] + "
         "'<|im_end|>' + '\\n'}}"
         "{% endfor %}"
         "{% if add_generation_prompt %}{{'<|im_start|>assistant\\n'}}{% endif "
         "%}";
}
inline std::string apply_chat_template_helper(
    const std::string &template_str, const json &messages,
    const std::string &bos_token = "", const std::string &eos_token = "",
    bool add_generation_prompt = true, bool add_bos = false,
    bool add_eos = false) {
  try {
    // ... construct the template object (tmpl), its inputs and options (elided) ...
    inputs.tools = json::array();
    // ...
    inputs.now = std::chrono::system_clock::now();
    // ...
    auto result = tmpl.apply(inputs, opts);

    // If the vocab adds BOS/EOS at tokenization time, strip any copies the
    // template itself already rendered so they are not duplicated.
    if (add_bos && !bos_token.empty() && result.starts_with(bos_token)) {
      result = result.substr(bos_token.length());
    }
    if (add_eos && !eos_token.empty() && result.ends_with(eos_token)) {
      result = result.substr(0, result.length() - eos_token.length());
    }
    return result;
  } catch (const std::exception &e) {
    // ... error handling (elided) ...
  }
}
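Usage sketch, pairing the helper with the built-in ChatML fallback; the commented output is derived from that template's text above.

json msgs = json::parse(R"([{"role": "user", "content": "Hi"}])");
std::string prompt = apply_chat_template_helper(get_chatml_template(), msgs);
// Roughly: "<|im_start|>user\nHi<|im_end|>\n<|im_start|>assistant\n"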
Member reference:

  void batch_clear(llama_batch &batch)
      Clear batch to empty state.

  void batch_add(llama_batch &batch, llama_token id, int32_t pos, const std::vector<llama_seq_id> &seq_ids, bool logits, int32_t capacity = -1)
      Add single token to batch with position and sequence info.

  using json = nlohmann::ordered_json

  const char *get_chatml_template()

  std::string apply_chat_template_helper(const std::string &template_str, const nlohmann::ordered_json &messages, const std::string &bos_token, const std::string &eos_token, bool add_generation_prompt, bool add_bos, bool add_eos)

  std::string format_chat_template_from_model(const llama_model *model, const std::string &messages_json, const std::string &template_override = "")
      Format chat messages using the model's built-in template.

  ChatTemplateResult format_chat_template_complete(const llama_model *model, const std::string &messages_json, const std::string &template_override = "")
      Complete chat template processing with stop token detection.

  std::vector<std::string> extract_template_stop_tokens(const llama_model *model, const std::string &template_str)
      Dynamically detect stop tokens from the chat template.

  bool validate_chat_template_helper(const std::string &template_str)
      Validate chat template syntax.

  std::string get_token_safe(const llama_model *model, llama_token token)

  std::string common_token_to_piece(const struct llama_vocab *vocab, llama_token token, bool special)

  ggml_type kv_cache_type_from_str(const std::string &s)
      Convert a cache type string to its ggml_type enum.

  const std::vector<ggml_type> &get_kv_cache_types()
      Get the list of supported KV cache types.

  bool is_truthy(const std::string &value)
      Check if a string represents a truthy value.

  bool is_falsey(const std::string &value)
      Check if a string represents a falsey value.

  bool is_autoy(const std::string &value)
      Check if a string represents an auto value.

  std::string string_repeat(const std::string &str, size_t n)

  std::string string_join(const std::vector<std::string> &values, const std::string &separator)

  std::vector<std::string> string_split(const std::string &str, const std::string &delimiter)

  struct ChatTemplateResult
      Result from complete chat template processing.

      std::string prompt
          Formatted chat prompt ready for tokenization.

      std::vector<std::string> additional_stops
          Template-specific stop tokens (e.g., "<|im_end|>", "<|eot_id|>").

Also referenced by this page:

  std::string apply(const nlohmann::ordered_json &messages, const nlohmann::ordered_json &tools, bool add_generation_prompt, const nlohmann::ordered_json &extra_context = nlohmann::ordered_json(), bool apply_polyfills = true)
      (member of the chat-template engine behind the tmpl.apply call above)

  JSON Schema to Grammar Converter (Header-Only)
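Putting the pieces together, a minimal end-to-end sketch. Assumptions: `model` and `ctx` come from the usual llama.cpp loading calls; sampling and detokenization of the reply are elided.

// 1. Render the prompt and collect the stop strings.
ChatTemplateResult chat = format_chat_template_complete(
    model, R"([{"role": "user", "content": "Hello!"}])");

// 2. Tokenize the rendered prompt (llama.h API).
const auto *vocab = llama_model_get_vocab(model);
std::vector<llama_token> tokens(chat.prompt.size() + 16);
int n = llama_tokenize(vocab, chat.prompt.c_str(),
                       static_cast<int32_t>(chat.prompt.size()),
                       tokens.data(), static_cast<int32_t>(tokens.size()),
                       /*add_special=*/true, /*parse_special=*/true);
tokens.resize(n > 0 ? n : 0);

// 3. Feed the tokens through batch_clear/batch_add as in the earlier sketch,
//    call llama_decode, then sample. During generation, stop when the output
//    ends with any entry of chat.additional_stops.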