liblloyal 1.0.0
Composable primitives for llama.cpp inference
helpers.hpp
#pragma once

// SPDX-License-Identifier: Apache-2.0
// Copyright 2026 Lloyal Labs

#include "common.hpp"
#include "minja/minja.hpp"

#include <algorithm>
#include <cassert>
#include <chrono>
#include <cstring>
#include <llama/ggml.h>
#include <llama/llama.h>
#include <lloyal/nlohmann/json.hpp>
#include <memory>
#include <sstream>
#include <stdexcept>
#include <string>
#include <vector>

// Forward declarations for detail namespace (defined at end of file)
namespace lloyal::detail {
std::string common_token_to_piece(const struct llama_vocab *vocab,
                                  llama_token token, bool special);
std::string get_token_safe(const llama_model *model, llama_token token);
const char *get_chatml_template();
std::string apply_chat_template_helper(const std::string &template_str,
                                       const nlohmann::ordered_json &messages,
                                       const std::string &bos_token,
                                       const std::string &eos_token,
                                       bool add_generation_prompt,
                                       bool add_bos, bool add_eos);
} // namespace lloyal::detail

namespace lloyal {

// JSON alias used by the chat template helpers
using json = nlohmann::ordered_json;

// ===== BATCH UTILITIES =====

// Reset the batch to an empty state without freeing its buffers.
inline void batch_clear(llama_batch &batch) { batch.n_tokens = 0; }

// Add a single token to the batch with its position and sequence ids.
// Pass `capacity` to enable bounds checking in debug builds.
inline void batch_add(llama_batch &batch, llama_token id, int32_t pos,
                      const std::vector<llama_seq_id> &seq_ids, bool logits,
                      int32_t capacity = -1) {
// Debug bounds checking to prevent buffer overflows
#ifdef DEBUG
  if (capacity > 0) {
    assert(batch.n_tokens < capacity && "batch_add: token capacity exceeded");
  }
#endif

  const auto i = batch.n_tokens;
  batch.token[i] = id;
  batch.pos[i] = pos;
  batch.n_seq_id[i] = static_cast<int32_t>(seq_ids.size());
  for (size_t j = 0; j < seq_ids.size(); ++j) {
    batch.seq_id[i][j] = seq_ids[j];
  }
  batch.logits[i] = logits ? 1 : 0;
  batch.n_tokens++;
}
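
// Usage sketch (illustrative, not part of the API). Assumes a loaded
// llama_context `ctx` and an already-tokenized prompt; names are hypothetical.
//
//   llama_batch batch = llama_batch_init(/*n_tokens*/ 512, /*embd*/ 0, /*n_seq_max*/ 1);
//   lloyal::batch_clear(batch);
//   for (size_t i = 0; i < prompt_tokens.size(); ++i) {
//     // request logits only for the last prompt token
//     lloyal::batch_add(batch, prompt_tokens[i], static_cast<int32_t>(i), {0},
//                       /*logits*/ i + 1 == prompt_tokens.size(), /*capacity*/ 512);
//   }
//   if (llama_decode(ctx, batch) != 0) { /* handle decode failure */ }
//   llama_batch_free(batch);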

// ===== CHAT TEMPLATE TYPES (PUBLIC API) =====

// Result of complete chat template processing.
struct ChatTemplateResult {
  // Formatted chat prompt, ready for tokenization
  std::string prompt;
  // Template-specific stop tokens (e.g. "<|im_end|>", "<|eot_id|>")
  std::vector<std::string> additional_stops;
};

// Format chat messages (a JSON array string) using the model's built-in chat
// template, an explicit override, or a ChatML fallback. Returns "" on error.
inline std::string
format_chat_template_from_model(const llama_model *model,
                                const std::string &messages_json,
                                const std::string &template_override = "") {
  try {
    json messages = json::parse(messages_json);

    // Determine template source
    std::string template_str;
    if (!template_override.empty()) {
      template_str = template_override;
    } else if (model) {
      const char *model_template = llama_model_chat_template(model, nullptr);
      if (model_template && strlen(model_template) > 0) {
        template_str = model_template;
      }
    }

    if (template_str.empty()) {
      template_str = detail::get_chatml_template();
    }

    // Get BOS/EOS tokens and metadata from the model
    std::string bos_token, eos_token;
    bool add_bos = false, add_eos = false;

    if (model) {
      const auto *vocab = llama_model_get_vocab(model);
      bos_token = detail::get_token_safe(model, llama_vocab_bos(vocab));
      eos_token = detail::get_token_safe(model, llama_vocab_eos(vocab));

      // Query GGUF metadata to determine if wrapper tokens should be stripped
      // (they'll be re-added during tokenization if the model expects them)
      add_bos = llama_vocab_get_add_bos(vocab);
      add_eos = llama_vocab_get_add_eos(vocab);
    }

    return detail::apply_chat_template_helper(template_str, messages, bos_token,
                                              eos_token, true, add_bos, add_eos);

  } catch (const std::exception &e) {
    return "";
  }
}
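
// Usage sketch (illustrative; `model` is a loaded llama_model):
//
//   std::string messages = R"([{"role":"user","content":"Hello!"}])";
//   std::string prompt = lloyal::format_chat_template_from_model(model, messages);
//   if (prompt.empty()) { /* template parsing or application failed */ }
//   // `prompt` is now ready to be tokenized and decoded.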

// Dynamically detect stop tokens implied by the chat template, keeping only
// tokens that actually exist in the model's vocabulary.
inline std::vector<std::string>
extract_template_stop_tokens(const llama_model *model,
                             const std::string &template_str) {
  std::vector<std::string> stops;

  if (!model)
    return stops;

  const auto *vocab = llama_model_get_vocab(model);
  if (!vocab)
    return stops;

  // Check whether a candidate tokenizes to a single vocabulary token
  const auto get_token_if_exists =
      [&](const std::string &token_text) -> std::string {
    std::vector<llama_token> tokens(1);
    int n_tokens = llama_tokenize(vocab, token_text.c_str(),
                                  static_cast<int32_t>(token_text.length()),
                                  tokens.data(), 1, false, true);
    if (n_tokens == 1) {
      return token_text;
    }
    return "";
  };

  // For ChatML-style templates
  if (template_str.find("im_start") != std::string::npos) {
    auto token = get_token_if_exists("<|im_end|>");
    if (!token.empty())
      stops.push_back(token);

    token = get_token_if_exists("<|endoftext|>");
    if (!token.empty())
      stops.push_back(token);
  }

  // For Llama-3 style templates
  if (template_str.find("eom_id") != std::string::npos ||
      template_str.find("eot_id") != std::string::npos) {
    auto token = get_token_if_exists("<|eom_id|>");
    if (!token.empty())
      stops.push_back(token);

    token = get_token_if_exists("<|eot_id|>");
    if (!token.empty())
      stops.push_back(token);
  }

  // Always check for the model's EOT token as a fallback
  auto eot_token = llama_vocab_eot(vocab);
  if (eot_token != LLAMA_TOKEN_NULL) {
    std::string eot_text =
        detail::common_token_to_piece(vocab, eot_token, true);
    if (!eot_text.empty() &&
        std::find(stops.begin(), stops.end(), eot_text) == stops.end()) {
      stops.push_back(eot_text);
    }
  }

  return stops;
}
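
// Usage sketch (illustrative): treat detected stops as end-of-turn markers
// while streaming. `generated` and `template_str` are hypothetical variables.
//
//   auto stops = lloyal::extract_template_stop_tokens(model, template_str);
//   bool done = std::any_of(stops.begin(), stops.end(), [&](const std::string &s) {
//     return generated.find(s) != std::string::npos;
//   });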

// Complete chat template processing: formats the prompt and detects
// template-specific stop tokens in one call.
inline ChatTemplateResult
format_chat_template_complete(const llama_model *model,
                              const std::string &messages_json,
                              const std::string &template_override = "") {
  ChatTemplateResult result;

  try {
    json messages = json::parse(messages_json);

    std::string template_str;
    if (!template_override.empty()) {
      template_str = template_override;
    } else if (model) {
      const char *model_template = llama_model_chat_template(model, nullptr);
      if (model_template && strlen(model_template) > 0) {
        template_str = model_template;
      }
    }

    if (template_str.empty()) {
      template_str = detail::get_chatml_template();
    }

    std::string bos_token, eos_token;
    bool add_bos = false, add_eos = false;

    if (model) {
      const auto *vocab = llama_model_get_vocab(model);
      bos_token = detail::get_token_safe(model, llama_vocab_bos(vocab));
      eos_token = detail::get_token_safe(model, llama_vocab_eos(vocab));

      // Query GGUF metadata to determine if wrapper tokens should be stripped
      // (they'll be re-added during tokenization if the model expects them)
      add_bos = llama_vocab_get_add_bos(vocab);
      add_eos = llama_vocab_get_add_eos(vocab);
    }

    result.prompt = detail::apply_chat_template_helper(
        template_str, messages, bos_token, eos_token, true, add_bos, add_eos);
    result.additional_stops = extract_template_stop_tokens(model, template_str);

  } catch (const std::exception &e) {
    result.prompt = "";
    result.additional_stops.clear();
  }

  return result;
}
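
// Usage sketch (illustrative): end-to-end prompt preparation. The tokenize
// call below is a plausible pattern, not a liblloyal API.
//
//   auto chat = lloyal::format_chat_template_complete(model, messages_json);
//   const auto *vocab = llama_model_get_vocab(model);
//   std::vector<llama_token> toks(chat.prompt.size() + 8);
//   int n = llama_tokenize(vocab, chat.prompt.c_str(),
//                          static_cast<int32_t>(chat.prompt.size()),
//                          toks.data(), static_cast<int32_t>(toks.size()),
//                          /*add_special*/ true, /*parse_special*/ true);
//   if (n >= 0) toks.resize(n);
//   // chat.additional_stops can be merged with any user-provided stop strings.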

// Validate chat template syntax by attempting to construct a minja template.
inline bool validate_chat_template_helper(const std::string &template_str) {
  try {
    minja::chat_template tmpl(template_str, "", "");
    return true;
  } catch (const std::exception &e) {
    return false;
  }
}
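
// Usage sketch (illustrative): vet a user-supplied override before using it.
//
//   if (!lloyal::validate_chat_template_helper(user_template)) {
//     user_template.clear(); // fall back to the model's built-in template
//   }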

// ===== PARAMETER CONVERSION HELPERS =====

// List of KV cache quantization types supported for parameter parsing.
inline const std::vector<ggml_type> &get_kv_cache_types() {
  static const std::vector<ggml_type> types = {
      GGML_TYPE_F32,    GGML_TYPE_F16,  GGML_TYPE_BF16,
      GGML_TYPE_Q8_0,   GGML_TYPE_Q4_0, GGML_TYPE_Q4_1,
      GGML_TYPE_IQ4_NL, GGML_TYPE_Q5_0, GGML_TYPE_Q5_1,
  };
  return types;
}

// Convert a cache type name (e.g. "q8_0") to its ggml_type enum.
// Throws std::runtime_error for unsupported names.
inline ggml_type kv_cache_type_from_str(const std::string &s) {
  const auto &kv_cache_types = get_kv_cache_types();
  for (const auto &type : kv_cache_types) {
    if (ggml_type_name(type) == s) {
      return type;
    }
  }
  throw std::runtime_error("Unsupported cache type: " + s);
}
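
// Usage sketch (illustrative): map a config string onto llama_context_params.
//
//   llama_context_params cparams = llama_context_default_params();
//   try {
//     cparams.type_k = lloyal::kv_cache_type_from_str("q8_0");
//     cparams.type_v = lloyal::kv_cache_type_from_str("q8_0");
//   } catch (const std::runtime_error &e) {
//     // unsupported cache type string; keep the defaults
//   }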

// Check if a string represents a truthy value ("on", "enabled", "1", "true").
inline bool is_truthy(const std::string &value) {
  return value == "on" || value == "enabled" || value == "1" || value == "true";
}

// Check if a string represents a falsey value ("off", "disabled", "0", "false").
inline bool is_falsey(const std::string &value) {
  return value == "off" || value == "disabled" || value == "0" ||
         value == "false";
}

// Check if a string represents an "auto" value ("auto", "-1").
inline bool is_autoy(const std::string &value) {
  return value == "auto" || value == "-1";
}
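
// Usage sketch (illustrative): tri-state parsing for a hypothetical setting,
// where -1 means "let the backend decide".
//
//   int flash_attn = -1;
//   if (lloyal::is_truthy(value))       flash_attn = 1;
//   else if (lloyal::is_falsey(value))  flash_attn = 0;
//   else if (lloyal::is_autoy(value))   flash_attn = -1;
//   else { /* report an invalid value */ }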

// ===== STRING UTILITIES =====

// Repeat string n times
inline std::string string_repeat(const std::string &str, size_t n) {
  if (n == 0) {
    return "";
  }

  std::string result;
  result.reserve(str.length() * n);

  for (size_t i = 0; i < n; ++i) {
    result += str;
  }

  return result;
}

// Join strings with separator
inline std::string string_join(const std::vector<std::string> &values,
                               const std::string &separator) {
  std::ostringstream result;
  for (size_t i = 0; i < values.size(); ++i) {
    if (i > 0) {
      result << separator;
    }
    result << values[i];
  }
  return result.str();
}

// Split string by delimiter
inline std::vector<std::string> string_split(const std::string &str,
                                             const std::string &delimiter) {
  std::vector<std::string> parts;

  // Guard against an empty delimiter, which would otherwise loop forever
  if (delimiter.empty()) {
    parts.push_back(str);
    return parts;
  }

  size_t start = 0;
  size_t end = str.find(delimiter);

  while (end != std::string::npos) {
    parts.push_back(str.substr(start, end - start));
    start = end + delimiter.length();
    end = str.find(delimiter, start);
  }

  parts.push_back(str.substr(start));

  return parts;
}
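
// Usage sketch (illustrative): split/join/repeat round-trip.
//
//   auto parts  = lloyal::string_split("a,b,c", ",");  // {"a", "b", "c"}
//   auto joined = lloyal::string_join(parts, " | ");   // "a | b | c"
//   auto rule   = lloyal::string_repeat("-", 8);       // "--------"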

} // namespace lloyal

namespace lloyal::detail {

// ===== INTERNAL TOKEN HELPERS =====

// Token conversion helper
inline std::string common_token_to_piece(const struct llama_vocab *vocab,
                                         llama_token token, bool special) {
  std::string piece;
  piece.resize(piece.capacity()); // use the string's small-buffer capacity first
  const int n_chars =
      llama_token_to_piece(vocab, token, &piece[0], piece.size(), 0, special);
  if (n_chars < 0) {
    // A negative return means the buffer was too small; retry with the required size
    piece.resize(-n_chars);
    int check =
        llama_token_to_piece(vocab, token, &piece[0], piece.size(), 0, special);
    assert(check == -n_chars);
  } else {
    piece.resize(n_chars);
  }
  return piece;
}
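
// Usage sketch (illustrative, internal): detokenize a generated sequence.
// `vocab` and `generated_tokens` are hypothetical.
//
//   std::string text;
//   for (llama_token t : generated_tokens) {
//     text += lloyal::detail::common_token_to_piece(vocab, t, /*special*/ false);
//   }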

// Extract a token's text from the vocabulary, tolerating null model/token
inline std::string get_token_safe(const llama_model *model, llama_token token) {
  if (!model || token == LLAMA_TOKEN_NULL) {
    return "";
  }
  const auto *vocab = llama_model_get_vocab(model);
  return common_token_to_piece(vocab, token, /* special */ true);
}

// ===== INTERNAL TEMPLATE HELPERS =====

// Default ChatML template fallback
inline const char *get_chatml_template() {
  return "{% for message in messages %}"
         "{{'<|im_start|>' + message['role'] + '\\n' + message['content'] + "
         "'<|im_end|>' + '\\n'}}"
         "{% endfor %}"
         "{% if add_generation_prompt %}{{'<|im_start|>assistant\\n'}}{% endif %}";
}

// Apply a chat template using the minja engine (requires minja.hpp).
// Implements a round-trip pattern: the template renders with wrapper tokens,
// which are then stripped conditionally based on metadata so they can be
// re-added during tokenization if the model expects them.
inline std::string apply_chat_template_helper(
    const std::string &template_str, const json &messages,
    const std::string &bos_token = "", const std::string &eos_token = "",
    bool add_generation_prompt = true, bool add_bos = false,
    bool add_eos = false) {
  try {
    // Create minja chat template (source, bos_token, eos_token)
    minja::chat_template tmpl(template_str, bos_token, eos_token);

    // Prepare template inputs
    minja::chat_template_inputs inputs;
    inputs.messages = messages;
    inputs.tools = json::array(); // No tools for basic implementation
    inputs.add_generation_prompt = add_generation_prompt;
    inputs.now = std::chrono::system_clock::now();

    // Apply the template with default options (use_bos_token=true,
    // use_eos_token=true) so that template variables like {{ bos_token }} and
    // {{ eos_token }} remain available as delimiters between messages.
    minja::chat_template_options opts;
    auto result = tmpl.apply(inputs, opts);

    // Conditional wrapper token stripping: only strip wrapper tokens at the
    // start/end if the model's metadata indicates they will be re-added during
    // tokenization. This prevents double-token issues while keeping template
    // variables available for use as delimiters.
    if (add_bos && !bos_token.empty() && result.starts_with(bos_token)) {
      result = result.substr(bos_token.length());
    }
    if (add_eos && !eos_token.empty() && result.ends_with(eos_token)) {
      result = result.substr(0, result.length() - eos_token.length());
    }

    return result;
  } catch (const std::exception &e) {
    return "";
  }
}

} // namespace lloyal::detail