liblloyal 1.0.0
Composable primitives for llama.cpp inference
tokenizer.hpp
#pragma once

// SPDX-License-Identifier: Apache-2.0
// Copyright 2026 Lloyal Labs

#include "common.hpp"
#include <cstdint>
#include <llama/llama.h>
#include <string>
#include <vector>

namespace lloyal::tokenizer {
// ===== TOKENIZATION (TEXT → TOKENS) =====

/// Tokenize text to a token array. Returns an empty vector on failure.
inline std::vector<llama_token> tokenize(const llama_vocab *vocab,
                                         const std::string &text,
                                         bool add_special, bool parse_special) {
  if (!vocab) {
    LLOYAL_LOG_DEBUG("[tokenizer::tokenize] ERROR: vocab is null");
    return {};
  }

  // Two-pass algorithm for safety:
  // Pass 1: Determine required buffer size (negative return = size needed)
  const int n_tokens =
      -llama_tokenize(vocab, text.c_str(), static_cast<int32_t>(text.length()),
                      nullptr, // null buffer to get size
                      0, add_special, parse_special);

  if (n_tokens <= 0) {
    LLOYAL_LOG_DEBUG("[tokenizer::tokenize] WARNING: Empty tokenization result "
                     "for text: '%.50s...'",
                     text.c_str());
    return {};
  }

  // Pass 2: Actual tokenization
  std::vector<llama_token> tokens(n_tokens);
  const int n_tokenized =
      llama_tokenize(vocab, text.c_str(), static_cast<int32_t>(text.length()),
                     tokens.data(), n_tokens, add_special, parse_special);

  if (n_tokenized != n_tokens) {
    LLOYAL_LOG_DEBUG("[tokenizer::tokenize] ERROR: Token count mismatch "
                     "(expected %d, got %d)",
                     n_tokens, n_tokenized);
    return {};
  }

  LLOYAL_LOG_DEBUG("[tokenizer::tokenize] Tokenized %zu bytes → %d tokens",
                   text.length(), n_tokens);
  return tokens;
}

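// Usage sketch (illustrative only, not part of this header): tokenizing a
// prompt with the model's own BOS preference, assuming a loaded `model` and
// using get_vocab() defined below:
//
//   const llama_vocab *vocab = lloyal::tokenizer::get_vocab(model);
//   const bool add_bos = llama_vocab_get_add_bos(vocab);
//   std::vector<llama_token> prompt =
//       lloyal::tokenizer::tokenize(vocab, "Hello, world", add_bos, true);
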
// ===== DETOKENIZATION (TOKENS → TEXT) =====

/// Detokenize a SINGLE token to text (streaming use case).
inline std::string detokenize(const llama_vocab *vocab, llama_token token,
                              bool special) {
  if (!vocab) {
    LLOYAL_LOG_DEBUG("[tokenizer::detokenize] ERROR: vocab is null");
    return "";
  }

  // Two-pass algorithm (vendored from llama.cpp/common/common.cpp)
  std::string piece;
  piece.resize(
      piece.capacity()); // Use string's internal cache (15 bytes + '\0')

  const int n_chars =
      llama_token_to_piece(vocab, token, &piece[0], piece.size(), 0, special);

  if (n_chars < 0) {
    // Buffer too small, resize and retry
    piece.resize(-n_chars);
    int check =
        llama_token_to_piece(vocab, token, &piece[0], piece.size(), 0, special);
    if (check != -n_chars) {
      LLOYAL_LOG_DEBUG(
          "[tokenizer::detokenize] ERROR: Inconsistent sizing for token %d",
          token);
      return "";
    }
  } else {
    piece.resize(n_chars);
  }

  return piece;
}

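// Usage sketch (illustrative only): streaming decoded text piece by piece,
// assuming `vocab` and a hypothetical `sample_next()` that yields tokens:
//
//   llama_token tok = sample_next();
//   while (!lloyal::tokenizer::is_eog(vocab, tok)) {
//     std::cout << lloyal::tokenizer::detokenize(vocab, tok, false);
//     tok = sample_next();
//   }
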
/// Detokenize a TOKEN ARRAY to text (reconstruction use case).
inline std::string detokenize_batch(const llama_vocab *vocab,
                                    const llama_token *tokens, int32_t n_tokens,
                                    bool remove_special, bool unparse_special) {
  if (!vocab || !tokens || n_tokens <= 0) {
    LLOYAL_LOG_DEBUG("[tokenizer::detokenize_batch] ERROR: Invalid parameters "
                     "(vocab=%p, tokens=%p, n_tokens=%d)",
                     static_cast<const void *>(vocab),
                     static_cast<const void *>(tokens), n_tokens);
    return "";
  }

  // Two-pass algorithm for safety:
  // Pass 1: Determine required buffer size (negative return = size needed)
  int32_t required_size = llama_detokenize(vocab, tokens, n_tokens,
                                           nullptr, // null buffer to get size
                                           0, remove_special, unparse_special);

  if (required_size < 0) {
    // Negative return means we need abs(required_size) bytes
    required_size = -required_size;
  }

  if (required_size == 0) {
    LLOYAL_LOG_DEBUG("[tokenizer::detokenize_batch] WARNING: Empty "
                     "detokenization result for %d tokens",
                     n_tokens);
    return "";
  }

  // Pass 2: Actual detokenization
  std::vector<char> buffer(required_size + 1); // +1 for null terminator
  int32_t written =
      llama_detokenize(vocab, tokens, n_tokens, buffer.data(), required_size,
                       remove_special, unparse_special);

  if (written < 0) {
    LLOYAL_LOG_DEBUG("[tokenizer::detokenize_batch] ERROR: Detokenization "
                     "failed (needed %d bytes, got %d)",
                     required_size, written);
    return "";
  }

  std::string result(buffer.data(), written);
  LLOYAL_LOG_DEBUG(
      "[tokenizer::detokenize_batch] Detokenized %d tokens → %zu bytes",
      n_tokens, result.size());
  return result;
}

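// Usage sketch (illustrative only): round-tripping the tokens produced by
// tokenize() above, assuming `vocab` and a token vector `prompt`:
//
//   std::string text = lloyal::tokenizer::detokenize_batch(
//       vocab, prompt.data(), static_cast<int32_t>(prompt.size()),
//       /*remove_special=*/false, /*unparse_special=*/true);
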
// ===== VOCABULARY QUERIES =====

/// Get vocabulary from a model.
inline const llama_vocab *get_vocab(const llama_model *model) {
  if (!model) {
    LLOYAL_LOG_DEBUG("[tokenizer::get_vocab] ERROR: NULL model");
    return nullptr;
  }

  const llama_vocab *vocab = llama_model_get_vocab(model);
  if (!vocab) {
    LLOYAL_LOG_DEBUG(
        "[tokenizer::get_vocab] ERROR: llama_model_get_vocab returned NULL");
  }

  return vocab;
}

/// Check if a token is an end-of-generation marker.
inline bool is_eog(const llama_vocab *vocab, llama_token token) {
  if (!vocab) {
    LLOYAL_LOG_DEBUG("[tokenizer::is_eog] ERROR: vocab is null");
    return false;
  }

  return llama_vocab_is_eog(vocab, token);
}

/// Get vocabulary size (total number of tokens).
inline int32_t vocab_size(const llama_vocab *vocab) {
  if (!vocab) {
    LLOYAL_LOG_DEBUG("[tokenizer::vocab_size] ERROR: vocab is null");
    return 0;
  }

  return llama_vocab_n_tokens(vocab);
}

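// Usage sketch (illustrative only): in llama.cpp the per-token logits array
// has one entry per vocab token, so vocab_size() can size a logits buffer,
// assuming a valid `vocab`:
//
//   std::vector<float> logits(lloyal::tokenizer::vocab_size(vocab));
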
// ===== MODEL-ACCEPTING CONVENIENCE OVERLOADS =====
//
// These overloads accept llama_model* and handle vocab extraction + metadata
// queries internally. They delegate to the vocab-accepting primitives above.
//
// Benefits:
// - Eliminate boilerplate (vocab extraction, add_bos queries) in calling code
// - Reduce code duplication across projects
// - Backwards compatible: existing code is unchanged
inline std::vector<llama_token> tokenize(const llama_model *model,
                                         const std::string &text) {
  if (!model) {
    LLOYAL_LOG_DEBUG("[tokenizer::tokenize] ERROR: model is null");
    return {};
  }

  const llama_vocab *vocab = get_vocab(model);
  if (!vocab) {
    LLOYAL_LOG_DEBUG("[tokenizer::tokenize] ERROR: get_vocab returned null");
    return {};
  }

  bool add_bos = llama_vocab_get_add_bos(vocab);
  return tokenize(vocab, text, add_bos, true);
}

inline std::string detokenize(const llama_model *model, llama_token token,
                              bool special = true) {
  if (!model) {
    LLOYAL_LOG_DEBUG("[tokenizer::detokenize] ERROR: model is null");
    return "";
  }

  const llama_vocab *vocab = get_vocab(model);
  if (!vocab) {
    LLOYAL_LOG_DEBUG("[tokenizer::detokenize] ERROR: get_vocab returned null");
    return "";
  }

  return detokenize(vocab, token, special);
}

inline std::string detokenize_batch(const llama_model *model,
                                    const std::vector<llama_token> &tokens,
                                    bool remove_special = false,
                                    bool unparse_special = true) {
  if (!model) {
    LLOYAL_LOG_DEBUG("[tokenizer::detokenize_batch] ERROR: model is null");
    return "";
  }

  const llama_vocab *vocab = get_vocab(model);
  if (!vocab) {
    LLOYAL_LOG_DEBUG(
        "[tokenizer::detokenize_batch] ERROR: get_vocab returned null");
    return "";
  }

  return detokenize_batch(vocab, tokens.data(),
                          static_cast<int32_t>(tokens.size()), remove_special,
                          unparse_special);
}

inline std::string detokenize_batch(const llama_model *model,
                                    const llama_token *tokens, int32_t n_tokens,
                                    bool remove_special, bool unparse_special) {
  if (!model) {
    LLOYAL_LOG_DEBUG("[tokenizer::detokenize_batch] ERROR: model is null");
    return "";
  }

  const llama_vocab *vocab = get_vocab(model);
  if (!vocab) {
    LLOYAL_LOG_DEBUG(
        "[tokenizer::detokenize_batch] ERROR: get_vocab returned null");
    return "";
  }

  return detokenize_batch(vocab, tokens, n_tokens, remove_special,
                          unparse_special);
}

inline bool is_eog(const llama_model *model, llama_token token) {
  if (!model) {
    LLOYAL_LOG_DEBUG("[tokenizer::is_eog] ERROR: model is null");
    return false;
  }

  const llama_vocab *vocab = get_vocab(model);
  if (!vocab) {
    LLOYAL_LOG_DEBUG("[tokenizer::is_eog] ERROR: get_vocab returned null");
    return false;
  }

  return is_eog(vocab, token);
}

inline int32_t vocab_size(const llama_model *model) {
  if (!model) {
    LLOYAL_LOG_DEBUG("[tokenizer::vocab_size] ERROR: model is null");
    return 0;
  }

  const llama_vocab *vocab = get_vocab(model);
  if (!vocab) {
    LLOYAL_LOG_DEBUG("[tokenizer::vocab_size] ERROR: get_vocab returned null");
    return 0;
  }

  return vocab_size(vocab);
}

} // namespace lloyal::tokenizer
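
// End-to-end sketch (illustrative only; loader entry-point names vary across
// llama.cpp versions, so adjust to the revision you vendor):
//
//   llama_model_params mp = llama_model_default_params();
//   llama_model *model = llama_model_load_from_file("model.gguf", mp);
//   auto toks = lloyal::tokenizer::tokenize(model, "The quick brown fox");
//   std::string round_trip = lloyal::tokenizer::detokenize_batch(model, toks);
//   llama_model_free(model);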