liblloyal 1.0.0
Branched Inference for llama.cpp
tokenizer.hpp
#pragma once

// SPDX-License-Identifier: Apache-2.0
// Copyright 2026 Lloyal Labs

#include "common.hpp"
#include <cstdint>
#include <llama/llama.h>
#include <string>
#include <vector>

namespace lloyal::tokenizer {

// ===== TOKENIZATION (TEXT → TOKENS) =====

/**
 * @brief Tokenize text to token array
 *
 * @param vocab         Vocabulary to tokenize against (must be non-null)
 * @param text          UTF-8 text to tokenize
 * @param add_special   Add BOS/EOS where the vocab metadata requests them
 * @param parse_special Parse special-token text (e.g. "<|eot_id|>") into tokens
 * @return Token ids, or an empty vector on failure
 */
inline std::vector<llama_token> tokenize(const llama_vocab *vocab,
                                         const std::string &text,
                                         bool add_special, bool parse_special) {
  if (!vocab) {
    LLOYAL_LOG_DEBUG("[tokenizer::tokenize] ERROR: vocab is null");
    return {};
  }

  // Two-pass algorithm for safety:
  // Pass 1: Determine required buffer size (negative return = size needed)
  const int n_tokens =
      -llama_tokenize(vocab, text.c_str(), static_cast<int32_t>(text.length()),
                      nullptr, // null buffer to get size
                      0, add_special, parse_special);

  if (n_tokens <= 0) {
    LLOYAL_LOG_DEBUG("[tokenizer::tokenize] WARNING: Empty tokenization result "
                     "for text: '%.50s...'",
                     text.c_str());
    return {};
  }

  // Pass 2: Actual tokenization
  std::vector<llama_token> tokens(n_tokens);
  const int n_tokenized =
      llama_tokenize(vocab, text.c_str(), static_cast<int32_t>(text.length()),
                     tokens.data(), n_tokens, add_special, parse_special);

  if (n_tokenized != n_tokens) {
    LLOYAL_LOG_DEBUG("[tokenizer::tokenize] ERROR: Token count mismatch "
                     "(expected %d, got %d)",
                     n_tokens, n_tokenized);
    return {};
  }

  LLOYAL_LOG_DEBUG("[tokenizer::tokenize] Tokenized %zu bytes → %d tokens",
                   text.length(), n_tokens);
  return tokens;
}
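
// Usage sketch (illustrative only): tokenizing a prompt with a model loaded
// elsewhere, e.g. via llama_model_load_from_file. The example_ helper name is
// not part of this library.
inline std::vector<llama_token>
example_tokenize_prompt(const llama_model *model, const std::string &prompt) {
  const llama_vocab *vocab = llama_model_get_vocab(model);
  // add_special=true lets the vocab decide whether a BOS token is prepended;
  // parse_special=false treats text like "<|eot_id|>" as literal characters.
  return tokenize(vocab, prompt, /*add_special=*/true,
                  /*parse_special=*/false);
}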

// ===== DETOKENIZATION (TOKENS → TEXT) =====

/**
 * @brief Detokenize SINGLE token to text (streaming use case)
 *
 * @param vocab   Vocabulary (must be non-null)
 * @param token   Token id to render
 * @param special Render special/control tokens as text
 * @return Text piece, or empty string on failure
 */
inline std::string detokenize(const llama_vocab *vocab, llama_token token,
                              bool special) {
  if (!vocab) {
    LLOYAL_LOG_DEBUG("[tokenizer::detokenize] ERROR: vocab is null");
    return "";
  }

  // Two-pass algorithm (vendored from llama.cpp/common/common.cpp)
  std::string piece(64, '\0');

  const int n_chars =
      llama_token_to_piece(vocab, token, &piece[0], piece.size(), 0, special);

  if (n_chars < 0) {
    // Buffer too small, resize and retry
    piece.resize(-n_chars);
    int check =
        llama_token_to_piece(vocab, token, &piece[0], piece.size(), 0, special);
    if (check != -n_chars) {
      LLOYAL_LOG_DEBUG(
          "[tokenizer::detokenize] ERROR: Inconsistent sizing for token %d",
          token);
      return "";
    }
  } else {
    piece.resize(n_chars);
  }

  return piece;
}
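
// Streaming sketch (illustrative): append each freshly sampled token's piece
// to an output buffer. The example_ helper name is not part of this library.
inline void example_append_piece(const llama_vocab *vocab, llama_token token,
                                 std::string &out) {
  // special=false drops control tokens from user-facing output; pass true to
  // render them (useful for debugging).
  out += detokenize(vocab, token, /*special=*/false);
}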

/**
 * @brief Detokenize TOKEN ARRAY to text (reconstruction use case)
 *
 * @param vocab           Vocabulary (must be non-null)
 * @param tokens          Token ids (must be non-null)
 * @param n_tokens        Number of tokens (must be > 0)
 * @param remove_special  Strip leading BOS / trailing EOS from the output
 * @param unparse_special Render special tokens back to their text form
 * @return Reconstructed text, or empty string on failure
 */
inline std::string detokenize_batch(const llama_vocab *vocab,
                                    const llama_token *tokens, int32_t n_tokens,
                                    bool remove_special, bool unparse_special) {
  if (!vocab || !tokens || n_tokens <= 0) {
    LLOYAL_LOG_DEBUG("[tokenizer::detokenize_batch] ERROR: Invalid parameters "
                     "(vocab=%p, tokens=%p, n_tokens=%d)",
                     static_cast<const void *>(vocab),
                     static_cast<const void *>(tokens), n_tokens);
    return "";
  }

  // Two-pass algorithm for safety:
  // Pass 1: Determine required buffer size (negative return = size needed)
  int32_t required_size = llama_detokenize(vocab, tokens, n_tokens,
                                           nullptr, // null buffer to get size
                                           0, remove_special, unparse_special);

  if (required_size < 0) {
    // Negative return means we need abs(required_size) bytes
    required_size = -required_size;
  }

  if (required_size == 0) {
    LLOYAL_LOG_DEBUG("[tokenizer::detokenize_batch] WARNING: Empty "
                     "detokenization result for %d tokens",
                     n_tokens);
    return "";
  }

  // Pass 2: Actual detokenization
  std::vector<char> buffer(required_size + 1); // +1 for null terminator
  int32_t written =
      llama_detokenize(vocab, tokens, n_tokens, buffer.data(), required_size,
                       remove_special, unparse_special);

  if (written < 0) {
    LLOYAL_LOG_DEBUG("[tokenizer::detokenize_batch] ERROR: Detokenization "
                     "failed (needed %d bytes, got %d)",
                     required_size, written);
    return "";
  }

  std::string result(buffer.data(), written);
  LLOYAL_LOG_DEBUG(
      "[tokenizer::detokenize_batch] Detokenized %d tokens → %zu bytes",
      n_tokens, result.size());
  return result;
}
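
// Round-trip sketch (illustrative; the example_ helper is not library API).
// Note that tokenize → detokenize_batch is not guaranteed byte-identical for
// every vocab (e.g. leading-space normalization in SentencePiece tokenizers).
inline std::string example_roundtrip(const llama_vocab *vocab,
                                     const std::string &text) {
  std::vector<llama_token> toks =
      tokenize(vocab, text, /*add_special=*/false, /*parse_special=*/false);
  return detokenize_batch(vocab, toks.data(),
                          static_cast<int32_t>(toks.size()),
                          /*remove_special=*/false, /*unparse_special=*/true);
}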

// ===== VOCABULARY QUERIES =====

/**
 * @brief Get vocabulary from model
 *
 * @param model Model handle (must be non-null)
 * @return Vocabulary pointer, or nullptr on failure
 */
inline const llama_vocab *get_vocab(const llama_model *model) {
  if (!model) {
    LLOYAL_LOG_DEBUG("[tokenizer::get_vocab] ERROR: NULL model");
    return nullptr;
  }

  const llama_vocab *vocab = llama_model_get_vocab(model);
  if (!vocab) {
    LLOYAL_LOG_DEBUG(
        "[tokenizer::get_vocab] ERROR: llama_model_get_vocab returned NULL");
  }

  return vocab;
}
209
217inline bool is_eog(const llama_vocab *vocab, llama_token token) {
218 if (!vocab) {
219 LLOYAL_LOG_DEBUG("[tokenizer::is_eog] ERROR: vocab is null");
220 return false;
221 }
222
223 return llama_vocab_is_eog(vocab, token);
224}
225
232inline int32_t vocab_size(const llama_vocab *vocab) {
233 if (!vocab) {
234 LLOYAL_LOG_DEBUG("[tokenizer::vocab_size] ERROR: vocab is null");
235 return 0;
236 }
237
238 return llama_vocab_n_tokens(vocab);
239}

// ===== MODEL-ACCEPTING CONVENIENCE OVERLOADS =====
//
// These overloads accept llama_model* and handle vocab extraction + metadata
// queries internally. They delegate to the vocab-accepting primitives above.
//
// Benefits:
// - Eliminate boilerplate (vocab extraction, add_bos queries) in calling code
//   (see the equivalence sketch after tokenize() below)
// - Reduce code duplication across projects
// - Backwards compatible: existing code is unchanged

/**
 * @brief Tokenize text using model defaults
 *
 * Extracts the vocab and honors the model's add_bos metadata; special-token
 * text is parsed (parse_special = true).
 *
 * @param model Model handle (must be non-null)
 * @param text  UTF-8 text to tokenize
 * @return Token ids, or an empty vector on failure
 */
inline std::vector<llama_token> tokenize(const llama_model *model,
                                         const std::string &text) {
  if (!model) {
    LLOYAL_LOG_DEBUG("[tokenizer::tokenize] ERROR: model is null");
    return {};
  }

  const llama_vocab *vocab = get_vocab(model);
  if (!vocab) {
    LLOYAL_LOG_DEBUG("[tokenizer::tokenize] ERROR: get_vocab returned null");
    return {};
  }

  bool add_bos = llama_vocab_get_add_bos(vocab);
  return tokenize(vocab, text, add_bos, true);
}
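
// Equivalence sketch (illustrative): what the overload saves the caller. The
// example_ helper name is not part of this library.
inline std::vector<llama_token> example_equivalent(const llama_model *model,
                                                   const std::string &text) {
  // Spelled out with the primitives:
  const llama_vocab *vocab = get_vocab(model);
  const bool add_bos = llama_vocab_get_add_bos(vocab);
  std::vector<llama_token> via_primitives =
      tokenize(vocab, text, add_bos, true);
  (void)via_primitives;

  // One call via the convenience overload; same result:
  return tokenize(model, text);
}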

/**
 * @brief Detokenize single token using model defaults (streaming)
 *
 * @param model   Model handle (must be non-null)
 * @param token   Token id to render
 * @param special Render special/control tokens as text (default true)
 * @return Text piece, or empty string on failure
 */
inline std::string detokenize(const llama_model *model, llama_token token,
                              bool special = true) {
  if (!model) {
    LLOYAL_LOG_DEBUG("[tokenizer::detokenize] ERROR: model is null");
    return "";
  }

  const llama_vocab *vocab = get_vocab(model);
  if (!vocab) {
    LLOYAL_LOG_DEBUG("[tokenizer::detokenize] ERROR: get_vocab returned null");
    return "";
  }

  return detokenize(vocab, token, special);
}

/**
 * @brief Detokenize token vector using model defaults
 *
 * @param model           Model handle (must be non-null)
 * @param tokens          Token ids
 * @param remove_special  Strip leading BOS / trailing EOS (default false)
 * @param unparse_special Render special tokens as text (default true)
 * @return Reconstructed text, or empty string on failure
 */
inline std::string detokenize_batch(const llama_model *model,
                                    const std::vector<llama_token> &tokens,
                                    bool remove_special = false,
                                    bool unparse_special = true) {
  if (!model) {
    LLOYAL_LOG_DEBUG(
        "[tokenizer::detokenize_batch] ERROR: model is null");
    return "";
  }

  const llama_vocab *vocab = get_vocab(model);
  if (!vocab) {
    LLOYAL_LOG_DEBUG(
        "[tokenizer::detokenize_batch] ERROR: get_vocab returned null");
    return "";
  }

  return detokenize_batch(vocab, tokens.data(),
                          static_cast<int32_t>(tokens.size()), remove_special,
                          unparse_special);
}

/**
 * @brief Detokenize raw token array using model defaults
 *
 * @param model           Model handle (must be non-null)
 * @param tokens          Token ids (must be non-null)
 * @param n_tokens        Number of tokens (must be > 0)
 * @param remove_special  Strip leading BOS / trailing EOS
 * @param unparse_special Render special tokens as text
 * @return Reconstructed text, or empty string on failure
 */
inline std::string detokenize_batch(const llama_model *model,
                                    const llama_token *tokens, int32_t n_tokens,
                                    bool remove_special, bool unparse_special) {
  if (!model) {
    LLOYAL_LOG_DEBUG(
        "[tokenizer::detokenize_batch] ERROR: model is null");
    return "";
  }

  const llama_vocab *vocab = get_vocab(model);
  if (!vocab) {
    LLOYAL_LOG_DEBUG(
        "[tokenizer::detokenize_batch] ERROR: get_vocab returned null");
    return "";
  }

  return detokenize_batch(vocab, tokens, n_tokens, remove_special,
                          unparse_special);
}

/**
 * @brief Check if token is end-of-generation marker (model overload)
 *
 * @param model Model handle (must be non-null)
 * @param token Token id to test
 * @return true if token is EOG, false otherwise or on error
 */
inline bool is_eog(const llama_model *model, llama_token token) {
  if (!model) {
    LLOYAL_LOG_DEBUG("[tokenizer::is_eog] ERROR: model is null");
    return false;
  }

  const llama_vocab *vocab = get_vocab(model);
  if (!vocab) {
    LLOYAL_LOG_DEBUG("[tokenizer::is_eog] ERROR: get_vocab returned null");
    return false;
  }

  return is_eog(vocab, token);
}
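
// Decode-loop sketch (illustrative): the typical stop-check when collecting
// sampled tokens. Sampling itself (llama_decode plus a llama_sampler) is
// outside this header; example_ names are not library API.
inline std::string
example_collect_until_eog(const llama_model *model,
                          const std::vector<llama_token> &sampled) {
  std::string out;
  for (llama_token token : sampled) {
    if (is_eog(model, token)) {
      break; // stop at EOS/EOT/any end-of-generation marker
    }
    out += detokenize(model, token); // stream piece-by-piece
  }
  return out;
}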

/**
 * @brief Get vocabulary size (model overload)
 *
 * @param model Model handle (must be non-null)
 * @return Number of tokens, or 0 on error
 */
inline int32_t vocab_size(const llama_model *model) {
  if (!model) {
    LLOYAL_LOG_DEBUG("[tokenizer::vocab_size] ERROR: model is null");
    return 0;
  }

  const llama_vocab *vocab = get_vocab(model);
  if (!vocab) {
    LLOYAL_LOG_DEBUG("[tokenizer::vocab_size] ERROR: get_vocab returned null");
    return 0;
  }

  return vocab_size(vocab);
}

} // namespace lloyal::tokenizer