liblloyal 1.0.0
Branched Inference for llama.cpp
Loading...
Searching...
No Matches
tokenizer.hpp
Go to the documentation of this file.
1#pragma once
2
3// SPDX-License-Identifier: Apache-2.0
4// Copyright 2026 Lloyal Labs
5
6#include "common.hpp"
7#include <cstdint>
8#include <llama/llama.h>
9#include <string>
10#include <vector>
11
26
27// ===== TOKENIZATION (TEXT → TOKENS) =====
28
38inline std::vector<llama_token> tokenize(const llama_vocab *vocab,
39 const std::string &text,
40 bool add_special, bool parse_special) {
41 if (!vocab) {
42 LLOYAL_LOG_DEBUG("[tokenizer::tokenize] ERROR: vocab is null");
43 return {};
44 }
45
46 // Two-pass algorithm for safety:
47 // Pass 1: Determine required buffer size (negative return = size needed)
48 const int n_tokens =
49 -llama_tokenize(vocab, text.c_str(), static_cast<int32_t>(text.length()),
50 nullptr, // null buffer to get size
51 0, add_special, parse_special);
52
53 if (n_tokens <= 0) {
54 LLOYAL_LOG_DEBUG("[tokenizer::tokenize] WARNING: Empty tokenization result "
55 "for text: '%.50s...'",
56 text.c_str());
57 return {};
58 }
59
60 // Pass 2: Actual tokenization
61 std::vector<llama_token> tokens(n_tokens);
62 const int n_tokenized =
63 llama_tokenize(vocab, text.c_str(), static_cast<int32_t>(text.length()),
64 tokens.data(), n_tokens, add_special, parse_special);
65
66 if (n_tokenized != n_tokens) {
67 LLOYAL_LOG_DEBUG("[tokenizer::tokenize] ERROR: Token count mismatch "
68 "(expected %d, got %d)",
69 n_tokens, n_tokenized);
70 return {};
71 }
72
73 LLOYAL_LOG_DEBUG("[tokenizer::tokenize] Tokenized %zu bytes → %d tokens",
74 text.length(), n_tokens);
75 return tokens;
76}
77
78// ===== DETOKENIZATION (TOKENS → TEXT) =====
79
91inline std::string detokenize(const llama_vocab *vocab, llama_token token,
92 bool special) {
93 if (!vocab) {
94 LLOYAL_LOG_DEBUG("[tokenizer::detokenize] ERROR: vocab is null");
95 return "";
96 }
97
98 // Two-pass algorithm (vendored from llama.cpp/common/common.cpp)
99 std::string piece(64, '\0');
100
101 const int n_chars =
102 llama_token_to_piece(vocab, token, &piece[0], piece.size(), 0, special);
103
104 if (n_chars < 0) {
105 // Buffer too small, resize and retry
106 piece.resize(-n_chars);
107 int check =
108 llama_token_to_piece(vocab, token, &piece[0], piece.size(), 0, special);
109 if (check != -n_chars) {
111 "[tokenizer::detokenize] ERROR: Inconsistent sizing for token %d",
112 token);
113 return "";
114 }
115 } else {
116 piece.resize(n_chars);
117 }
118
119 return piece;
120}
121
135inline std::string detokenize_batch(const llama_vocab *vocab,
136 const llama_token *tokens, int32_t n_tokens,
137 bool remove_special, bool unparse_special) {
138 if (!vocab || !tokens || n_tokens <= 0) {
139 LLOYAL_LOG_DEBUG("[tokenizer::detokenize_batch] ERROR: Invalid parameters "
140 "(vocab=%p, tokens=%p, n_tokens=%d)",
141 vocab, tokens, n_tokens);
142 return "";
143 }
144
145 // Two-pass algorithm for safety:
146 // Pass 1: Determine required buffer size (negative return = size needed)
147 int32_t required_size = llama_detokenize(vocab, tokens, n_tokens,
148 nullptr, // null buffer to get size
149 0, remove_special, unparse_special);
150
151 if (required_size < 0) {
152 // Negative return means we need abs(required_size) bytes
153 required_size = -required_size;
154 }
155
156 if (required_size == 0) {
157 LLOYAL_LOG_DEBUG("[tokenizer::detokenize_batch] WARNING: Empty "
158 "detokenization result for %d tokens",
159 n_tokens);
160 return "";
161 }
162
163 // Pass 2: Actual detokenization
164 std::vector<char> buffer(required_size + 1); // +1 for null terminator
165 int32_t written =
166 llama_detokenize(vocab, tokens, n_tokens, buffer.data(), required_size,
167 remove_special, unparse_special);
168
169 if (written < 0) {
170 LLOYAL_LOG_DEBUG("[tokenizer::detokenize_batch] ERROR: Detokenization "
171 "failed (needed %d bytes, got %d)",
172 required_size, written);
173 return "";
174 }
175
176 std::string result(buffer.data(), written);
178 "[tokenizer::detokenize_batch] Detokenized %d tokens → %zu bytes",
179 n_tokens, result.size());
180 return result;
181}
182
183// ===== VOCABULARY QUERIES =====
184
194inline const llama_vocab *get_vocab(const llama_model *model) {
195 if (!model) {
196 LLOYAL_LOG_DEBUG("[tokenizer::get_vocab] ERROR: NULL model");
197 return nullptr;
198 }
199
200 const llama_vocab *vocab = llama_model_get_vocab(model);
201 if (!vocab) {
203 "[tokenizer::get_vocab] ERROR: llama_model_get_vocab returned NULL");
204 }
205
206 return vocab;
207}
208
216inline bool is_eog(const llama_vocab *vocab, llama_token token) {
217 if (!vocab) {
218 LLOYAL_LOG_DEBUG("[tokenizer::is_eog] ERROR: vocab is null");
219 return false;
220 }
221
222 return llama_vocab_is_eog(vocab, token);
223}
224
231inline int32_t vocab_size(const llama_vocab *vocab) {
232 if (!vocab) {
233 LLOYAL_LOG_DEBUG("[tokenizer::vocab_size] ERROR: vocab is null");
234 return 0;
235 }
236
237 return llama_vocab_n_tokens(vocab);
238}
239
240// ===== MODEL-ACCEPTING CONVENIENCE OVERLOADS =====
241//
242// These overloads accept llama_model* and handle vocab extraction + metadata
243// queries internally. They delegate to the vocab-accepting primitives above.
244//
245// Benefits:
246// - Eliminate boilerplate (vocab extraction, add_bos queries) in calling code
247// - Reduce code duplication across projects
248// - Backwards compatible - existing code unchanged
249
262inline std::vector<llama_token> tokenize(const llama_model *model,
263 const std::string &text) {
264 if (!model) {
265 LLOYAL_LOG_DEBUG("[tokenizer::tokenize] ERROR: model is null");
266 return {};
267 }
268
269 const llama_vocab *vocab = get_vocab(model);
270 if (!vocab) {
271 LLOYAL_LOG_DEBUG("[tokenizer::tokenize] ERROR: get_vocab returned null");
272 return {};
273 }
274
275 bool add_bos = llama_vocab_get_add_bos(vocab);
276 return tokenize(vocab, text, add_bos, true);
277}
278
287inline std::string detokenize(const llama_model *model, llama_token token,
288 bool special = true) {
289 if (!model) {
290 LLOYAL_LOG_DEBUG("[tokenizer::detokenize] ERROR: model is null");
291 return "";
292 }
293
294 const llama_vocab *vocab = get_vocab(model);
295 if (!vocab) {
296 LLOYAL_LOG_DEBUG("[tokenizer::detokenize] ERROR: get_vocab returned null");
297 return "";
298 }
299
300 return detokenize(vocab, token, special);
301}
302
314inline std::string detokenize_batch(const llama_model *model,
315 const std::vector<llama_token> &tokens,
316 bool remove_special = false,
317 bool unparse_special = true) {
318 if (!model) {
320 "[tokenizer::detokenize_batch] ERROR: model is null");
321 return "";
322 }
323
324 const llama_vocab *vocab = get_vocab(model);
325 if (!vocab) {
327 "[tokenizer::detokenize_batch] ERROR: get_vocab returned null");
328 return "";
329 }
330
331 return detokenize_batch(vocab, tokens.data(),
332 static_cast<int32_t>(tokens.size()), remove_special,
333 unparse_special);
334}
335
346inline std::string detokenize_batch(const llama_model *model,
347 const llama_token *tokens, int32_t n_tokens,
348 bool remove_special, bool unparse_special) {
349 if (!model) {
351 "[tokenizer::detokenize_batch] ERROR: model is null");
352 return "";
353 }
354
355 const llama_vocab *vocab = get_vocab(model);
356 if (!vocab) {
358 "[tokenizer::detokenize_batch] ERROR: get_vocab returned null");
359 return "";
360 }
361
362 return detokenize_batch(vocab, tokens, n_tokens, remove_special,
363 unparse_special);
364}
365
373inline bool is_eog(const llama_model *model, llama_token token) {
374 if (!model) {
375 LLOYAL_LOG_DEBUG("[tokenizer::is_eog] ERROR: model is null");
376 return false;
377 }
378
379 const llama_vocab *vocab = get_vocab(model);
380 if (!vocab) {
381 LLOYAL_LOG_DEBUG("[tokenizer::is_eog] ERROR: get_vocab returned null");
382 return false;
383 }
384
385 return is_eog(vocab, token);
386}
387
394inline int32_t vocab_size(const llama_model *model) {
395 if (!model) {
396 LLOYAL_LOG_DEBUG("[tokenizer::vocab_size] ERROR: model is null");
397 return 0;
398 }
399
400 const llama_vocab *vocab = get_vocab(model);
401 if (!vocab) {
402 LLOYAL_LOG_DEBUG("[tokenizer::vocab_size] ERROR: get_vocab returned null");
403 return 0;
404 }
405
406 return vocab_size(vocab);
407}
408
409} // namespace lloyal::tokenizer
#define LLOYAL_LOG_DEBUG(...)
liblloyal - Common definitions and logging
Definition common.hpp:47
int32_t vocab_size(const llama_vocab *vocab)
Get vocabulary size (total number of tokens)
std::vector< llama_token > tokenize(const llama_vocab *vocab, const std::string &text, bool add_special, bool parse_special)
Tokenize text to token array.
Definition tokenizer.hpp:38
std::string detokenize_batch(const llama_vocab *vocab, const llama_token *tokens, int32_t n_tokens, bool remove_special, bool unparse_special)
Detokenize TOKEN ARRAY to text (reconstruction use case)
const llama_vocab * get_vocab(const llama_model *model)
Get vocabulary from model.
std::string detokenize(const llama_vocab *vocab, llama_token token, bool special)
Detokenize SINGLE token to text (streaming use case)
Definition tokenizer.hpp:91
bool is_eog(const llama_vocab *vocab, llama_token token)
Check if token is end-of-generation marker.