liblloyal 1.0.0
Branched Inference for llama.cpp
Loading...
Searching...
No Matches
embedding.hpp
Go to the documentation of this file.
1#pragma once
2
3// SPDX-License-Identifier: Apache-2.0
4// Copyright 2026 Lloyal Labs
5
6
7#include "common.hpp"
8#include <common.h> // llama.cpp common library: common_batch_clear, common_batch_add
9#include <algorithm>
10#include <cmath>
11#include <cstdint>
12#include <llama/llama.h>
13#include <stdexcept>
14#include <vector>
15
42
43// ===== NORMALIZATION MODES =====
44
/// Normalization modes applied to extracted embedding vectors.
enum class Normalize : int32_t {
  None = 0, // No normalization (raw embeddings)
  L2 = 1,   // L2 normalization (unit length, required for cosine similarity)
};
52
53// ===== MODEL CAPABILITY CHECKS =====
54
64inline bool has_embeddings(const llama_model *model) {
65 if (!model) {
66 LLOYAL_LOG_DEBUG("[embedding::has_embeddings] ERROR: model is null");
67 return false;
68 }
69
70 int32_t n_embd = llama_model_n_embd(model);
71 return n_embd > 0;
72}
73
80inline int32_t dimension(const llama_model *model) {
81 if (!model) {
82 LLOYAL_LOG_DEBUG("[embedding::dimension] ERROR: model is null");
83 return 0;
84 }
85
86 return llama_model_n_embd(model);
87}
88
89// ===== CONTEXT CAPABILITY CHECKS =====
90
100inline bool has_pooling(llama_context *ctx) {
101 if (!ctx) {
102 LLOYAL_LOG_DEBUG("[embedding::has_pooling] ERROR: ctx is null");
103 return false;
104 }
105
106 return llama_pooling_type(ctx) != LLAMA_POOLING_TYPE_NONE;
107}
108
121inline int32_t pooling_type(llama_context *ctx) {
122 if (!ctx) {
123 LLOYAL_LOG_DEBUG("[embedding::pooling_type] ERROR: ctx is null");
124 return LLAMA_POOLING_TYPE_NONE;
125 }
126
127 return llama_pooling_type(ctx);
128}
129
130// ===== INTERNAL HELPERS =====
131
132namespace detail {
133
/// Apply L2 normalization to an embedding vector (in-place).
/// After the call the vector has unit Euclidean length. A vector with a
/// near-zero norm (e.g. all zeros) is left unmodified and a warning is
/// logged, since dividing by ~0 would produce inf/NaN.
/// Empty vectors are a no-op.
inline void apply_l2_normalize(std::vector<float> &vec) {
  if (vec.empty())
    return;

  // Sum of squares -> Euclidean norm.
  float norm_sq = 0.0f;
  for (float v : vec) {
    norm_sq += v * v;
  }

  float norm = std::sqrt(norm_sq);
  if (norm > 1e-8f) { // Avoid division by zero
    for (float &v : vec) {
      v /= norm;
    }
  } else {
    // NOTE: the LLOYAL_LOG_DEBUG( opener was lost in extraction; restored here.
    LLOYAL_LOG_DEBUG(
        "[embedding::detail::apply_l2_normalize] WARNING: near-zero norm");
  }
}
159
160} // namespace detail
161
162// ===== RAII GUARD FOR BATCH CLEANUP =====
163
164namespace detail {
170 llama_batch &batch;
171 explicit BatchGuard(llama_batch &b) : batch(b) {}
172 ~BatchGuard() { llama_batch_free(batch); }
173};
174} // namespace detail
175
176// ===== ENCODING (FORWARD PASS FOR EMBEDDINGS) =====
177
203inline void encode(llama_context *ctx, const llama_token *tokens,
204 int32_t n_tokens, int32_t n_batch) {
205 LLOYAL_LOG_DEBUG("[embedding::encode] Encoding %d tokens for embeddings",
206 n_tokens);
207
208 if (!ctx) {
209 LLOYAL_LOG_DEBUG("[embedding::encode] ERROR: NULL context");
210 throw std::runtime_error("embedding::encode - NULL context");
211 }
212
213 if (!tokens || n_tokens <= 0) {
214 LLOYAL_LOG_DEBUG("[embedding::encode] ERROR: Invalid token array");
215 throw std::runtime_error("embedding::encode - Invalid token array");
216 }
217
218 if (n_tokens > n_batch) {
219 LLOYAL_LOG_DEBUG("[embedding::encode] ERROR: n_tokens (%d) > n_batch (%d)",
220 n_tokens, n_batch);
221 throw std::runtime_error(
222 "embedding::encode - token count exceeds batch size (truncation not "
223 "supported, increase n_batch or reduce input length)");
224 }
225
226 // Initialize batch - single sequence
227 llama_batch batch = llama_batch_init(n_batch, 0, 1);
228 detail::BatchGuard batch_guard(batch);
229
230 // Clear batch using llama.cpp common library
231 common_batch_clear(batch);
232
233 // Add ALL tokens with logits=true (required for embedding extraction)
234 for (int32_t i = 0; i < n_tokens; ++i) {
235 common_batch_add(batch, tokens[i], i, {0}, true);
236 }
237
238 // Decode/encode the batch (llama.cpp handles encoder vs decoder internally)
239 if (llama_decode(ctx, batch) != 0) {
240 LLOYAL_LOG_DEBUG("[embedding::encode] ERROR: llama_decode failed");
241 throw std::runtime_error("embedding::encode - llama_decode failed");
242 }
243
244 LLOYAL_LOG_DEBUG("[embedding::encode] Encode complete");
245}
246
250inline void encode(llama_context *ctx, const std::vector<llama_token> &tokens,
251 int32_t n_batch) {
252 encode(ctx, tokens.data(), static_cast<int32_t>(tokens.size()), n_batch);
253}
254
255// ===== EMBEDDING EXTRACTION =====
256
272inline std::vector<float> get(llama_context *ctx,
273 Normalize normalize = Normalize::L2) {
274 if (!ctx) {
275 LLOYAL_LOG_DEBUG("[embedding::get] ERROR: ctx is null");
276 throw std::invalid_argument("embedding::get: ctx is null");
277 }
278
279 // Get model to determine embedding dimension
280 const llama_model *model = llama_get_model(ctx);
281 if (!model) {
282 LLOYAL_LOG_DEBUG("[embedding::get] ERROR: failed to get model from context");
283 throw std::runtime_error("embedding::get: failed to get model");
284 }
285
286 // Warn if pooling not enabled (embeddings may be invalid)
287 if (!has_pooling(ctx)) {
289 "[embedding::get] WARNING: pooling not enabled, embeddings may be "
290 "invalid. Create context with pooling_type != NONE");
291 }
292
293 // Get embeddings pointer from llama.cpp
294 // For pooled embeddings, use sequence-specific API (sequence 0)
295 const float *embd_ptr = nullptr;
296 if (has_pooling(ctx)) {
297 embd_ptr = llama_get_embeddings_seq(ctx, 0);
298 LLOYAL_LOG_DEBUG("[embedding::get] Using llama_get_embeddings_seq for pooled "
299 "embeddings");
300 } else {
301 embd_ptr = llama_get_embeddings(ctx);
302 LLOYAL_LOG_DEBUG("[embedding::get] Using llama_get_embeddings (no pooling)");
303 }
304
305 if (!embd_ptr) {
306 LLOYAL_LOG_DEBUG("[embedding::get] ERROR: embeddings pointer is null. "
307 "Ensure context was created with embeddings=true and "
308 "tokens were encoded with logits=true for all tokens.");
309 throw std::runtime_error(
310 "embedding::get: embeddings unavailable (ensure embeddings=true in "
311 "context params and use encode_for_embeddings())");
312 }
313
314 // Copy to vector
315 int32_t n_embd = llama_model_n_embd(model);
316 std::vector<float> embeddings(embd_ptr, embd_ptr + n_embd);
317
318 // Apply normalization
319 if (normalize == Normalize::L2) {
320 detail::apply_l2_normalize(embeddings);
321 }
322
323 LLOYAL_LOG_DEBUG("[embedding::get] Extracted embeddings (dim=%d, normalize=%d)",
324 n_embd, static_cast<int>(normalize));
325
326 return embeddings;
327}
328
342inline std::vector<float> get_seq(llama_context *ctx, llama_seq_id seq,
343 Normalize normalize = Normalize::L2) {
344 if (!ctx) {
345 LLOYAL_LOG_DEBUG("[embedding::get_seq] ERROR: ctx is null");
346 throw std::invalid_argument("embedding::get_seq: ctx is null");
347 }
348
349 const llama_model *model = llama_get_model(ctx);
350 if (!model) {
351 LLOYAL_LOG_DEBUG("[embedding::get_seq] ERROR: failed to get model");
352 throw std::runtime_error("embedding::get_seq: failed to get model");
353 }
354
355 if (!has_pooling(ctx)) {
356 LLOYAL_LOG_DEBUG("[embedding::get_seq] WARNING: pooling not enabled");
357 }
358
359 // Try sequence-specific API
360 const float *embd_ptr = llama_get_embeddings_seq(ctx, seq);
361
362 // Fallback to global embeddings for seq=0
363 if (!embd_ptr) {
364 if (seq == 0) {
365 LLOYAL_LOG_DEBUG("[embedding::get_seq] Falling back to get() for seq=0");
366 return get(ctx, normalize);
367 }
368 LLOYAL_LOG_DEBUG("[embedding::get_seq] ERROR: embeddings unavailable for "
369 "seq=%d",
370 seq);
371 throw std::runtime_error("embedding::get_seq: embeddings unavailable");
372 }
373
374 int32_t n_embd = llama_model_n_embd(model);
375 std::vector<float> embeddings(embd_ptr, embd_ptr + n_embd);
376
377 if (normalize == Normalize::L2) {
378 detail::apply_l2_normalize(embeddings);
379 }
380
381 LLOYAL_LOG_DEBUG("[embedding::get_seq] Extracted embeddings for seq=%d "
382 "(dim=%d)",
383 seq, n_embd);
384
385 return embeddings;
386}
387
401inline std::vector<float> get_ith(llama_context *ctx, int32_t idx,
402 Normalize normalize = Normalize::L2) {
403 if (!ctx) {
404 LLOYAL_LOG_DEBUG("[embedding::get_ith] ERROR: ctx is null");
405 throw std::invalid_argument("embedding::get_ith: ctx is null");
406 }
407
408 const llama_model *model = llama_get_model(ctx);
409 if (!model) {
410 LLOYAL_LOG_DEBUG("[embedding::get_ith] ERROR: failed to get model");
411 throw std::runtime_error("embedding::get_ith: failed to get model");
412 }
413
414 const float *embd_ptr = llama_get_embeddings_ith(ctx, idx);
415 if (!embd_ptr) {
416 LLOYAL_LOG_DEBUG("[embedding::get_ith] ERROR: embeddings unavailable for "
417 "idx=%d",
418 idx);
419 throw std::runtime_error("embedding::get_ith: embeddings unavailable");
420 }
421
422 int32_t n_embd = llama_model_n_embd(model);
423 std::vector<float> embeddings(embd_ptr, embd_ptr + n_embd);
424
425 if (normalize == Normalize::L2) {
426 detail::apply_l2_normalize(embeddings);
427 }
428
429 LLOYAL_LOG_DEBUG("[embedding::get_ith] Extracted embeddings for idx=%d "
430 "(dim=%d)",
431 idx, n_embd);
432
433 return embeddings;
434}
435
436// ===== SIMILARITY =====
437
/// Compute cosine similarity between two embedding vectors.
/// For L2-normalized inputs this reduces to the plain dot product, which is
/// what this function computes. Empty inputs yield 0.
///
/// @throws std::invalid_argument when the vectors differ in length
inline float cosine_similarity(const std::vector<float> &a,
                               const std::vector<float> &b) {
  if (a.size() != b.size()) {
    LLOYAL_LOG_DEBUG("[embedding::cosine_similarity] ERROR: dimension mismatch "
                     "(%zu vs %zu)",
                     a.size(), b.size());
    throw std::invalid_argument(
        "embedding::cosine_similarity: dimension mismatch");
  }
  if (a.empty()) {
    return 0.0f;
  }

  // Dot product accumulation (== cosine similarity for unit-length vectors).
  float acc = 0.0f;
  const size_t dim = a.size();
  for (size_t i = 0; i < dim; ++i) {
    acc += a[i] * b[i];
  }
  return acc;
}
474
475} // namespace lloyal::embedding
#define LLOYAL_LOG_DEBUG(...)
liblloyal - Common definitions and logging
Definition common.hpp:48
void apply_l2_normalize(std::vector< float > &vec)
Apply L2 normalization to embedding vector (in-place)
std::vector< float > get_seq(llama_context *ctx, llama_seq_id seq, Normalize normalize=Normalize::L2)
Get embeddings for specific sequence.
float cosine_similarity(const std::vector< float > &a, const std::vector< float > &b)
Compute cosine similarity between two embedding vectors.
void encode(llama_context *ctx, const llama_token *tokens, int32_t n_tokens, int32_t n_batch)
Encode tokens for embedding extraction.
std::vector< float > get(llama_context *ctx, Normalize normalize=Normalize::L2)
Get embeddings for last decoded batch.
Normalize
Normalization modes for embedding vectors.
Definition embedding.hpp:48
int32_t pooling_type(llama_context *ctx)
Get pooling type for context.
std::vector< float > get_ith(llama_context *ctx, int32_t idx, Normalize normalize=Normalize::L2)
Get embeddings for specific token index in last batch.
bool has_embeddings(const llama_model *model)
Check if model supports embeddings.
Definition embedding.hpp:64
bool has_pooling(llama_context *ctx)
Check if context has pooling enabled.
int32_t dimension(const llama_model *model)
Get embedding dimension for model.
Definition embedding.hpp:80
RAII guard for automatic batch cleanup Ensures llama_batch_free is called even if exceptions occur.