#include <llama/llama.h>

#include <cmath>
#include <cstdint>
#include <stdexcept>
#include <vector>
69 int32_t n_embd = llama_model_n_embd(model);
85 return llama_model_n_embd(model);
105 return llama_pooling_type(ctx) != LLAMA_POOLING_TYPE_NONE;
123 return LLAMA_POOLING_TYPE_NONE;
126 return llama_pooling_type(ctx);
#ifndef LLOYAL_LOG_DEBUG
#define LLOYAL_LOG_DEBUG(...)
#endif
/// @brief Apply L2 normalization to embedding vector (in-place).
///
/// Divides every element by the vector's Euclidean norm so the result has
/// unit length. When the norm is near zero (all-zero or denormal input) the
/// vector is left unchanged and a warning is logged, since dividing would
/// produce inf/NaN.
///
/// @param vec Embedding vector; modified in place. Empty vectors are a no-op.
inline void apply_l2_normalize(std::vector<float> &vec) {
  // Accumulate the sum of squares first, then take a single sqrt.
  float norm_sq = 0.0f;
  for (float v : vec) {
    norm_sq += v * v;
  }
  float norm = std::sqrt(norm_sq);
  // Guard against division by a near-zero norm; threshold reconstructed —
  // TODO(review): confirm the epsilon used upstream.
  if (norm > 1e-12f) {
    for (float &v : vec) {
      v /= norm;
    }
  } else {
    LLOYAL_LOG_DEBUG(
        "[embedding::detail::apply_l2_normalize] WARNING: near-zero norm");
  }
}
202inline void encode(llama_context *ctx,
const llama_token *tokens,
203 int32_t n_tokens, int32_t n_batch) {
209 throw std::runtime_error(
"embedding::encode - NULL context");
212 if (!tokens || n_tokens <= 0) {
214 throw std::runtime_error(
"embedding::encode - Invalid token array");
217 if (n_tokens > n_batch) {
220 throw std::runtime_error(
221 "embedding::encode - token count exceeds batch size (truncation not "
222 "supported, increase n_batch or reduce input length)");
226 llama_batch batch = llama_batch_init(n_batch, 0, 1);
230 common_batch_clear(batch);
233 for (int32_t i = 0; i < n_tokens; ++i) {
234 common_batch_add(batch, tokens[i], i, {0},
true);
238 if (llama_decode(ctx, batch) != 0) {
240 throw std::runtime_error(
"embedding::encode - llama_decode failed");
249inline void encode(llama_context *ctx,
const std::vector<llama_token> &tokens,
251 encode(ctx, tokens.data(),
static_cast<int32_t
>(tokens.size()), n_batch);
271inline std::vector<float>
get(llama_context *ctx,
275 throw std::invalid_argument(
"embedding::get: ctx is null");
279 const llama_model *model = llama_get_model(ctx);
281 LLOYAL_LOG_DEBUG(
"[embedding::get] ERROR: failed to get model from context");
282 throw std::runtime_error(
"embedding::get: failed to get model");
288 "[embedding::get] WARNING: pooling not enabled, embeddings may be "
289 "invalid. Create context with pooling_type != NONE");
294 const float *embd_ptr =
nullptr;
296 embd_ptr = llama_get_embeddings_seq(ctx, 0);
297 LLOYAL_LOG_DEBUG(
"[embedding::get] Using llama_get_embeddings_seq for pooled "
300 embd_ptr = llama_get_embeddings(ctx);
301 LLOYAL_LOG_DEBUG(
"[embedding::get] Using llama_get_embeddings (no pooling)");
306 "Ensure context was created with embeddings=true and "
307 "tokens were encoded with logits=true for all tokens.");
308 throw std::runtime_error(
309 "embedding::get: embeddings unavailable (ensure embeddings=true in "
310 "context params and use encode_for_embeddings())");
314 int32_t n_embd = llama_model_n_embd(model);
315 std::vector<float> embeddings(embd_ptr, embd_ptr + n_embd);
322 LLOYAL_LOG_DEBUG(
"[embedding::get] Extracted embeddings (dim=%d, normalize=%d)",
323 n_embd,
static_cast<int>(normalize));
341inline std::vector<float>
get_seq(llama_context *ctx, llama_seq_id seq,
345 throw std::invalid_argument(
"embedding::get_seq: ctx is null");
348 const llama_model *model = llama_get_model(ctx);
351 throw std::runtime_error(
"embedding::get_seq: failed to get model");
359 const float *embd_ptr = llama_get_embeddings_seq(ctx, seq);
365 return get(ctx, normalize);
370 throw std::runtime_error(
"embedding::get_seq: embeddings unavailable");
373 int32_t n_embd = llama_model_n_embd(model);
374 std::vector<float> embeddings(embd_ptr, embd_ptr + n_embd);
400inline std::vector<float>
get_ith(llama_context *ctx, int32_t idx,
404 throw std::invalid_argument(
"embedding::get_ith: ctx is null");
407 const llama_model *model = llama_get_model(ctx);
410 throw std::runtime_error(
"embedding::get_ith: failed to get model");
413 const float *embd_ptr = llama_get_embeddings_ith(ctx, idx);
418 throw std::runtime_error(
"embedding::get_ith: embeddings unavailable");
421 int32_t n_embd = llama_model_n_embd(model);
422 std::vector<float> embeddings(embd_ptr, embd_ptr + n_embd);
#ifndef LLOYAL_LOG_DEBUG
#define LLOYAL_LOG_DEBUG(...)
#endif
/// @brief Compute cosine similarity between two embedding vectors.
///
/// @param a First vector.
/// @param b Second vector; must have the same dimension as a.
/// @return dot(a,b) / (|a| * |b|), in [-1, 1]. Returns 0.0f when either
///         vector has zero norm (similarity is undefined there); this guard
///         is reconstructed — TODO(review): confirm upstream behavior.
/// @throws std::invalid_argument on dimension mismatch.
inline float cosine_similarity(const std::vector<float> &a,
                               const std::vector<float> &b) {
  if (a.size() != b.size()) {
    LLOYAL_LOG_DEBUG(
        "[embedding::cosine_similarity] ERROR: dimension mismatch "
        "(%zu vs %zu)",
        a.size(), b.size());
    throw std::invalid_argument(
        "embedding::cosine_similarity: dimension mismatch");
  }

  // Single pass accumulates the dot product and both squared norms.
  float dot = 0.0f;
  float norm_a_sq = 0.0f;
  float norm_b_sq = 0.0f;
  for (size_t i = 0; i < a.size(); ++i) {
    dot += a[i] * b[i];
    norm_a_sq += a[i] * a[i];
    norm_b_sq += b[i] * b[i];
  }

  float denom = std::sqrt(norm_a_sq) * std::sqrt(norm_b_sq);
  if (denom <= 0.0f) {
    return 0.0f;
  }
  return dot / denom;
}
#define LLOYAL_LOG_DEBUG(...)
liblloyal - Common definitions and logging
void apply_l2_normalize(std::vector< float > &vec)
Apply L2 normalization to embedding vector (in-place)
std::vector< float > get_seq(llama_context *ctx, llama_seq_id seq, Normalize normalize=Normalize::L2)
Get embeddings for specific sequence.
float cosine_similarity(const std::vector< float > &a, const std::vector< float > &b)
Compute cosine similarity between two embedding vectors.
void encode(llama_context *ctx, const llama_token *tokens, int32_t n_tokens, int32_t n_batch)
Encode tokens for embedding extraction.
std::vector< float > get(llama_context *ctx, Normalize normalize=Normalize::L2)
Get embeddings for last decoded batch.
Normalize
Normalization modes for embedding vectors.
int32_t pooling_type(llama_context *ctx)
Get pooling type for context.
std::vector< float > get_ith(llama_context *ctx, int32_t idx, Normalize normalize=Normalize::L2)
Get embeddings for specific token index in last batch.
bool has_embeddings(const llama_model *model)
Check if model supports embeddings.
bool has_pooling(llama_context *ctx)
Check if context has pooling enabled.
int32_t dimension(const llama_model *model)
Get embedding dimension for model.
RAII guard for automatic batch cleanup. Ensures llama_batch_free is called even if exceptions occur.
BatchGuard(llama_batch &b)