liblloyal 1.0.0
Composable primitives for llama.cpp inference
/home/runner/work/liblloyal/liblloyal/include/lloyal/embedding.hpp

// Check model supports embeddings
if (embedding::has_embeddings(model)) {
  int32_t dim = embedding::dimension(model);

  // Decode tokens with pooling enabled
  decoder::decode_tokens(ctx, tokens, 0, 512);

  // Extract normalized embeddings
  auto vec = embedding::get(ctx, embedding::Normalize::L2);
}
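
Embeddings extracted with Normalize::L2 can be compared directly via cosine_similarity. A minimal follow-up sketch (not from the header's own docs; assumes tokens_a and tokens_b were tokenized beforehand and the context was created with embeddings and pooling enabled):

// Encode two inputs back to back and compare them
embedding::encode(ctx, tokens_a, 512);
auto vec_a = embedding::get(ctx, embedding::Normalize::L2);

embedding::encode(ctx, tokens_b, 512); // consider clearing the KV cache between encodes
auto vec_b = embedding::get(ctx, embedding::Normalize::L2);

// For L2-normalized vectors this is the dot product, in [-1, 1]
float sim = embedding::cosine_similarity(vec_a, vec_b);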

#pragma once
// SPDX-License-Identifier: Apache-2.0
// Copyright 2026 Lloyal Labs
#include "common.hpp"
#include "helpers.hpp"
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <llama/llama.h>
#include <stdexcept>
#include <vector>
namespace lloyal::embedding {
// ===== NORMALIZATION MODES =====
enum class Normalize : int32_t {
None = 0, // No normalization (raw embeddings)
L2 = 1, // L2 normalization (unit length, required for cosine similarity)
};
// ===== MODEL CAPABILITY CHECKS =====
inline bool has_embeddings(const llama_model *model) {
if (!model) {
LLOYAL_LOG_DEBUG("[embedding::has_embeddings] ERROR: model is null");
return false;
}
int32_t n_embd = llama_model_n_embd(model);
return n_embd > 0;
}
inline int32_t dimension(const llama_model *model) {
if (!model) {
LLOYAL_LOG_DEBUG("[embedding::dimension] ERROR: model is null");
return 0;
}
return llama_model_n_embd(model);
}
// ===== CONTEXT CAPABILITY CHECKS =====
inline bool has_pooling(llama_context *ctx) {
if (!ctx) {
LLOYAL_LOG_DEBUG("[embedding::has_pooling] ERROR: ctx is null");
return false;
}
return llama_pooling_type(ctx) != LLAMA_POOLING_TYPE_NONE;
}
inline int32_t pooling_type(llama_context *ctx) {
if (!ctx) {
LLOYAL_LOG_DEBUG("[embedding::pooling_type] ERROR: ctx is null");
return LLAMA_POOLING_TYPE_NONE;
}
return llama_pooling_type(ctx);
}
// ===== INTERNAL HELPERS =====
namespace detail {
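// L2 normalization rescales a vector to unit length: v' = v / sqrt(sum_i v_i^2),
// so the dot product of two normalized vectors equals their cosine similarity.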
inline void apply_l2_normalize(std::vector<float> &vec) {
if (vec.empty())
return;
float norm_sq = 0.0f;
for (float v : vec) {
norm_sq += v * v;
}
float norm = std::sqrt(norm_sq);
if (norm > 1e-8f) { // Avoid division by zero
for (float &v : vec) {
v /= norm;
}
} else {
"[embedding::detail::apply_l2_normalize] WARNING: near-zero norm");
}
}
} // namespace detail
// ===== RAII GUARD FOR BATCH CLEANUP =====
namespace detail {
struct BatchGuard {
llama_batch &batch;
explicit BatchGuard(llama_batch &b) : batch(b) {}
~BatchGuard() { llama_batch_free(batch); }
};
} // namespace detail
// ===== ENCODING (FORWARD PASS FOR EMBEDDINGS) =====
inline void encode(llama_context *ctx, const llama_token *tokens,
int32_t n_tokens, int32_t n_batch) {
LLOYAL_LOG_DEBUG("[embedding::encode] Encoding %d tokens for embeddings",
n_tokens);
if (!ctx) {
LLOYAL_LOG_DEBUG("[embedding::encode] ERROR: NULL context");
throw std::runtime_error("embedding::encode - NULL context");
}
if (!tokens || n_tokens <= 0) {
LLOYAL_LOG_DEBUG("[embedding::encode] ERROR: Invalid token array");
throw std::runtime_error("embedding::encode - Invalid token array");
}
if (n_tokens > n_batch) {
LLOYAL_LOG_DEBUG("[embedding::encode] ERROR: n_tokens (%d) > n_batch (%d)",
n_tokens, n_batch);
throw std::runtime_error(
"embedding::encode - token count exceeds batch size (truncation not "
"supported, increase n_batch or reduce input length)");
}
// Initialize batch - single sequence
llama_batch batch = llama_batch_init(n_batch, 0, 1);
detail::BatchGuard batch_guard(batch);
// Clear batch
lloyal::batch_clear(batch);
// Add ALL tokens with logits=true (required for embedding extraction)
for (int32_t i = 0; i < n_tokens; ++i) {
lloyal::batch_add(batch, tokens[i], i, {0}, true, n_batch);
}
// Decode/encode the batch (llama.cpp handles encoder vs decoder internally)
if (llama_decode(ctx, batch) != 0) {
LLOYAL_LOG_DEBUG("[embedding::encode] ERROR: llama_decode failed");
throw std::runtime_error("embedding::encode - llama_decode failed");
}
LLOYAL_LOG_DEBUG("[embedding::encode] Encode complete");
}
inline void encode(llama_context *ctx, const std::vector<llama_token> &tokens,
int32_t n_batch) {
encode(ctx, tokens.data(), static_cast<int32_t>(tokens.size()), n_batch);
}
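// Usage sketch (illustrative, not part of the header): encode a tokenized
// input, then pull the pooled vector. Assumes ctx was created with
// embeddings enabled and pooling_type != NONE, and that `tokens` came from
// the caller's tokenizer:
//
//   std::vector<llama_token> tokens = /* tokenize input text */;
//   embedding::encode(ctx, tokens, /*n_batch=*/512);
//   auto vec = embedding::get(ctx); // L2-normalized by default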
// ===== EMBEDDING EXTRACTION =====
inline std::vector<float> get(llama_context *ctx,
Normalize normalize = Normalize::L2) {
if (!ctx) {
LLOYAL_LOG_DEBUG("[embedding::get] ERROR: ctx is null");
throw std::invalid_argument("embedding::get: ctx is null");
}
// Get model to determine embedding dimension
const llama_model *model = llama_get_model(ctx);
if (!model) {
LLOYAL_LOG_DEBUG("[embedding::get] ERROR: failed to get model from context");
throw std::runtime_error("embedding::get: failed to get model");
}
// Warn if pooling not enabled (embeddings may be invalid)
if (!has_pooling(ctx)) {
"[embedding::get] WARNING: pooling not enabled, embeddings may be "
"invalid. Create context with pooling_type != NONE");
}
// Get embeddings pointer from llama.cpp
// For pooled embeddings, use sequence-specific API (sequence 0)
const float *embd_ptr = nullptr;
if (has_pooling(ctx)) {
embd_ptr = llama_get_embeddings_seq(ctx, 0);
LLOYAL_LOG_DEBUG("[embedding::get] Using llama_get_embeddings_seq for pooled "
"embeddings");
} else {
embd_ptr = llama_get_embeddings(ctx);
LLOYAL_LOG_DEBUG("[embedding::get] Using llama_get_embeddings (no pooling)");
}
if (!embd_ptr) {
LLOYAL_LOG_DEBUG("[embedding::get] ERROR: embeddings pointer is null. "
"Ensure context was created with embeddings=true and "
"tokens were encoded with logits=true for all tokens.");
throw std::runtime_error(
"embedding::get: embeddings unavailable (ensure embeddings=true in "
"context params and encode tokens via embedding::encode())");
}
// Copy to vector
int32_t n_embd = llama_model_n_embd(model);
std::vector<float> embeddings(embd_ptr, embd_ptr + n_embd);
// Apply normalization
if (normalize == Normalize::L2) {
detail::apply_l2_normalize(embeddings);
}
LLOYAL_LOG_DEBUG("[embedding::get] Extracted embeddings (dim=%d, normalize=%d)",
n_embd, static_cast<int>(normalize));
return embeddings;
}
inline std::vector<float> get_seq(llama_context *ctx, llama_seq_id seq,
Normalize normalize = Normalize::L2) {
if (!ctx) {
LLOYAL_LOG_DEBUG("[embedding::get_seq] ERROR: ctx is null");
throw std::invalid_argument("embedding::get_seq: ctx is null");
}
const llama_model *model = llama_get_model(ctx);
if (!model) {
LLOYAL_LOG_DEBUG("[embedding::get_seq] ERROR: failed to get model");
throw std::runtime_error("embedding::get_seq: failed to get model");
}
if (!has_pooling(ctx)) {
LLOYAL_LOG_DEBUG("[embedding::get_seq] WARNING: pooling not enabled");
}
// Try sequence-specific API
const float *embd_ptr = llama_get_embeddings_seq(ctx, seq);
// Fallback to global embeddings for seq=0
if (!embd_ptr) {
if (seq == 0) {
LLOYAL_LOG_DEBUG("[embedding::get_seq] Falling back to get() for seq=0");
return get(ctx, normalize);
}
LLOYAL_LOG_DEBUG("[embedding::get_seq] ERROR: embeddings unavailable for "
"seq=%d",
seq);
throw std::runtime_error("embedding::get_seq: embeddings unavailable");
}
int32_t n_embd = llama_model_n_embd(model);
std::vector<float> embeddings(embd_ptr, embd_ptr + n_embd);
if (normalize == Normalize::L2) {
detail::apply_l2_normalize(embeddings);
}
LLOYAL_LOG_DEBUG("[embedding::get_seq] Extracted embeddings for seq=%d "
"(dim=%d)",
seq, n_embd);
return embeddings;
}
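// Usage sketch (illustrative): when several sequences are decoded in one
// batch (seq ids 0..n_seqs-1), pooled embeddings can be pulled per sequence:
//
//   for (llama_seq_id s = 0; s < n_seqs; ++s) {
//     std::vector<float> vec = embedding::get_seq(ctx, s);
//   }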
inline std::vector<float> get_ith(llama_context *ctx, int32_t idx,
Normalize normalize = Normalize::L2) {
if (!ctx) {
LLOYAL_LOG_DEBUG("[embedding::get_ith] ERROR: ctx is null");
throw std::invalid_argument("embedding::get_ith: ctx is null");
}
const llama_model *model = llama_get_model(ctx);
if (!model) {
LLOYAL_LOG_DEBUG("[embedding::get_ith] ERROR: failed to get model");
throw std::runtime_error("embedding::get_ith: failed to get model");
}
const float *embd_ptr = llama_get_embeddings_ith(ctx, idx);
if (!embd_ptr) {
LLOYAL_LOG_DEBUG("[embedding::get_ith] ERROR: embeddings unavailable for "
"idx=%d",
idx);
throw std::runtime_error("embedding::get_ith: embeddings unavailable");
}
int32_t n_embd = llama_model_n_embd(model);
std::vector<float> embeddings(embd_ptr, embd_ptr + n_embd);
if (normalize == Normalize::L2) {
detail::apply_l2_normalize(embeddings);
}
LLOYAL_LOG_DEBUG("[embedding::get_ith] Extracted embeddings for idx=%d "
"(dim=%d)",
idx, n_embd);
return embeddings;
}
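// Usage sketch (illustrative): with pooling disabled, per-token embeddings
// for the last batch are available by index (encode() marks all tokens with
// logits=true, so each index should be populated):
//
//   auto last_tok_vec = embedding::get_ith(ctx, n_tokens - 1);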
// ===== SIMILARITY =====
inline float cosine_similarity(const std::vector<float> &a,
const std::vector<float> &b) {
if (a.size() != b.size()) {
LLOYAL_LOG_DEBUG("[embedding::cosine_similarity] ERROR: dimension mismatch "
"(%zu vs %zu)",
a.size(), b.size());
throw std::invalid_argument(
"embedding::cosine_similarity: dimension mismatch");
}
if (a.empty()) {
return 0.0f;
}
// For L2-normalized vectors, cosine similarity = dot product
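// (cos(a, b) = a.b / (|a||b|); with |a| = |b| = 1 the denominator is 1.)
// If inputs were extracted with Normalize::None, this returns the raw dot
// product rather than a true cosine similarity.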
float dot = 0.0f;
for (size_t i = 0; i < a.size(); ++i) {
dot += a[i] * b[i];
}
return dot;
}
} // namespace lloyal::embedding