liblloyal 1.0.0
Branched Inference for llama.cpp
Loading...
Searching...
No Matches
embedding.hpp
Go to the documentation of this file.
1#pragma once
2
3// SPDX-License-Identifier: Apache-2.0
4// Copyright 2026 Lloyal Labs
5
6
7#include "common.hpp"
8#include <common.h> // llama.cpp common library: common_batch_clear, common_batch_add
9#include <algorithm>
10#include <cmath>
11#include <cstdint>
12#include <llama/llama.h>
13#include <stdexcept>
14#include <vector>
15
42
43// ===== NORMALIZATION MODES =====
44
/// Normalization modes applied to extracted embedding vectors.
enum class Normalize : int32_t {
  None = 0, // No normalization (raw embeddings)
  L2 = 1,   // L2 normalization (unit length, required for cosine similarity)
};
52
53// ===== MODEL CAPABILITY CHECKS =====
54
64inline bool has_embeddings(const llama_model *model) {
65 if (!model) {
66 LLOYAL_LOG_DEBUG("[embedding::has_embeddings] ERROR: model is null");
67 return false;
68 }
69
70 int32_t n_embd = llama_model_n_embd(model);
71 return n_embd > 0;
72}
73
80inline int32_t dimension(const llama_model *model) {
81 if (!model) {
82 LLOYAL_LOG_DEBUG("[embedding::dimension] ERROR: model is null");
83 return 0;
84 }
85
86 return llama_model_n_embd(model);
87}
88
89// ===== CONTEXT CAPABILITY CHECKS =====
90
100inline bool has_pooling(llama_context *ctx) {
101 if (!ctx) {
102 LLOYAL_LOG_DEBUG("[embedding::has_pooling] ERROR: ctx is null");
103 return false;
104 }
105
106 return llama_pooling_type(ctx) != LLAMA_POOLING_TYPE_NONE;
107}
108
121inline int32_t pooling_type(llama_context *ctx) {
122 if (!ctx) {
123 LLOYAL_LOG_DEBUG("[embedding::pooling_type] ERROR: ctx is null");
124 return LLAMA_POOLING_TYPE_NONE;
125 }
126
127 return llama_pooling_type(ctx);
128}
129
130// ===== INTERNAL HELPERS =====
131
132namespace detail {
133
/// Apply L2 normalization to an embedding vector (in-place).
/// After the call the vector has unit Euclidean length. A vector with a
/// near-zero norm (e.g. all zeros) is left unmodified and a warning is
/// logged, since dividing by ~0 would produce inf/NaN.
/// Empty vectors are a no-op.
inline void apply_l2_normalize(std::vector<float> &vec) {
  if (vec.empty())
    return;

  // Sum of squares -> Euclidean norm.
  float norm_sq = 0.0f;
  for (float v : vec) {
    norm_sq += v * v;
  }

  float norm = std::sqrt(norm_sq);
  if (norm > 1e-8f) { // Avoid division by zero
    for (float &v : vec) {
      v /= norm;
    }
  } else {
    // NOTE: the LLOYAL_LOG_DEBUG( opener was lost in extraction; restored here.
    LLOYAL_LOG_DEBUG(
        "[embedding::detail::apply_l2_normalize] WARNING: near-zero norm");
  }
}
159
160} // namespace detail
161
162// ===== RAII GUARD FOR BATCH CLEANUP =====
163
164namespace detail {
170 llama_batch &batch;
171 explicit BatchGuard(llama_batch &b) : batch(b) {}
172 ~BatchGuard() { llama_batch_free(batch); }
173};
174} // namespace detail
175
176// ===== ENCODING (FORWARD PASS FOR EMBEDDINGS) =====
177
203inline void encode(llama_context *ctx, const llama_token *tokens,
204 int32_t n_tokens, int32_t n_batch) {
205 LLOYAL_LOG_DEBUG("[embedding::encode] Encoding %d tokens for embeddings",
206 n_tokens);
207
208 if (!ctx) {
209 LLOYAL_LOG_DEBUG("[embedding::encode] ERROR: NULL context");
210 throw std::runtime_error("embedding::encode - NULL context");
211 }
212
213 if (!tokens || n_tokens <= 0) {
214 LLOYAL_LOG_DEBUG("[embedding::encode] ERROR: Invalid token array");
215 throw std::runtime_error("embedding::encode - Invalid token array");
216 }
217
218 if (n_tokens > n_batch) {
219 LLOYAL_LOG_DEBUG("[embedding::encode] ERROR: n_tokens (%d) > n_batch (%d)",
220 n_tokens, n_batch);
221 throw std::runtime_error(
222 "embedding::encode - token count exceeds batch size (truncation not "
223 "supported, increase n_batch or reduce input length)");
224 }
225
226 // Initialize batch - single sequence
227 llama_batch batch = llama_batch_init(n_batch, 0, 1);
228 detail::BatchGuard batch_guard(batch);
229
230 // Clear batch using llama.cpp common library
231 common_batch_clear(batch);
232
233 // Add ALL tokens with logits=true (required for embedding extraction)
234 for (int32_t i = 0; i < n_tokens; ++i) {
235 common_batch_add(batch, tokens[i], i, {0}, true);
236 }
237
238 // Decode/encode the batch (llama.cpp handles encoder vs decoder internally)
239 if (llama_decode(ctx, batch) != 0) {
240 LLOYAL_LOG_DEBUG("[embedding::encode] ERROR: llama_decode failed");
241 throw std::runtime_error("embedding::encode - llama_decode failed");
242 }
243
244 LLOYAL_LOG_DEBUG("[embedding::encode] Encode complete");
245}
246
250inline void encode(llama_context *ctx, const std::vector<llama_token> &tokens,
251 int32_t n_batch) {
252 encode(ctx, tokens.data(), static_cast<int32_t>(tokens.size()), n_batch);
253}
254
255// ===== EMBEDDING EXTRACTION =====
256
272inline std::vector<float> get(llama_context *ctx,
273 Normalize normalize = Normalize::L2) {
274 if (!ctx) {
275 LLOYAL_LOG_DEBUG("[embedding::get] ERROR: ctx is null");
276 throw std::invalid_argument("embedding::get: ctx is null");
277 }
278
279 // Get model to determine embedding dimension
280 const llama_model *model = llama_get_model(ctx);
281 if (!model) {
282 LLOYAL_LOG_DEBUG("[embedding::get] ERROR: failed to get model from context");
283 throw std::runtime_error("embedding::get: failed to get model");
284 }
285
286 // Warn if pooling not enabled (embeddings may be invalid)
287 if (!has_pooling(ctx)) {
289 "[embedding::get] WARNING: pooling not enabled, embeddings may be "
290 "invalid. Create context with pooling_type != NONE");
291 }
292
293 // Get embeddings pointer from llama.cpp
294 // For pooled embeddings, use sequence-specific API (sequence 0)
295 const float *embd_ptr = nullptr;
296 if (has_pooling(ctx)) {
297 embd_ptr = llama_get_embeddings_seq(ctx, 0);
298 LLOYAL_LOG_DEBUG("[embedding::get] Using llama_get_embeddings_seq for pooled "
299 "embeddings");
300 } else {
301 embd_ptr = llama_get_embeddings(ctx);
302 LLOYAL_LOG_DEBUG("[embedding::get] Using llama_get_embeddings (no pooling)");
303 }
304
305 if (!embd_ptr) {
306 LLOYAL_LOG_DEBUG("[embedding::get] ERROR: embeddings pointer is null. "
307 "Ensure context was created with embeddings=true and "
308 "tokens were encoded with logits=true for all tokens.");
309 throw std::runtime_error(
310 "embedding::get: embeddings unavailable (ensure embeddings=true in "
311 "context params and use encode_for_embeddings())");
312 }
313
314 // Copy to vector
315 int32_t n_embd = llama_model_n_embd(model);
316 std::vector<float> embeddings(embd_ptr, embd_ptr + n_embd);
317
318 // Apply normalization
319 if (normalize == Normalize::L2) {
320 detail::apply_l2_normalize(embeddings);
321 }
322
323 LLOYAL_LOG_DEBUG("[embedding::get] Extracted embeddings (dim=%d, normalize=%d)",
324 n_embd, static_cast<int>(normalize));
325
326 return embeddings;
327}
328
342inline std::vector<float> get_seq(llama_context *ctx, llama_seq_id seq,
343 Normalize normalize = Normalize::L2) {
344 if (!ctx) {
345 LLOYAL_LOG_DEBUG("[embedding::get_seq] ERROR: ctx is null");
346 throw std::invalid_argument("embedding::get_seq: ctx is null");
347 }
348
349 const llama_model *model = llama_get_model(ctx);
350 if (!model) {
351 LLOYAL_LOG_DEBUG("[embedding::get_seq] ERROR: failed to get model");
352 throw std::runtime_error("embedding::get_seq: failed to get model");
353 }
354
355 if (!has_pooling(ctx)) {
356 LLOYAL_LOG_DEBUG("[embedding::get_seq] WARNING: pooling not enabled");
357 }
358
359 // Try sequence-specific API
360 const float *embd_ptr = llama_get_embeddings_seq(ctx, seq);
361
362 // Fallback to global embeddings for seq=0
363 if (!embd_ptr) {
364 if (seq == 0) {
365 LLOYAL_LOG_DEBUG("[embedding::get_seq] Falling back to get() for seq=0");
366 return get(ctx, normalize);
367 }
368 LLOYAL_LOG_DEBUG("[embedding::get_seq] ERROR: embeddings unavailable for "
369 "seq=%d",
370 seq);
371 throw std::runtime_error("embedding::get_seq: embeddings unavailable");
372 }
373
374 int32_t n_embd = llama_model_n_embd(model);
375 std::vector<float> embeddings(embd_ptr, embd_ptr + n_embd);
376
377 if (normalize == Normalize::L2) {
378 detail::apply_l2_normalize(embeddings);
379 }
380
381 LLOYAL_LOG_DEBUG("[embedding::get_seq] Extracted embeddings for seq=%d "
382 "(dim=%d)",
383 seq, n_embd);
384
385 return embeddings;
386}
387
401inline std::vector<float> get_ith(llama_context *ctx, int32_t idx,
402 Normalize normalize = Normalize::L2) {
403 if (!ctx) {
404 LLOYAL_LOG_DEBUG("[embedding::get_ith] ERROR: ctx is null");
405 throw std::invalid_argument("embedding::get_ith: ctx is null");
406 }
407
408 const llama_model *model = llama_get_model(ctx);
409 if (!model) {
410 LLOYAL_LOG_DEBUG("[embedding::get_ith] ERROR: failed to get model");
411 throw std::runtime_error("embedding::get_ith: failed to get model");
412 }
413
414 const float *embd_ptr = llama_get_embeddings_ith(ctx, idx);
415 if (!embd_ptr) {
416 LLOYAL_LOG_DEBUG("[embedding::get_ith] ERROR: embeddings unavailable for "
417 "idx=%d",
418 idx);
419 throw std::runtime_error("embedding::get_ith: embeddings unavailable");
420 }
421
422 int32_t n_embd = llama_model_n_embd(model);
423 std::vector<float> embeddings(embd_ptr, embd_ptr + n_embd);
424
425 if (normalize == Normalize::L2) {
426 detail::apply_l2_normalize(embeddings);
427 }
428
429 LLOYAL_LOG_DEBUG("[embedding::get_ith] Extracted embeddings for idx=%d "
430 "(dim=%d)",
431 idx, n_embd);
432
433 return embeddings;
434}
435
436// ===== SIMILARITY =====
437
/// Compute cosine similarity between two embedding vectors.
/// For L2-normalized inputs this reduces to the plain dot product, which is
/// what this function computes. Empty inputs yield 0.
///
/// @throws std::invalid_argument when the vectors differ in length
inline float cosine_similarity(const std::vector<float> &a,
                               const std::vector<float> &b) {
  if (a.size() != b.size()) {
    LLOYAL_LOG_DEBUG("[embedding::cosine_similarity] ERROR: dimension mismatch "
                     "(%zu vs %zu)",
                     a.size(), b.size());
    throw std::invalid_argument(
        "embedding::cosine_similarity: dimension mismatch");
  }
  if (a.empty()) {
    return 0.0f;
  }

  // Dot product accumulation (== cosine similarity for unit-length vectors).
  float acc = 0.0f;
  const size_t dim = a.size();
  for (size_t i = 0; i < dim; ++i) {
    acc += a[i] * b[i];
  }
  return acc;
}
474
475} // namespace lloyal::embedding
#define LLOYAL_LOG_DEBUG(...)
liblloyal - Common definitions and logging
Definition common.hpp:48
void apply_l2_normalize(std::vector< float > &vec)
Apply L2 normalization to embedding vector (in-place)
std::vector< float > get_seq(llama_context *ctx, llama_seq_id seq, Normalize normalize=Normalize::L2)
Get embeddings for specific sequence.
float cosine_similarity(const std::vector< float > &a, const std::vector< float > &b)
Compute cosine similarity between two embedding vectors.
void encode(llama_context *ctx, const llama_token *tokens, int32_t n_tokens, int32_t n_batch)
Encode tokens for embedding extraction.
std::vector< float > get(llama_context *ctx, Normalize normalize=Normalize::L2)
Get embeddings for last decoded batch.
Normalize
Normalization modes for embedding vectors.
Definition embedding.hpp:48
int32_t pooling_type(llama_context *ctx)
Get pooling type for context.
std::vector< float > get_ith(llama_context *ctx, int32_t idx, Normalize normalize=Normalize::L2)
Get embeddings for specific token index in last batch.
bool has_embeddings(const llama_model *model)
Check if model supports embeddings.
Definition embedding.hpp:64
bool has_pooling(llama_context *ctx)
Check if context has pooling enabled.
int32_t dimension(const llama_model *model)
Get embedding dimension for model.
Definition embedding.hpp:80
RAII guard for automatic batch cleanup Ensures llama_batch_free is called even if exceptions occur.