liblloyal 1.0.0
Composable primitives for llama.cpp inference
decoder.hpp
#pragma once

// SPDX-License-Identifier: Apache-2.0
// Copyright 2026 Lloyal Labs

#include "common.hpp"
#include "helpers.hpp"
#include <algorithm>
#include <cstdint>
#include <llama/llama.h>
#include <stdexcept>
#include <vector>

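/**
 * @def LLOYAL_STACK_BATCH
 * @brief Selects the batch construction strategy used by decode_one().
 *
 * When 1 (the default), decode_one() builds a llama_batch on the stack with
 * no heap allocation; when 0, it reuses a thread_local batch created by
 * llama_batch_init(). Because of the #ifndef guard below, the setting can be
 * overridden before including this header or on the compiler command line
 * (e.g. -DLLOYAL_STACK_BATCH=0) to opt into the ABI-safe path.
 */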
#ifndef LLOYAL_STACK_BATCH
#define LLOYAL_STACK_BATCH 1
#endif

namespace lloyal::detail {
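/**
 * @brief RAII guard for automatic batch cleanup.
 *
 * Ensures llama_batch_free() is called even if exceptions occur. A minimal
 * usage sketch, assuming a batch created with llama_batch_init():
 *
 * @code
 * llama_batch batch = llama_batch_init(512, 0, 1);
 * BatchGuard guard(batch); // batch is freed when guard leaves scope
 * @endcode
 */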
struct BatchGuard {
  llama_batch &batch;
  explicit BatchGuard(llama_batch &b) : batch(b) {}
  ~BatchGuard() { llama_batch_free(batch); }
};

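/**
 * @brief Add tokens to batch with position info.
 *
 * Clears @p batch, then appends @p n_eval tokens starting at
 * tokens[start_idx], assigning consecutive positions from @p n_past and
 * requesting logits only for the final token of the chunk.
 */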
inline void add_tokens_to_batch(llama_batch &batch, const llama_token *tokens,
                                int32_t start_idx, int32_t n_eval,
                                int32_t n_past, int32_t capacity,
                                llama_seq_id seq_id = 0) {
  // Clear batch using helpers.hpp function
  lloyal::batch_clear(batch);

  // Add tokens one by one, mark logits=true on LAST token only
  for (int32_t i = 0; i < n_eval; ++i) {
    const int32_t pos = n_past + i;
    const bool want_logits = (i == n_eval - 1);

    // Add token to specified sequence
    lloyal::batch_add(batch, tokens[start_idx + i], pos, {seq_id}, want_logits,
                      capacity);
  }
}
} // namespace lloyal::detail

namespace lloyal::decoder {

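/**
 * @brief Process tokens through model to update KV cache.
 *
 * Splits the token array into chunks of at most @p n_batch tokens and decodes
 * each chunk at consecutive positions starting from @p n_past. A minimal
 * prefill sketch; `tokenize` here is a hypothetical caller-supplied helper,
 * not part of this library:
 *
 * @code
 * std::vector<llama_token> prompt_tokens = tokenize(prompt);
 * lloyal::decoder::decode_tokens(ctx, prompt_tokens.data(),
 *                                static_cast<int32_t>(prompt_tokens.size()),
 *                                0, 512); // n_past = 0, n_batch = 512
 * // KV cache now covers positions [0, prompt_tokens.size())
 * @endcode
 *
 * @throws std::runtime_error on NULL context, invalid input, or decode
 *         failure.
 */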
inline void decode_tokens(llama_context *ctx, const llama_token *tokens,
                          int32_t n_tokens, int32_t n_past, int32_t n_batch,
                          llama_seq_id seq_id = 0) {
  LLOYAL_LOG_DEBUG(
      "[decoder::decode_tokens] Processing %d tokens at position %d", n_tokens,
      n_past);

  if (!ctx) {
    LLOYAL_LOG_DEBUG("[decoder::decode_tokens] ERROR: NULL context");
    throw std::runtime_error("decoder::decode_tokens - NULL context");
  }

  if (!tokens || n_tokens <= 0) {
    LLOYAL_LOG_DEBUG("[decoder::decode_tokens] ERROR: Invalid token array");
    throw std::runtime_error("decoder::decode_tokens - Invalid token array");
  }

  // Initialize batch with RAII cleanup
  // Single-sequence batch (n_seq_max = 1)
  llama_batch batch = llama_batch_init(n_batch, 0, 1);
  detail::BatchGuard batch_guard(batch);

  // Process tokens in chunks
  int32_t processed = 0;
  while (processed < n_tokens) {
    const int32_t n_eval = std::min(n_tokens - processed, n_batch);

    // Add chunk to batch
    detail::add_tokens_to_batch(batch, tokens, processed, n_eval, n_past,
                                n_batch, seq_id);

    // Decode chunk (updates KV cache)
    if (llama_decode(ctx, batch) != 0) {
      LLOYAL_LOG_DEBUG(
          "[decoder::decode_tokens] ERROR: llama_decode failed at position %d",
          n_past);
      throw std::runtime_error("decoder::decode_tokens - llama_decode failed");
    }

    n_past += n_eval;
    processed += n_eval;

    LLOYAL_LOG_DEBUG("[decoder::decode_tokens] Processed %d/%d tokens",
                     processed, n_tokens);
  }

  LLOYAL_LOG_DEBUG("[decoder::decode_tokens] Decode complete");
}

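/**
 * @brief Convenience overload of decode_tokens() for std::vector input.
 */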
inline void decode_tokens(llama_context *ctx,
                          const std::vector<llama_token> &tokens,
                          int32_t n_past, int32_t n_batch,
                          llama_seq_id seq_id = 0) {
  decode_tokens(ctx, tokens.data(), static_cast<int32_t>(tokens.size()), n_past,
                n_batch, seq_id);
}

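/**
 * @brief Decode a single token with zero heap allocation
 *        (when LLOYAL_STACK_BATCH=1).
 *
 * A minimal generation-step sketch; `sampled` and `n_prompt` are hypothetical
 * values produced by the caller's sampler and prefill step:
 *
 * @code
 * llama_pos pos = n_prompt; // next position after the prefilled prompt
 * lloyal::decoder::decode_one(ctx, sampled, pos);
 * // logits for the next sampling step are available via
 * // llama_get_logits_ith(ctx, 0)
 * @endcode
 */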
inline void decode_one(llama_context *ctx, llama_token tok, llama_pos pos,
                       llama_seq_id seq_id = 0, bool want_logits = true) {
  if (!ctx) {
    throw std::runtime_error("decoder::decode_one - NULL context");
  }

#if LLOYAL_STACK_BATCH
  // Fast path: zero-allocation stack-constructed batch
  // WARNING: ABI-fragile - breaks if llama_batch struct layout changes
  llama_token tok_arr[1] = {tok};
  llama_pos pos_arr[1] = {pos};
  int32_t n_seq_id_arr[1] = {1};
  llama_seq_id seq_arr[1] = {seq_id};
  llama_seq_id *seq_ptrs[1] = {seq_arr};
  int8_t logits_arr[1] = {static_cast<int8_t>(want_logits)};

  llama_batch batch{};
  batch.n_tokens = 1;
  batch.token = tok_arr;
  batch.embd = nullptr;
  batch.pos = pos_arr;
  batch.n_seq_id = n_seq_id_arr;
  batch.seq_id = seq_ptrs;
  batch.logits = logits_arr;
#else
  // Safe path: thread_local batch via llama.cpp's own initializer
  // Handles any new fields with defaults, survives ABI changes
  thread_local llama_batch batch = llama_batch_init(1, 0, 1);

  batch.n_tokens = 1;
  batch.token[0] = tok;
  batch.pos[0] = pos;
  batch.n_seq_id[0] = 1;
  batch.seq_id[0][0] = seq_id;
  batch.logits[0] = static_cast<int8_t>(want_logits);
#endif

  if (llama_decode(ctx, batch) != 0) {
    throw std::runtime_error("decoder::decode_one - llama_decode failed");
  }
}

} // namespace lloyal::decoder