liblloyal 1.0.0
Branched Inference for llama.cpp
Loading...
Searching...
No Matches
decode.hpp
Go to the documentation of this file.
1#pragma once
2
3// SPDX-License-Identifier: Apache-2.0
4// Copyright 2026 Lloyal Labs
5
6#include "common.hpp"
7#include <common.h> // llama.cpp common library: common_batch_clear, common_batch_add
8#include <algorithm>
9#include <cstdint>
10#include <llama/llama.h>
11#include <span>
12#include <stdexcept>
13#include <vector>
14
72namespace lloyal::decode {
73
124[[nodiscard]] inline int many(llama_context *ctx, const llama_token *tokens,
125 int32_t n_tokens, int32_t n_past, int32_t n_batch,
126 llama_seq_id seq_id = 0) {
128 "[decode::many] Processing %d tokens at position %d", n_tokens,
129 n_past);
130
131 if (!ctx) {
132 LLOYAL_LOG_DEBUG("[decode::many] ERROR: NULL context");
133 throw std::runtime_error("decode::many - NULL context");
134 }
135
136 if (!tokens || n_tokens <= 0) {
137 LLOYAL_LOG_DEBUG("[decode::many] ERROR: Invalid token array");
138 throw std::runtime_error("decode::many - Invalid token array");
139 }
140
141 if (n_batch <= 0) {
142 throw std::runtime_error("decode::many - n_batch must be positive");
143 }
144
145 // Thread-local batch avoids per-call allocation. Grows if needed, never shrinks.
146 struct ThreadLocalBatch {
147 llama_batch batch{};
148 int32_t capacity = 0;
149
150 void ensure(int32_t n) {
151 if (n <= capacity) return;
152 if (capacity > 0) llama_batch_free(batch);
153 batch = llama_batch_init(n, 0, 1);
154 capacity = n;
155 }
156
157 ~ThreadLocalBatch() {
158 if (capacity > 0) llama_batch_free(batch);
159 }
160 };
161 thread_local ThreadLocalBatch tl;
162 tl.ensure(n_batch);
163 llama_batch& batch = tl.batch;
164
165 // Process tokens in chunks
166 int32_t processed = 0;
167 while (processed < n_tokens) {
168 const int32_t n_eval = std::min(n_tokens - processed, n_batch);
169
170 // Clear batch using llama.cpp common library
171 common_batch_clear(batch);
172
173 // Add tokens one by one, mark logits=true only on the final chunk's last token
174 const bool is_last_chunk = (processed + n_eval >= n_tokens);
175 for (int32_t i = 0; i < n_eval; ++i) {
176 const int32_t pos = n_past + i;
177 const bool want_logits = is_last_chunk && (i == n_eval - 1);
178
179 // Add token via llama.cpp common library (function-call ABI).
180 // {seq_id} constructs a temporary vector per token — acceptable cost
181 // vs direct field writes which create struct-layout ABI coupling.
182 common_batch_add(batch, tokens[processed + i], pos, {seq_id}, want_logits);
183 }
184
185 // Decode chunk (updates KV cache)
186 const int rc = llama_decode(ctx, batch);
187 if (rc != 0) {
189 "[decode::many] ERROR: llama_decode failed at position %d (rc=%d)",
190 n_past, rc);
191 return rc;
192 }
193
194 n_past += n_eval;
195 processed += n_eval;
196
197 LLOYAL_LOG_DEBUG("[decode::many] Processed %d/%d tokens",
198 processed, n_tokens);
199 }
200
201 LLOYAL_LOG_DEBUG("[decode::many] Decode complete");
202 return 0;
203}
204
206[[nodiscard]] inline int many(llama_context *ctx,
207 const std::vector<llama_token> &tokens,
208 int32_t n_past, int32_t n_batch,
209 llama_seq_id seq_id = 0) {
210 return many(ctx, tokens.data(), static_cast<int32_t>(tokens.size()), n_past,
211 n_batch, seq_id);
212}
213
238[[nodiscard]] inline int one(llama_context *ctx, llama_token tok, llama_pos pos,
239 llama_seq_id seq_id = 0, bool want_logits = true) {
240 if (!ctx) {
241 throw std::runtime_error("decode::one - NULL context");
242 }
243
244 struct ThreadLocalBatch {
245 llama_batch batch = llama_batch_init(1, 0, 1);
246 ~ThreadLocalBatch() { llama_batch_free(batch); }
247 };
248 thread_local ThreadLocalBatch tl;
249
250 common_batch_clear(tl.batch);
251 common_batch_add(tl.batch, tok, pos, {seq_id}, want_logits);
252
253 return llama_decode(ctx, tl.batch);
254}
255
256// ============================================================================
257// Multi-Sequence Decode
258// ============================================================================
259
269
  std::span<const llama_token> tokens; // Token run to decode (non-owning view)
  llama_pos start_pos;                 // KV-cache position of the run's first token
  llama_seq_id seq_id;                 // Target sequence ID
  bool output_logits = false;          // When true, compute logits for the run's last token
};
283
// Reusable scratch buffers for multi-sequence batch construction.
// Parallel per-token arrays backing a llama_batch; index i of every vector
// describes token i. Callers (each/scatter) fill these after resize() and
// then submit via as_batch().
struct Scratch {
  std::vector<llama_token> tokens_;         // token ids
  std::vector<llama_pos> pos_;              // KV-cache position per token
  std::vector<int32_t> n_seq_id_;           // sequence count per token (callers set 1)
  std::vector<llama_seq_id> seq_id_single_; // the single seq id per token
  std::vector<llama_seq_id*> seq_id_ptrs_;  // per-token pointers into seq_id_single_;
                                            // callers rebuild them after each resize(),
                                            // since resizing may reallocate the backing store
  std::vector<int8_t> logits_;              // 1 = compute logits for this token

  // Grow/shrink all parallel arrays to hold n tokens.
  void resize(int32_t n) {
    tokens_.resize(n);
    pos_.resize(n);
    n_seq_id_.resize(n);
    seq_id_single_.resize(n);
    seq_id_ptrs_.resize(n);
    logits_.resize(n);
  }

  // Build a llama_batch that views (not copies) the scratch arrays.
  // ABI-sensitive: writes llama_batch fields directly — no common_batch_*
  // wrapper exists for externally-owned buffers. The returned batch is only
  // valid while this Scratch is alive and un-resized.
  llama_batch as_batch(int32_t n_tokens) {
    llama_batch batch{};
    batch.n_tokens = n_tokens;
    batch.token = tokens_.data();
    batch.embd = nullptr;
    batch.pos = pos_.data();
    batch.n_seq_id = n_seq_id_.data();
    batch.seq_id = seq_id_ptrs_.data();
    batch.logits = logits_.data();
    return batch;
  }
};
321
339[[nodiscard]] inline int each(llama_context* ctx,
340 const EachItem* items,
341 int32_t n,
342 Scratch& scratch) {
343 if (!ctx) {
344 throw std::runtime_error("decode::each - NULL context");
345 }
346 if (n < 0) {
347 throw std::runtime_error("decode::each - negative item count");
348 }
349 if (n == 0) return 0;
350
351 scratch.resize(n);
352
353 for (int32_t i = 0; i < n; ++i) {
354 scratch.tokens_[i] = items[i].token;
355 scratch.pos_[i] = items[i].pos;
356 scratch.n_seq_id_[i] = 1;
357 scratch.seq_id_single_[i] = items[i].seq_id;
358 scratch.seq_id_ptrs_[i] = &scratch.seq_id_single_[i];
359 scratch.logits_[i] = items[i].output_logits ? int8_t{1} : int8_t{0};
360 }
361
362 llama_batch batch = scratch.as_batch(n);
363
364 LLOYAL_LOG_DEBUG("[decode::each] Submitting %d tokens across %d sequences", n, n);
365
366 return llama_decode(ctx, batch);
367}
368
370[[nodiscard]] inline int each(llama_context* ctx,
371 const std::vector<EachItem>& items,
372 Scratch& scratch) {
373 return each(ctx, items.data(), static_cast<int32_t>(items.size()), scratch);
374}
375
395[[nodiscard]] inline int scatter(llama_context* ctx,
396 const ScatterItem* items,
397 int32_t n,
398 Scratch& scratch) {
399 if (!ctx) {
400 throw std::runtime_error("decode::scatter - NULL context");
401 }
402 if (n < 0) {
403 throw std::runtime_error("decode::scatter - negative item count");
404 }
405
406 int32_t total = 0;
407 for (int32_t i = 0; i < n; ++i) {
408 total += static_cast<int32_t>(items[i].tokens.size());
409 }
410 if (total == 0) return 0;
411
412 scratch.resize(total);
413
414 int32_t cursor = 0;
415 for (int32_t i = 0; i < n; ++i) {
416 const auto& item = items[i];
417 const llama_pos base_pos = item.start_pos;
418 const int32_t item_n = static_cast<int32_t>(item.tokens.size());
419
420 for (int32_t j = 0; j < item_n; ++j) {
421 scratch.tokens_[cursor] = item.tokens[j];
422 scratch.pos_[cursor] = base_pos + j;
423 scratch.n_seq_id_[cursor] = 1;
424 scratch.seq_id_single_[cursor] = item.seq_id;
425 scratch.seq_id_ptrs_[cursor] = &scratch.seq_id_single_[cursor];
426
427 const bool want_logits =
428 item.output_logits ? (j == item_n - 1) : false;
429 scratch.logits_[cursor] = want_logits ? int8_t{1} : int8_t{0};
430
431 ++cursor;
432 }
433 }
434
435 llama_batch batch = scratch.as_batch(total);
436
437 LLOYAL_LOG_DEBUG("[decode::scatter] Submitting %d total tokens across %d sequences", total, n);
438
439 return llama_decode(ctx, batch);
440}
441
443[[nodiscard]] inline int scatter(llama_context* ctx,
444 const std::vector<ScatterItem>& items,
445 Scratch& scratch) {
446 return scatter(ctx, items.data(), static_cast<int32_t>(items.size()), scratch);
447}
448
449// ============================================================================
450// Bin-Packing Utility
451// ============================================================================
452
  std::vector<int32_t> indices; // Indices into the original items array
  bool oversized = false;       // True → a single item whose size exceeds n_batch
};
464
480inline std::vector<PackedChunk> bin_pack(
481 const std::span<const llama_token>* items,
482 int32_t n,
483 int32_t n_batch) {
484
485 std::vector<PackedChunk> chunks;
486 int32_t chunk_total = 0;
487
488 for (int32_t i = 0; i < n; ++i) {
489 int32_t tc = static_cast<int32_t>(items[i].size());
490 if (tc == 0) continue;
491
492 if (tc > n_batch) {
493 chunks.push_back({{i}, true});
494 continue;
495 }
496
497 if (chunks.empty() || chunks.back().oversized ||
498 chunk_total + tc > n_batch) {
499 chunks.push_back({{i}, false});
500 chunk_total = tc;
501 } else {
502 chunks.back().indices.push_back(i);
503 chunk_total += tc;
504 }
505 }
506
507 return chunks;
508}
509
510} // namespace lloyal::decode
#define LLOYAL_LOG_DEBUG(...)
liblloyal - Common definitions and logging
Definition common.hpp:47
std::vector< PackedChunk > bin_pack(const std::span< const llama_token > *items, int32_t n, int32_t n_batch)
Greedy first-fit bin-packing of token spans into n_batch-sized chunks.
Definition decode.hpp:480
int one(llama_context *ctx, llama_token tok, llama_pos pos, llama_seq_id seq_id=0, bool want_logits=true)
Decode a single token into the KV cache.
Definition decode.hpp:238
int many(llama_context *ctx, const llama_token *tokens, int32_t n_tokens, int32_t n_past, int32_t n_batch, llama_seq_id seq_id=0)
Decode multiple tokens into the KV cache with auto-chunking.
Definition decode.hpp:124
int scatter(llama_context *ctx, const ScatterItem *items, int32_t n, Scratch &scratch)
Decode multiple tokens per sequence in a single llama_decode() call.
Definition decode.hpp:395
int each(llama_context *ctx, const EachItem *items, int32_t n, Scratch &scratch)
Decode one token per sequence in a single llama_decode() call.
Definition decode.hpp:339
Input item for decode::each — one token for one sequence.
Definition decode.hpp:263
llama_token token
Token to decode.
Definition decode.hpp:264
bool output_logits
Whether to compute logits after this token.
Definition decode.hpp:267
llama_seq_id seq_id
Target sequence ID.
Definition decode.hpp:266
llama_pos pos
KV cache position for this token.
Definition decode.hpp:265
A chunk of item indices produced by bin_pack()
Definition decode.hpp:460
bool oversized
True → single item exceeding n_batch.
Definition decode.hpp:462
std::vector< int32_t > indices
Indices into the original items array.
Definition decode.hpp:461
Input item for decode::scatter — multiple tokens for one sequence.
Definition decode.hpp:277
llama_pos start_pos
KV cache position for first token.
Definition decode.hpp:279
bool output_logits
When true, compute logits for last token in this run.
Definition decode.hpp:281
std::span< const llama_token > tokens
Token array (non-owning view)
Definition decode.hpp:278
llama_seq_id seq_id
Target sequence ID.
Definition decode.hpp:280
Reusable scratch buffers for multi-sequence batch construction.
Definition decode.hpp:290
std::vector< llama_seq_id > seq_id_single_
Definition decode.hpp:294
std::vector< llama_token > tokens_
Definition decode.hpp:291
std::vector< int32_t > n_seq_id_
Definition decode.hpp:293
std::vector< int8_t > logits_
Definition decode.hpp:296
void resize(int32_t n)
Definition decode.hpp:298
std::vector< llama_seq_id * > seq_id_ptrs_
Definition decode.hpp:295
llama_batch as_batch(int32_t n_tokens)
ABI-sensitive: writes llama_batch fields directly (no common_batch_* wrapper exists for external-buff...
Definition decode.hpp:309
std::vector< llama_pos > pos_
Definition decode.hpp:292