65#include <llama/llama.h>
135 return static_cast<uint16_t
>(h >>
GEN_SHIFT);
145 return (
static_cast<uint32_t
>(generation) <<
GEN_SHIFT) | index;
173 :
chain(o.chain),
has_dist(o.has_dist) { o.chain =
nullptr; }
200 sampler = o.sampler; o.sampler =
nullptr;
236template <SamplingParamsLike P>
238 using ::lloyal::detail::as_value;
240 as_value(p.temperature, 0.8f),
241 as_value(p.top_k,
static_cast<int32_t
>(40)),
242 as_value(p.top_p, 0.95f),
243 as_value(p.typical_p, 1.0f),
244 as_value(p.min_p, 0.05f),
245 as_value(p.penalty_repeat, 1.0f),
246 as_value(p.penalty_freq, 0.0f),
247 as_value(p.penalty_present, 0.0f),
248 as_value(p.penalty_last_n,
static_cast<int32_t
>(64)),
249 as_value(p.seed,
static_cast<uint32_t
>(0)),
276 llama_context*
ctx =
nullptr;
291 std::function<void(llama_token_data_array&)>
steer_fn;
400 if (initial_capacity < 2) {
401 initial_capacity = 2;
403 slots_.resize(initial_capacity);
405 slots_[0].in_use =
true;
406 slots_[0].generation = 0xFFFF;
410 for (
size_t i = initial_capacity; i-- > 1; ) {
411 freelist_.push_back(
static_cast<uint16_t
>(i));
418 if (tenancy_.
ctx !=
nullptr) {
421 for (
size_t i = 1; i < slots_.size(); ++i) {
422 if (slots_[i].in_use) {
423 free_branch_resources(slots_[i]);
452 return {handle, seq};
472 c.erase(std::remove(c.begin(), c.end(), handle), c.end());
478 cells_used_ = (unique <= cells_used_) ? cells_used_ - unique : 0;
483 free_branch_resources(*st);
488 if (freelist_.size() == slots_.size() - 1) {
513 if (tenancy_.
ctx ==
nullptr)
return;
515 for (
size_t i = 1; i < slots_.size(); ++i) {
516 if (slots_[i].in_use) {
517 free_branch_resources(slots_[i]);
518 reset_slot(slots_[i]);
521 tenancy_.
ctx =
nullptr;
536 if (!w)
throw std::runtime_error(
"retainOnly: invalid winner handle");
537 if (w->
seq_id ==
NO_LEASE)
throw std::runtime_error(
"retainOnly: winner has no lease");
540 std::vector<BranchHandle> losers;
541 for (
size_t i = 1; i < slots_.size(); ++i) {
542 if (!slots_[i].in_use)
continue;
544 if (h == winner)
continue;
547 for (
auto h : losers)
548 release_slot_only(h);
552 cells_used_ =
static_cast<uint32_t
>(w->
position);
571 if (!tenancy_.
ctx)
return {0, 0, 0};
572 uint32_t n_ctx =
static_cast<uint32_t
>(llama_n_ctx(tenancy_.
ctx));
573 uint32_t remaining = (n_ctx > cells_used_) ? n_ctx - cells_used_ : 0;
574 return { n_ctx, cells_used_, remaining };
606 static const std::vector<BranchHandle> empty;
618 return st ? st->
children.empty() :
true;
647 if (index == 0)
return nullptr;
649 if (index >= slots_.size())
return nullptr;
671 template <SamplingParamsLike P>
677 entry.
has_dist = (temperature > 0.0f);
678 sampler_chains_.emplace(h, std::move(entry));
688 if (h == 0)
return 0;
689 auto it = sampler_chains_.find(h);
690 if (it == sampler_chains_.end())
return 0;
694 entry.
has_dist = it->second.has_dist;
695 sampler_chains_.emplace(nh, std::move(entry));
704 if (h != 0) sampler_chains_.erase(h);
713 if (h == 0)
return nullptr;
714 auto it = sampler_chains_.find(h);
715 return it != sampler_chains_.end() ? it->second.chain :
nullptr;
724 if (h == 0)
return false;
725 auto it = sampler_chains_.find(h);
726 return it != sampler_chains_.end() ? it->second.has_dist :
false;
739 const char* grammar_str,
740 const char* root =
"root") {
744 grammars_.emplace(h, std::move(entry));
758 const llama_model* model,
759 const char* grammar_str,
760 const std::vector<std::string>& trigger_patterns,
761 const std::vector<llama_token>& trigger_tokens,
762 const char* root =
"root") {
766 model, grammar_str, trigger_patterns, trigger_tokens, root);
768 grammars_.emplace(h, std::move(entry));
778 if (h == 0)
return 0;
779 auto it = grammars_.find(h);
780 if (it == grammars_.end())
return 0;
784 grammars_.emplace(nh, std::move(entry));
793 if (h != 0) grammars_.erase(h);
802 if (h == 0)
return nullptr;
803 auto it = grammars_.find(h);
804 return it != grammars_.end() ? it->second.sampler :
nullptr;
825 if (h == 0)
return 0;
826 auto it = metrics_registry_.find(h);
827 if (it == metrics_registry_.end())
return 0;
829 metrics_registry_[nh] = it->second;
838 if (h != 0) metrics_registry_.erase(h);
848 auto it = metrics_registry_.find(h);
849 if (it == metrics_registry_.end())
return;
850 if (!std::isfinite(surprisal))
return;
851 it->second.model.nll_sum_nats += std::max(0.0f, surprisal);
852 it->second.model.count++;
862 auto it = metrics_registry_.find(h);
863 if (it == metrics_registry_.end())
return;
864 if (!std::isfinite(surprisal))
return;
865 it->second.sampling.nll_sum_nats += std::max(0.0f, surprisal);
866 it->second.sampling.count++;
875 if (h == 0)
return std::numeric_limits<float>::infinity();
876 auto it = metrics_registry_.find(h);
877 if (it == metrics_registry_.end() || it->second.model.count == 0)
878 return std::numeric_limits<float>::infinity();
879 return std::exp(it->second.model.nll_sum_nats /
880 static_cast<float>(it->second.model.count));
889 if (h == 0)
return std::numeric_limits<float>::infinity();
890 auto it = metrics_registry_.find(h);
891 if (it == metrics_registry_.end() || it->second.sampling.count == 0)
892 return std::numeric_limits<float>::infinity();
893 return std::exp(it->second.sampling.nll_sum_nats /
894 static_cast<float>(it->second.sampling.count));
922 if (items.empty())
return;
924 const int32_t n =
static_cast<int32_t
>(items.size());
927 std::vector<BranchState*> states(n);
928 for (int32_t i = 0; i < n; ++i) {
929 states[i] =
get(items[i].handle);
931 throw std::runtime_error(
"BranchStore::decode_each - invalid handle at index " + std::to_string(i));
933 if (i > 0 && states[i]->ctx != states[0]->ctx) {
934 throw std::runtime_error(
"BranchStore::decode_each - all branches must share the same context");
939 std::vector<decode::EachItem> decode_items(n);
940 for (int32_t i = 0; i < n; ++i) {
941 decode_items[i].token = items[i].token;
942 decode_items[i].pos = states[i]->position;
943 decode_items[i].seq_id = states[i]->seq_id;
944 decode_items[i].output_logits =
true;
948 if (
decode::each(states[0]->ctx, decode_items.data(), n, scratch_) != 0) {
949 throw std::runtime_error(
"BranchStore::decode_each - llama_decode failed");
953 llama_context* ctx = states[0]->ctx;
954 for (int32_t i = 0; i < n; ++i) {
956 if (states[i]->n_vocab <= 0) {
957 throw std::runtime_error(
"BranchStore::decode_each - invalid vocab size at index " + std::to_string(i));
959 assert(states[i]->logits_snapshot.size() >=
static_cast<size_t>(states[i]->n_vocab));
960 std::memcpy(states[i]->logits_snapshot.data(), raw_logits,
961 states[i]->n_vocab *
sizeof(
float));
962 states[i]->has_logits =
true;
963 states[i]->position += 1;
965 cells_used_ +=
static_cast<uint32_t
>(items.size());
994 if (items.empty())
return;
996 const int32_t n =
static_cast<int32_t
>(items.size());
999 std::vector<BranchState*> states(n);
1000 for (int32_t i = 0; i < n; ++i) {
1001 states[i] =
get(items[i].handle);
1003 throw std::runtime_error(
"BranchStore::decode_scatter - invalid handle at index " + std::to_string(i));
1005 if (i > 0 && states[i]->ctx != states[0]->ctx) {
1006 throw std::runtime_error(
"BranchStore::decode_scatter - all branches must share the same context");
1010 llama_context* ctx = states[0]->ctx;
1011 const int32_t batch_limit =
static_cast<int32_t
>(llama_n_batch(ctx));
1014 std::vector<std::span<const llama_token>> spans(n);
1015 for (int32_t i = 0; i < n; ++i) {
1016 spans[i] = items[i].tokens;
1021 for (
const auto& chunk : chunks) {
1022 if (chunk.oversized) {
1023 int32_t idx = chunk.indices[0];
1024 int32_t tc =
static_cast<int32_t
>(items[idx].tokens.size());
1027 states[idx]->position, batch_limit,
1028 states[idx]->seq_id) != 0) {
1029 throw std::runtime_error(
"BranchStore::decode_scatter - decode::many failed for oversized item " + std::to_string(idx));
1033 assert(states[idx]->logits_snapshot.size() >=
static_cast<size_t>(states[idx]->n_vocab));
1034 std::memcpy(states[idx]->logits_snapshot.data(), raw_logits,
1035 states[idx]->n_vocab *
sizeof(
float));
1036 states[idx]->has_logits =
true;
1037 states[idx]->position += tc;
1042 std::vector<decode::ScatterItem> scatter_items(chunk.indices.size());
1043 for (
size_t k = 0; k < chunk.indices.size(); ++k) {
1044 int32_t idx = chunk.indices[k];
1045 scatter_items[k].tokens = items[idx].tokens;
1046 scatter_items[k].start_pos = states[idx]->position;
1047 scatter_items[k].seq_id = states[idx]->seq_id;
1048 scatter_items[k].output_logits =
true;
1052 static_cast<int32_t
>(scatter_items.size()),
1054 throw std::runtime_error(
"BranchStore::decode_scatter - decode::scatter failed");
1059 for (
size_t k = 0; k < scatter_items.size(); ++k) {
1060 int32_t idx = chunk.indices[k];
1061 int32_t item_n =
static_cast<int32_t
>(scatter_items[k].tokens.size());
1063 const float* raw_logits =
logits::get(ctx, cursor + item_n - 1);
1064 assert(states[idx]->logits_snapshot.size() >=
static_cast<size_t>(states[idx]->n_vocab));
1065 std::memcpy(states[idx]->logits_snapshot.data(), raw_logits,
1066 states[idx]->n_vocab *
sizeof(
float));
1067 states[idx]->has_logits =
true;
1068 states[idx]->position +=
static_cast<int32_t
>(items[idx].tokens.size());
1075 for (int32_t i = 0; i < n; ++i) {
1076 cells_used_ +=
static_cast<uint32_t
>(items[i].tokens.size());
1102 void reset_slot(BranchState& slot) {
1103 slot.in_use =
false;
1104 slot.generation =
static_cast<uint16_t
>(slot.generation + 1);
1106 slot.model =
nullptr;
1110 slot.sampler_chain = 0;
1113 slot.cached_params = CachedSamplingParams{};
1114 slot.last_token = -1;
1115 slot.last_candidates.clear();
1116 slot.logits_snapshot.clear();
1117 slot.has_logits =
false;
1118 slot.logit_bias.clear();
1119 slot.steer_fn =
nullptr;
1120 slot.candidates_buffer.clear();
1124 slot.children.clear();
1129 if (freelist_.empty()) {
1130 size_t old_size = slots_.size();
1131 size_t new_size = old_size * 2;
1135 if (old_size >= new_size) {
1139 slots_.resize(new_size);
1140 for (
size_t i = new_size; i-- > old_size; ) {
1141 freelist_.push_back(
static_cast<uint16_t
>(i));
1144 uint16_t index = freelist_.back();
1145 freelist_.pop_back();
1146 BranchState& slot = slots_[index];
1155 BranchState* st =
get(handle);
1159 BranchState* p =
get(st->parent);
1161 auto& c = p->children;
1162 c.erase(std::remove(c.begin(), c.end(), handle), c.end());
1165 free_branch_resources(*st);
1172 std::deque<BranchState> slots_;
1173 std::vector<uint16_t> freelist_;
1178 kv::tenancy::State tenancy_;
1183 uint32_t cells_used_ = 0;
1187 decode::Scratch scratch_;
1191 std::unordered_map<SamplerChainHandle, SamplerChainEntry> sampler_chains_;
1194 std::unordered_map<GrammarHandle, GrammarEntry> grammars_;
1197 std::unordered_map<MetricsHandle, metrics::BranchMetricsState> metrics_registry_;
1224template <SamplingParamsLike P>
1227 const llama_model* model,
1229 llama_pos start_pos,
1232 const char* grammar_str =
nullptr,
1234 if (!ctx || !model) {
1239 auto [handle, seq_id] = s.
allocate();
1251 state->
model = model;
1256 const llama_vocab* vocab = llama_model_get_vocab(model);
1257 state->
n_vocab = llama_vocab_n_tokens(vocab);
1265 if (grammar_str && grammar_str[0] !=
'\0') {
1272 LLOYAL_LOG_DEBUG(
"[branch::create] Created branch handle=%u seq=%d pos=%d",
1273 handle, seq_id, start_pos);
1307 auto [new_handle, new_seq_id] = s.
allocate();
1321 dst->
seq_id = new_seq_id;
1328 assert(
kv::pos_max(src->
ctx, new_seq_id) < 0 &&
"tenancy: acquired seq must be clean");
1337 src->
children.push_back(new_handle);
1369 LLOYAL_LOG_DEBUG(
"[branch::fork] Forked handle=%u -> handle=%u seq=%d->%d",
1370 source, new_handle, src->
seq_id, new_seq_id);
1397 const llama_logit_bias* biases,
1404 throw std::runtime_error(
"set_logit_bias: invalid branch handle");
1408 state->
logit_bias.assign(biases, biases + n_biases);
1428 throw std::runtime_error(
"clear_logit_bias: invalid branch handle");
1433 LLOYAL_LOG_DEBUG(
"[branch::clear_logit_bias] Cleared biases on handle=%u", handle);
1464 std::function<
void(llama_token_data_array&)> steer_fn,
1470 throw std::runtime_error(
"set_steer: invalid branch handle");
1473 state->
steer_fn = std::move(steer_fn);
1475 LLOYAL_LOG_DEBUG(
"[branch::set_steer] Set steer callback on handle=%u", handle);
1492 throw std::runtime_error(
"clear_steer: invalid branch handle");
1497 LLOYAL_LOG_DEBUG(
"[branch::clear_steer] Cleared steer callback on handle=%u", handle);
1515template <SamplingParamsLike P>
1519 throw std::runtime_error(
"set_sampler_params: invalid branch handle");
1536 LLOYAL_LOG_DEBUG(
"[branch::set_sampler_params] Rebuilt chain on handle=%u temp=%.3f",
1554 const llama_model* model,
1555 const char* grammar_str,
1559 throw std::runtime_error(
"set_grammar: invalid branch handle");
1569 if (grammar_str && grammar_str[0] !=
'\0') {
1574 state->
grammar != 0 ?
"Set" :
"Cleared", handle);
1593 const llama_model* model,
1594 const char* grammar_str,
1595 const std::vector<std::string>& trigger_patterns,
1596 const std::vector<llama_token>& trigger_tokens,
1600 throw std::runtime_error(
"set_grammar_lazy: invalid branch handle");
1608 if (grammar_str && grammar_str[0] !=
'\0') {
1610 model, grammar_str, trigger_patterns, trigger_tokens);
1614 state->
grammar != 0 ?
"Set" :
"Cleared", handle);
1631 throw std::runtime_error(
"prune: RESTRICT — branch has children. Use pruneSubtree() for CASCADE.");
1645 std::vector<BranchHandle> stack{h}, post_order;
1646 while (!stack.empty()) {
1648 post_order.push_back(cur);
1650 if (st)
for (
auto child : st->
children) stack.push_back(child);
1652 for (
auto it = post_order.rbegin(); it != post_order.rend(); ++it)
1680 throw std::runtime_error(
"force_snapshot_logits: invalid branch handle");
1687 throw std::runtime_error(
"force_snapshot_logits: invalid vocab size");
1691 state->
n_vocab *
sizeof(
float));
1713 const llama_token* tokens,
1720 throw std::runtime_error(
"prefill: invalid branch handle");
1726 throw std::runtime_error(
"prefill: llama_decode failed");
1729 state->
position +=
static_cast<llama_pos
>(n_tokens);
1736 throw std::runtime_error(
"prefill: invalid vocab size");
1740 state->
n_vocab *
sizeof(
float));
1764 throw std::runtime_error(
"step: invalid branch handle");
1768 throw std::runtime_error(
"step: llama_decode failed");
1777 throw std::runtime_error(
"step: invalid vocab size");
1781 state->
n_vocab *
sizeof(
float));
1826 if (!state || !chain) {
1832 LLOYAL_LOG_DEBUG(
"[branch::sample] No logits captured - call prefill()/step() first");
1837 for (
int i = 0; i < state->
n_vocab; i++) {
1839 static_cast<llama_token
>(i),
1844 llama_token_data_array cur_p = {
1846 static_cast<size_t>(state->
n_vocab),
1860 if (bias.token >= 0 && bias.token < state->
n_vocab) {
1861 cur_p.data[bias.token].logit += bias.bias;
1870 }
catch (
const std::exception& e) {
1879 if (cur_p.selected == -1) {
1883 llama_token token = cur_p.data[cur_p.selected].id;
1891 for (
size_t i = 0; i < cur_p.size; i++) {
1946 std::vector<float> candidate_logits;
1947 std::vector<int32_t> candidate_ids;
1952 candidate_logits.push_back(cand.logit);
1953 candidate_ids.push_back(cand.id);
1958 candidate_logits.data(),
1959 candidate_ids.data(),
1960 static_cast<int>(candidate_logits.size()),
1990 if (!state || !gram)
return;
1993 std::vector<llama_token_data>* candidates_ptr;
1994 std::vector<llama_token_data> temp_buffer;
1999 temp_buffer.resize(n_vocab);
2000 candidates_ptr = &temp_buffer;
2003 auto& candidates = *candidates_ptr;
2004 for (
int i = 0; i < n_vocab; i++) {
2005 candidates[i] = llama_token_data{
2006 static_cast<llama_token
>(i), logits[i], 0.0f};
2009 llama_token_data_array cur_p = {
2011 static_cast<size_t>(n_vocab),
2018 for (
int i = 0; i < n_vocab; i++) {
2019 logits[i] = candidates[i].logit;
2048 for (
int i = 0; i < state->
n_vocab; i++) {
2050 static_cast<llama_token
>(i),
2055 llama_token_data_array cur_p = {
2057 static_cast<size_t>(state->
n_vocab),
2069 std::vector<std::pair<llama_token, float>> legal_priors;
2070 float max_logit = -std::numeric_limits<float>::infinity();
2072 for (
size_t i = 0; i < cur_p.size; i++) {
2073 if (std::isfinite(cur_p.data[i].logit)) {
2074 legal_priors.emplace_back(cur_p.data[i].id, cur_p.data[i].logit);
2075 if (cur_p.data[i].logit > max_logit) {
2076 max_logit = cur_p.data[i].logit;
2081 if (legal_priors.empty()) {
2086 float sum_exp = 0.0f;
2087 for (
auto& [token, logit] : legal_priors) {
2088 float exp_val = std::exp(logit - max_logit);
2094 for (
auto& [token, prob] : legal_priors) {
2098 return legal_priors;
2121 return -std::numeric_limits<float>::infinity();
2125 for (
int i = 0; i < state->
n_vocab; i++) {
2127 static_cast<llama_token
>(i),
2132 llama_token_data_array cur_p = {
2134 static_cast<size_t>(state->
n_vocab),
2145 float max_logit = -std::numeric_limits<float>::infinity();
2146 for (
size_t i = 0; i < cur_p.size; i++) {
2147 if (std::isfinite(cur_p.data[i].logit) && cur_p.data[i].logit > max_logit) {
2148 max_logit = cur_p.data[i].logit;
2152 if (!std::isfinite(max_logit)) {
2153 return -std::numeric_limits<float>::infinity();
2156 float sum_exp = 0.0f;
2157 for (
size_t i = 0; i < cur_p.size; i++) {
2158 if (std::isfinite(cur_p.data[i].logit)) {
2159 sum_exp += std::exp(cur_p.data[i].logit - max_logit);
2163 return max_logit + std::log(sum_exp);
2184 if (!state || token < 0 || token >= state->
n_vocab) {
2195 llama_token_data single_candidate = {
2201 llama_token_data_array cur_p = {
2211 return std::isfinite(single_candidate.logit);
2241 return std::exp(logit - logsumexp);
2282 return state ? state->
position : -1;
2310 if (!state || state->
metrics == 0) {
2311 return std::numeric_limits<float>::infinity();
2330 if (!state || state->
metrics == 0) {
2331 return std::numeric_limits<float>::infinity();
2355 std::vector<float> candidate_logits;
2356 std::vector<int32_t> candidate_ids;
2361 candidate_logits.push_back(cand.logit);
2362 candidate_ids.push_back(cand.id);
2367 candidate_logits.data(),
2368 candidate_ids.data(),
2369 static_cast<int>(candidate_logits.size()),
2374 return std::exp(-surprisal);
2386 return state ? state->
n_vocab : 0;
2414 : store_(store), handle_(
handle) {}
2424 : store_(other.store_), handle_(other.handle_) {
2429 if (
this != &other) {
2433 store_ = other.store_;
2434 handle_ = other.handle_;
2444 template <SamplingParamsLike P>
2447 const llama_model* model,
2449 llama_pos start_pos,
2452 const char* grammar_str =
nullptr,
2455 return Branch(&store, h);
2461 return Branch(store_, h);
2485 void prefill(
const llama_token* tokens,
size_t n) {
2522 template <SamplingParamsLike P>
2555 static const std::vector<BranchHandle> empty;
2556 return store_ ? store_->
children(handle_) : empty;
Stub BoundaryTracker - does nothing.
virtual std::unique_ptr< BoundaryTracker > clone() const
Handle table and batched decode orchestrator for branch management.
GrammarHandle create_grammar_lazy(const llama_model *model, const char *grammar_str, const std::vector< std::string > &trigger_patterns, const std::vector< llama_token > &trigger_tokens, const char *root="root")
Create a lazy grammar (unconstrained until trigger fires)
float get_sampling_ppl(MetricsHandle h) const
Get sampling-level perplexity from a metrics tracker.
bool isActive(BranchHandle h) const
Test whether a branch holds a KV lease.
void free_grammar(GrammarHandle h)
Free a grammar.
GrammarHandle create_grammar(const llama_model *model, const char *grammar_str, const char *root="root")
Create a grammar sampler and register it.
MetricsHandle clone_metrics(MetricsHandle h)
Clone a metrics tracker (for fork)
size_t available() const
Number of vacant seq_ids available for acquisition.
void release(BranchHandle handle)
Release a branch slot + evict its KV lease.
void drain()
Explicit teardown — evict all leases while context is alive.
float get_model_ppl(MetricsHandle h) const
Get model-level perplexity from a metrics tracker.
bool isLeaf(BranchHandle h) const
Test whether a branch is a leaf (no children)
void decode_each(std::span< const DecodeEachItem > items)
Decode one token per branch in a single GPU dispatch.
bool sampler_has_dist(SamplerChainHandle h) const
Check if a sampler chain ends with dist (stochastic) or greedy.
MetricsHandle create_metrics()
Create a metrics tracker and register it.
~BranchStore()
Destructor — frees CPU resources.
SamplerChainHandle clone_sampler(SamplerChainHandle h)
Clone a sampler chain (for fork)
llama_pos fork_head(BranchHandle h) const
Get a branch's fork head (parent position at fork time)
Allocation allocate()
Allocate a branch slot + KV lease atomically.
void add_cells_used(uint32_t n)
Increment cells_used counter (for standalone prefill/step outside BranchStore methods)
const BranchState * get(BranchHandle handle) const
Look up branch state by handle.
void free_sampler(SamplerChainHandle h)
Free a sampler chain.
void retainOnly(BranchHandle winner)
Keep only the winner — nuclear KV + CPU cleanup.
KvPressure kv_pressure() const
KV cache pressure snapshot — O(1), no tree walking.
SamplerChainHandle create_sampler(const P &params)
Create a sampler chain and register it.
llama_sampler * get_grammar_sampler(GrammarHandle h) const
Dereference a grammar handle (non-owning)
void add_sampling_surprisal(MetricsHandle h, float surprisal)
Add sampling-level surprisal to a metrics tracker.
void init_tenancy(llama_context *ctx)
Initialize KV tenancy after context creation.
GrammarHandle clone_grammar(GrammarHandle h)
Clone a grammar (for fork)
void decode_scatter(std::span< const DecodeScatterItem > items)
Decode variable token counts per branch with auto-chunking.
BranchStore(size_t initial_capacity=16)
Construct a branch store with initial slot capacity.
void free_metrics(MetricsHandle h)
Free a metrics tracker.
const std::vector< BranchHandle > & children(BranchHandle h) const
Get a branch's child handles.
BranchState * get(BranchHandle handle)
Look up branch state by handle.
BranchHandle parent(BranchHandle h) const
Get a branch's parent handle.
void add_model_surprisal(MetricsHandle h, float surprisal)
Add model-level surprisal to a metrics tracker.
llama_sampler * get_sampler_chain(SamplerChainHandle h) const
Dereference a sampler chain handle (non-owning)
~Branch()
Destructor — CASCADE prunes entire subtree.
void accept(llama_token token)
Accept a token — advance grammar, penalty window, and metrics.
Branch & operator=(const Branch &)=delete
int n_vocab() const
Vocabulary size.
void setGrammar(const char *grammar_str)
Replace grammar constraint (nullptr/empty to remove)
const std::vector< BranchHandle > & childHandles() const
Child branch handles (empty if leaf)
void setSamplerParams(const P &params)
Replace sampler chain with new parameters (memoized)
BranchHandle handle() const
Underlying opaque handle (for interop with free functions)
const float * logits() const
Get the branch's captured logits snapshot.
bool is_eog(llama_token token) const
Check if a token is end-of-generation for this branch's model.
BranchHandle parentHandle() const
Parent branch handle, or INVALID_HANDLE if root.
Branch(BranchStore *store, BranchHandle handle)
llama_pos position() const
Current decode position (token count)
llama_token sample()
Sample a token from captured logits.
void step(llama_token token)
Decode one token and capture logits (generation step)
void pruneSubtree()
CASCADE prune — removes entire subtree.
float perplexity() const
Model-level perplexity (from raw logits, pre-filter)
Branch(const Branch &)=delete
bool isActive() const
True if this branch holds a KV lease.
llama_pos forkHead() const
Parent's position at fork time (0 for root branches)
void force_snapshot_logits()
Force-copy shared logits buffer into this branch's snapshot.
bool isLeaf() const
True if this branch has no children.
void prefill(const llama_token *tokens, size_t n)
Decode multiple tokens and capture logits atomically (prompt injection)
bool valid() const
True if this Branch holds a valid handle.
void prune()
RESTRICT prune (throws if children exist)
Branch fork()
Fork: allocates slot + lease, records topology edge.
Branch & operator=(Branch &&other) noexcept
static Branch create(llama_context *ctx, const llama_model *model, BranchStore &store, llama_pos start_pos, const P &params, int n_batch=DEFAULT_N_BATCH, const char *grammar_str=nullptr, boundaries::BoundaryTracker *boundary_tracker=nullptr)
Factory: allocates slot + lease from store.
Branch(Branch &&other) noexcept
#define LLOYAL_LOG_DEBUG(...)
liblloyal - Common definitions and logging
Batch Decoding Operations.
Grammar-Constrained Sampling.
constexpr llama_seq_id NO_LEASE
Sentinel value indicating a branch has no KV residency.
Zero-copy logits access with clear lifetime semantics.
Distribution Metrics for Test-Time Alignment.
void prefill(BranchHandle handle, const llama_token *tokens, size_t n_tokens, BranchStore &s)
Decode multiple tokens and capture logits atomically (prompt prefill)
void set_logit_bias(BranchHandle handle, const llama_logit_bias *biases, size_t n_biases, BranchStore &s)
float get_perplexity(BranchHandle handle, BranchStore &s)
Get model-level perplexity (from raw logits)
void apply_grammar(BranchHandle handle, float *logits, int n_vocab, BranchStore &s)
Apply grammar constraints to an external logits buffer.
int32_t GrammarHandle
Handle to a grammar sampler in BranchStore's registry (0 = invalid/none)
uint32_t BranchHandle
Opaque handle to a branch slot.
void step(BranchHandle handle, llama_token token, BranchStore &s)
Decode a single token and capture logits (generation step)
constexpr int DEFAULT_N_BATCH
Default batch size for decode operations.
void set_grammar_lazy(BranchHandle handle, const llama_model *model, const char *grammar_str, const std::vector< std::string > &trigger_patterns, const std::vector< llama_token > &trigger_tokens, BranchStore &s)
Set lazy grammar on a branch (unconstrained until trigger fires)
void set_steer(BranchHandle handle, std::function< void(llama_token_data_array &)> steer_fn, BranchStore &s)
constexpr uint32_t GEN_SHIFT
Bit shift for generation field.
void accept_token(BranchHandle handle, llama_token token, BranchStore &s)
Accept a sampled token, advancing grammar and sampler state.
void force_snapshot_logits(BranchHandle handle, BranchStore &s)
Force-copy the shared llama.cpp logits buffer into this branch's private snapshot.
llama_pos get_fork_head(BranchHandle handle, BranchStore &s)
Get the branch's fork head (parent position at fork time)
float get_sampling_perplexity(BranchHandle handle, BranchStore &s)
Get sampling-level perplexity (from filtered distribution)
float get_legal_logsumexp(BranchHandle handle, BranchStore &s)
Compute log-sum-exp over grammar-legal logits.
void clear_logit_bias(BranchHandle handle, BranchStore &s)
Clear all logit biases from a branch.
uint16_t handle_generation(BranchHandle h)
Extract generation counter from a branch handle.
const float * get_logits(BranchHandle handle, BranchStore &s)
Get the branch's captured logits snapshot.
void set_sampler_params(BranchHandle handle, const P &params, BranchStore &s)
Replace a branch's sampler chain with new parameters.
void prune(BranchHandle handle, BranchStore &s)
Prune a leaf branch (RESTRICT — throws if children exist)
BranchHandle create(llama_context *ctx, const llama_model *model, BranchStore &s, llama_pos start_pos, const P &params, int n_batch=DEFAULT_N_BATCH, const char *grammar_str=nullptr, boundaries::BoundaryTracker *boundary_tracker=nullptr)
Create a new branch with sampler chain, optional grammar, and metrics.
CachedSamplingParams snapshot_params(const P &p)
Snapshot sampling params for memoization comparison.
constexpr uint32_t INDEX_MASK
Mask for slot index field.
BranchHandle fork(BranchHandle source, BranchStore &s)
Fork a branch into a new independent sequence.
int get_n_vocab(BranchHandle handle, BranchStore &s)
Get the branch's vocabulary size.
llama_token sample(BranchHandle handle, BranchStore &s)
Sample a token from the branch's captured logits.
void pruneSubtree(BranchHandle h, BranchStore &s)
Prune a branch and all descendants (CASCADE — iterative post-order)
constexpr BranchHandle INVALID_HANDLE
Null handle sentinel.
float get_token_prior(BranchHandle handle, llama_token token, float logsumexp, BranchStore &s)
Compute prior probability for a token, checking grammar legality first.
constexpr llama_seq_id NO_LEASE
Branch has no KV residency.
int32_t MetricsHandle
Handle to a metrics tracker in BranchStore's registry (0 = invalid/none)
llama_pos get_position(BranchHandle handle, BranchStore &s)
Get the branch's current decode position.
void set_grammar(BranchHandle handle, const llama_model *model, const char *grammar_str, BranchStore &s)
Replace a branch's grammar constraint.
void clear_steer(BranchHandle handle, BranchStore &s)
Clear the steer callback from a branch.
float get_token_prior_assume_legal(BranchHandle handle, llama_token token, float logsumexp, BranchStore &s)
Compute prior probability for a token known to be grammar-legal.
BranchHandle make_handle(uint16_t index, uint16_t generation)
Construct a branch handle from index and generation.
uint16_t handle_index(BranchHandle h)
Extract slot index from a branch handle.
int32_t SamplerChainHandle
Handle to a sampler chain in BranchStore's registry (0 = invalid/none)
std::vector< std::pair< llama_token, float > > get_legal_priors(BranchHandle handle, BranchStore &s)
Get grammar-legal tokens with renormalized probabilities.
bool is_token_legal(BranchHandle handle, llama_token token, BranchStore &s)
Check if a token is legal under grammar constraints.
float get_last_sampling_prior(BranchHandle handle, BranchStore &s)
Get the last sampled token's prior from the filtered distribution.
std::vector< PackedChunk > bin_pack(const std::span< const llama_token > *items, int32_t n, int32_t n_batch)
Greedy first-fit bin-packing of token spans into n_batch-sized chunks.
int one(llama_context *ctx, llama_token tok, llama_pos pos, llama_seq_id seq_id=0, bool want_logits=true)
Decode a single token into the KV cache.
int many(llama_context *ctx, const llama_token *tokens, int32_t n_tokens, int32_t n_past, int32_t n_batch, llama_seq_id seq_id=0)
Decode multiple tokens into the KV cache with auto-chunking.
int scatter(llama_context *ctx, const ScatterItem *items, int32_t n, Scratch &scratch)
Decode multiple tokens per sequence in a single llama_decode() call.
int each(llama_context *ctx, const EachItem *items, int32_t n, Scratch &scratch)
Decode one token per sequence in a single llama_decode() call.
constexpr T as_value(const X &x, T def)
Extract value from either T or std::optional<T> with fallback.
void free_sampler(llama_sampler *smpl)
Free a grammar sampler.
llama_sampler * clone_sampler(llama_sampler *smpl)
Clone a grammar sampler (for fork/branching).
llama_sampler * init_sampler(const llama_model *model, const std::string &grammar_str, const std::string &root_rule="root")
Initialize a grammar sampler from GBNF grammar string.
llama_sampler * init_lazy_sampler(const llama_model *model, const std::string &grammar_str, const std::vector< std::string > &trigger_patterns, const std::vector< llama_token > &trigger_tokens, const std::string &root_rule="root")
Initialize a lazy grammar sampler from GBNF grammar string.
void accept(llama_sampler *smpl, llama_token token)
Accept a token into grammar state.
void apply(llama_sampler *smpl, llama_token_data_array *cur_p)
Apply grammar constraint to candidates.
void evict_all(State &s)
Evict every leased seq_id.
llama_seq_id acquire(State &s)
Acquire a seq_id from the vacant pool.
size_t available(const State &s)
Number of vacant seq_ids available for acquisition.
void evict(State &s, llama_seq_id seq)
Evict a seq_id — strip all KV tags then release.
void retain(State &s, llama_seq_id keep)
Nuclear retain — keep one seq, rebuild vacancy from scratch.
State init(llama_context *ctx, llama_seq_id n_seq_max)
Initialize tenancy with all seq_ids vacant.
void release(State &s, llama_seq_id seq)
Release a seq_id back to vacant — bookkeeping only, no KV calls.
void seq_cp(llama_context *ctx, llama_seq_id src, llama_seq_id dst, llama_pos p0=0, llama_pos p1=-1)
Copy KV cache from one sequence to another.
llama_pos pos_max(llama_context *ctx, llama_seq_id seq)
Get maximum position in KV cache sequence.
float * get(llama_context *ctx, int32_t index=-1)
float sampling_surprisal(const float *candidate_logits, const int32_t *candidate_ids, int n_candidates, int picked_id, Base base=Base::Nats)
Compute sampling-level surprisal for picked token.
float model_surprisal(const float *logits, int n_vocab, int picked_id, Base base=Base::Nats)
Compute model-level surprisal for picked token.
void apply(llama_sampler *chain, llama_token_data_array *cur_p)
Apply a sampler chain to a candidate array.
void accept(llama_sampler *chain, llama_token token)
Accept a token into the sampler chain.
llama_sampler * clone_chain(llama_sampler *chain)
Clone a sampler chain.
llama_sampler * create_chain(const P &params)
Create a persistent sampler chain from parameters.
void free_chain(llama_sampler *chain)
Free a sampler chain.
bool is_eog(const llama_vocab *vocab, llama_token token)
Check if token is end-of-generation marker.
Token Sampling Operations.
Consolidated mutable state for a single branch.
bool in_use
True when slot is allocated to an active branch.
const llama_model * model
Llama model (not owned, must outlive branch)
bool has_logits
True only after force_snapshot_logits(), prefill(), or step()
std::function< void(llama_token_data_array &)> steer_fn
Dynamic logit callback, NOT cloned on fork.
int n_batch
Batch size for decode operations.
llama_seq_id seq_id
KV cache sequence identifier (NO_LEASE when inactive)
llama_pos position
Current decode position in the sequence.
std::vector< llama_logit_bias > logit_bias
Static token biases, cloned on fork.
std::vector< llama_token_data > last_candidates
Filtered candidates from last sample()
std::vector< float > logits_snapshot
Captured logit distribution (n_vocab floats)
std::vector< llama_token_data > candidates_buffer
Reusable scratch buffer for sampling (avoids O(n_vocab) allocs per sample call).
boundaries::BoundaryTracker * boundary_tracker
Token boundary detector (owned, optional)
SamplerChainHandle sampler_chain
Handle into BranchStore's sampler registry.
llama_context * ctx
Llama context (not owned, must outlive branch)
BranchHandle parent
Parent branch (INVALID_HANDLE if root)
int n_vocab
Vocabulary size (cached for buffer pre-allocation)
GrammarHandle grammar
Handle into BranchStore's grammar registry.
std::vector< BranchHandle > children
Child branches forked from this one.
llama_token last_token
Last token returned by sample()
MetricsHandle metrics
Handle into BranchStore's metrics registry.
llama_pos fork_head
Parent's position at fork time (0 for root branches)
CachedSamplingParams cached_params
Params used to create current chain (for memoization)
uint16_t generation
Slot generation counter (for ABA prevention)
Result of allocate(): a slot handle + its leased seq_id.
Concrete sampling params snapshot for memoization.
bool operator==(const CachedSamplingParams &) const =default
Item for decode_each: one token per branch.
Item for decode_scatter: variable tokens per branch.
std::span< const llama_token > tokens
RAII entry for a grammar sampler in the registry.
GrammarEntry(GrammarEntry &&o) noexcept
GrammarEntry & operator=(const GrammarEntry &)=delete
GrammarEntry & operator=(GrammarEntry &&o) noexcept
GrammarEntry(const GrammarEntry &)=delete
Snapshot of KV cache pressure from BranchStore.
uint32_t remaining
n_ctx - cells_used (clamped to 0)
uint32_t n_ctx
Total KV capacity.
uint32_t cells_used
Cells allocated since last reset.
RAII entry for a sampler chain in the registry.
SamplerChainEntry()=default
SamplerChainEntry(const SamplerChainEntry &)=delete
SamplerChainEntry & operator=(SamplerChainEntry &&o) noexcept
SamplerChainEntry(SamplerChainEntry &&o) noexcept
bool has_dist
True if chain ends with dist (temp > 0), false if greedy.
SamplerChainEntry & operator=(const SamplerChainEntry &)=delete
llama_context * ctx
Context for KV operations (nullptr after drain)
Unified model + sampling perplexity tracker.