Branch Primitive for Tree Search and Multi-Sequence Generation. More...

#include "boundaries.hpp"
#include "common.hpp"
#include "decode.hpp"
#include "grammar.hpp"
#include "kv.hpp"
#include "logits.hpp"
#include "metrics.hpp"
#include "sampler.hpp"
#include <llama/llama.h>
#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstdint>
#include <cstring>
#include <ctime>
#include <deque>
#include <functional>
#include <limits>
#include <mutex>
#include <span>
#include <stdexcept>
#include <string>
#include <utility>
#include <vector>

Classes
struct	lloyal::branch::KvPressure
	Snapshot of KV cache pressure from BranchStore. More...

struct	lloyal::branch::SamplerChainEntry
	RAII entry for a sampler chain in the registry. More...

struct	lloyal::branch::GrammarEntry
	RAII entry for a grammar sampler in the registry. More...

struct	lloyal::branch::CachedSamplingParams
	Concrete sampling params snapshot for memoization. More...

struct	lloyal::branch::BranchState
	Consolidated mutable state for a single branch. More...

struct	lloyal::branch::DecodeEachItem
	Item for decode_each: one token per branch. More...

struct	lloyal::branch::DecodeScatterItem
	Item for decode_scatter: variable tokens per branch. More...

class	lloyal::branch::BranchStore
	Handle table and batched decode orchestrator for branch management. More...

struct	lloyal::branch::BranchStore::Allocation
	Result of allocate(): a slot handle + its leased seq_id. More...

class	lloyal::branch::Branch

Namespaces
namespace	lloyal
	Boundary Tracker Stub for OSS liblloyal.

namespace	lloyal::branch

Typedefs
using	lloyal::branch::BranchHandle = uint32_t
	Opaque handle to a branch slot.

using	lloyal::branch::SamplerChainHandle = int32_t
	Handle to a sampler chain in BranchStore's registry (0 = invalid/none)

using	lloyal::branch::GrammarHandle = int32_t
	Handle to a grammar sampler in BranchStore's registry (0 = invalid/none)

using	lloyal::branch::MetricsHandle = int32_t
	Handle to a metrics tracker in BranchStore's registry (0 = invalid/none)

Functions
uint16_t	lloyal::branch::handle_index (BranchHandle h)
	Extract slot index from a branch handle.

uint16_t	lloyal::branch::handle_generation (BranchHandle h)
	Extract generation counter from a branch handle.

BranchHandle	lloyal::branch::make_handle (uint16_t index, uint16_t generation)
	Construct a branch handle from index and generation.

template<SamplingParamsLike P>
CachedSamplingParams	lloyal::branch::snapshot_params (const P &p)
	Snapshot sampling params for memoization comparison.

template<SamplingParamsLike P>
BranchHandle	lloyal::branch::create (llama_context ctx, const llama_model model, BranchStore &s, llama_pos start_pos, const P &params, int n_batch=DEFAULT_N_BATCH, const char grammar_str=nullptr, boundaries::BoundaryTracker boundary_tracker=nullptr)
	Create a new branch with sampler chain, optional grammar, and metrics.

BranchHandle	lloyal::branch::fork (BranchHandle source, BranchStore &s)
	Fork a branch into a new independent sequence.

void	lloyal::branch::set_logit_bias (BranchHandle handle, const llama_logit_bias *biases, size_t n_biases, BranchStore &s)

void	lloyal::branch::clear_logit_bias (BranchHandle handle, BranchStore &s)
	Clear all logit biases from a branch.

void	lloyal::branch::set_steer (BranchHandle handle, std::function< void(llama_token_data_array &)> steer_fn, BranchStore &s)

void	lloyal::branch::clear_steer (BranchHandle handle, BranchStore &s)
	Clear the steer callback from a branch.

template<SamplingParamsLike P>
void	lloyal::branch::set_sampler_params (BranchHandle handle, const P &params, BranchStore &s)
	Replace a branch's sampler chain with new parameters.

void	lloyal::branch::set_grammar (BranchHandle handle, const llama_model model, const char grammar_str, BranchStore &s)
	Replace a branch's grammar constraint.

void	lloyal::branch::set_grammar_lazy (BranchHandle handle, const llama_model model, const char grammar_str, const std::vector< std::string > &trigger_patterns, const std::vector< llama_token > &trigger_tokens, BranchStore &s)
	Set lazy grammar on a branch (unconstrained until trigger fires)

void	lloyal::branch::prune (BranchHandle handle, BranchStore &s)
	Prune a leaf branch (RESTRICT — throws if children exist)

void	lloyal::branch::pruneSubtree (BranchHandle h, BranchStore &s)
	Prune a branch and all descendants (CASCADE — iterative post-order)

void	lloyal::branch::force_snapshot_logits (BranchHandle handle, BranchStore &s)
	Force-copy the shared llama.cpp logits buffer into this branch's private snapshot.

void	lloyal::branch::prefill (BranchHandle handle, const llama_token *tokens, size_t n_tokens, BranchStore &s)
	Decode multiple tokens and capture logits atomically (prompt prefill)

void	lloyal::branch::step (BranchHandle handle, llama_token token, BranchStore &s)
	Decode a single token and capture logits (generation step)

const float *	lloyal::branch::get_logits (BranchHandle handle, BranchStore &s)
	Get the branch's captured logits snapshot.

void	lloyal::branch::set_logits (BranchHandle handle, std::span< const float > data, BranchStore &s)
	Overwrite a branch's logits_snapshot with caller-provided values.

llama_token	lloyal::branch::sample (BranchHandle handle, BranchStore &s)
	Sample a token from the branch's captured logits.

void	lloyal::branch::accept_token (BranchHandle handle, llama_token token, BranchStore &s)
	Accept a sampled token, advancing grammar and sampler state.

void	lloyal::branch::apply_grammar (BranchHandle handle, float *logits, int n_vocab, BranchStore &s)
	Apply grammar constraints to an external logits buffer.

std::vector< std::pair< llama_token, float > >	lloyal::branch::get_legal_priors (BranchHandle handle, BranchStore &s)
	Get grammar-legal tokens with renormalized probabilities.

float	lloyal::branch::get_legal_logsumexp (BranchHandle handle, BranchStore &s)
	Compute log-sum-exp over grammar-legal logits.

bool	lloyal::branch::is_token_legal (BranchHandle handle, llama_token token, BranchStore &s)
	Check if a token is legal under grammar constraints.

float	lloyal::branch::get_token_prior_assume_legal (BranchHandle handle, llama_token token, float logsumexp, BranchStore &s)
	Compute prior probability for a token known to be grammar-legal.

float	lloyal::branch::get_token_prior (BranchHandle handle, llama_token token, float logsumexp, BranchStore &s)
	Compute prior probability for a token, checking grammar legality first.

llama_pos	lloyal::branch::get_position (BranchHandle handle, BranchStore &s)
	Get the branch's current decode position.

llama_pos	lloyal::branch::get_fork_head (BranchHandle handle, BranchStore &s)
	Get the branch's fork head (parent position at fork time)

float	lloyal::branch::get_perplexity (BranchHandle handle, BranchStore &s)
	Get model-level perplexity (from raw logits)

float	lloyal::branch::get_sampling_perplexity (BranchHandle handle, BranchStore &s)
	Get sampling-level perplexity (from filtered distribution)

float	lloyal::branch::get_last_sampling_prior (BranchHandle handle, BranchStore &s)
	Get the last sampled token's prior from the filtered distribution.

int	lloyal::branch::get_n_vocab (BranchHandle handle, BranchStore &s)
	Get the branch's vocabulary size.

Variables
constexpr BranchHandle	lloyal::branch::INVALID_HANDLE = 0
	Null handle sentinel.

constexpr llama_seq_id	lloyal::branch::NO_LEASE = kv::NO_LEASE
	Branch has no KV residency.

constexpr int	lloyal::branch::DEFAULT_N_BATCH = 512
	Default batch size for decode operations.

constexpr uint32_t	lloyal::branch::GEN_SHIFT = 16
	Bit shift for generation field.

constexpr uint32_t	lloyal::branch::INDEX_MASK = 0xFFFF
	Mask for slot index field.

Detailed Description

Branch Primitive for Tree Search and Multi-Sequence Generation.

Consolidates all forkable state into a single handle-based API:

KV cache sequence (via kv::seq_cp)
Sampler chain (penalties, PRNG, top-k/p filters)
Grammar constraints (GBNF parser state)
Metrics (model + sampling perplexity trackers)
Logits snapshot (captured distribution for deferred sampling)
Logit bias (static token-level adjustments)

Handle design:

Handle = (generation << 16) | index
Generation counter prevents ABA bugs on slot reuse
Freelist enables branch pooling without malloc churn

KV cache constraint:

Each live branch occupies one llama_seq_id in the KV cache
Max simultaneous branches = llama_n_seq_max(ctx) (<= 256)
Slot table (65535) > n_seq_max — slots are abundant, leases are scarce
kv::tenancy manages seq_id lifecycle (allocate/evict/retain/drain)

Fork semantics:

fork() clones: KV, grammar, sampler chain, metrics, logits, logit_bias
fork() does NOT clone: steer callback (captures references, unsafe to copy)
Each branch is fully independent after fork

These primitives compose into inference patterns including:

Best-of-N sampling (fork N candidates, select by perplexity)
Speculative decoding (draft/verify with branch fork/prune)
Tree search (expand/backup/select with grammar priors)
Beam search (top-k branches at each step)

Definition in file branch.hpp.

Classes

Namespaces

Typedefs

Functions

Variables

Detailed Description