liblloyal 1.0.0
Composable primitives for llama.cpp inference
include/lloyal/kv.hpp

Clear KV cache and reconstruct with anchor + tail tokens.

Reconstructs the KV cache with contiguous positions by:

  1. Clearing entire KV cache
  2. Re-decoding original_sinks (anchor tokens) at position 0
  3. Re-decoding tail (recent tokens) at position sinks.size()

This keeps positions contiguous ([0, 1, 2, ...]), which is simpler and more reliable than selectively removing ranges and leaving position gaps.

Parameters
ctx: Llama context (must not be null)
original_sinks: Anchor tokens from the sequence start (typically 4)
tail: Recent tokens to preserve (typically 252, for a total of 256 with the sinks)
n_batch: Batch size for re-decoding chunks
Exceptions
std::runtime_error: if parameters are invalid or re-decoding fails
Warning
CRITICAL: original_sinks MUST be the ORIGINAL first N tokens from the sequence start. Passing a different "first N" on each reseed degrades quality, because attention-sink patterns depend on the original anchor tokens.
Note
After calling, the KV cache holds sinks.size() + tail.size() tokens. Continue generation with n_past = static_cast<int32_t>(sinks.size() + tail.size()).

// Capture the original anchor tokens once
std::vector<llama_token> SINKS(tokens.begin(), tokens.begin() + 4);

// Each compression: reuse the SAME anchors with the current tail
auto tail = std::vector<llama_token>(tokens.end() - 252, tokens.end());
kv::clear_and_reseed(ctx, SINKS, tail, n_batch);
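
To continue generating after the reseed (per the Note above), a minimal sketch follows; next_token is a hypothetical placeholder for whatever your sampling loop produced, and the vector overload of decoder::decode_tokens is the same one clear_and_reseed uses internally:

// Continue from the compressed cache
int32_t n_past = static_cast<int32_t>(SINKS.size() + tail.size());
std::vector<llama_token> next{next_token}; // next_token: hypothetical, produced by your sampler
lloyal::decoder::decode_tokens(ctx, next, n_past, n_batch);
++n_past;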

#pragma once
// SPDX-License-Identifier: Apache-2.0
// Copyright 2026 Lloyal Labs
#include "common.hpp"
#include "decoder.hpp"
#include <cstdint>
#include <llama/llama.h>
#include <vector>
namespace lloyal::kv {
// ===== KV SEQUENCE OPERATIONS =====
inline bool remove_range(llama_context *ctx, llama_seq_id seq, llama_pos p0,
llama_pos p1) {
if (!ctx) {
LLOYAL_LOG_DEBUG("[kv::remove_range] ERROR: null context");
return false;
}
llama_memory_t mem = llama_get_memory(ctx);
bool success = llama_memory_seq_rm(mem, seq, p0, p1);
if (!success) {
LLOYAL_LOG_DEBUG("[kv::remove_range] FAILED: seq=%d, p0=%d, p1=%d", seq, p0,
p1);
LLOYAL_LOG_DEBUG("[kv::remove_range] Guard-rail reminder: Ensure "
"remove_range called BEFORE next llama_decode()");
} else {
LLOYAL_LOG_DEBUG("[kv::remove_range] OK: seq=%d, removed tokens [%d, %d)",
seq, p0, p1);
}
return success;
}
inline llama_pos pos_max(llama_context *ctx, llama_seq_id seq) {
if (!ctx) {
LLOYAL_LOG_DEBUG("[kv::pos_max] ERROR: null context");
return -1;
}
llama_memory_t mem = llama_get_memory(ctx);
llama_pos max_pos = llama_memory_seq_pos_max(mem, seq);
LLOYAL_LOG_DEBUG("[kv::pos_max] seq=%d, max_pos=%d", seq, max_pos);
return max_pos;
}
inline void seq_cp(llama_context *ctx, llama_seq_id src, llama_seq_id dst,
llama_pos p0 = 0, llama_pos p1 = -1) {
if (!ctx) {
LLOYAL_LOG_DEBUG("[kv::seq_cp] ERROR: null context");
return;
}
llama_memory_t mem = llama_get_memory(ctx);
llama_memory_seq_cp(mem, src, dst, p0, p1);
LLOYAL_LOG_DEBUG("[kv::seq_cp] Copied seq %d → %d [%d, %d)", src, dst, p0, p1);
}
inline void seq_keep(llama_context *ctx, llama_seq_id seq) {
if (!ctx) {
LLOYAL_LOG_DEBUG("[kv::seq_keep] ERROR: null context");
return;
}
llama_memory_t mem = llama_get_memory(ctx);
llama_memory_seq_keep(mem, seq);
LLOYAL_LOG_DEBUG("[kv::seq_keep] Kept only seq %d", seq);
}
// ===== STATE SNAPSHOT OPERATIONS =====
inline size_t state_size(llama_context *ctx, llama_seq_id seq) {
if (!ctx) {
LLOYAL_LOG_DEBUG("[kv::state_size] ERROR: null context");
return 0;
}
llama_memory_t mem = llama_get_memory(ctx);
llama_pos max_pos = llama_memory_seq_pos_max(mem, seq);
if (max_pos < 0) {
LLOYAL_LOG_DEBUG("[kv::state_size] WARNING: KV cache is empty (max_pos=%d) "
"- returning 0",
max_pos);
return 0;
}
size_t size = llama_state_seq_get_size(ctx, seq);
if (size == 0) {
"[kv::state_size] Per-sequence size query failed for seq=%d", seq);
"[kv::state_size] Attempting global state size (fallback)");
size = llama_state_get_size(ctx);
if (size > 0) {
LLOYAL_LOG_DEBUG("[kv::state_size] Global fallback size: %zu bytes",
size);
} else {
LLOYAL_LOG_DEBUG("[kv::state_size] ERROR: Both per-sequence and global "
"size queries failed");
}
} else {
"[kv::state_size] Per-sequence size for seq=%d: %zu bytes (%.1f MB)",
seq, size, size / 1024.0 / 1024.0);
}
return size;
}
inline size_t state_save(llama_context *ctx, llama_seq_id seq, uint8_t *dst,
size_t size) {
if (!ctx || !dst || size == 0) {
"[kv::state_save] ERROR: invalid parameters (ctx=%p, dst=%p, size=%zu)",
ctx, dst, size);
return 0;
}
llama_memory_t mem = llama_get_memory(ctx);
llama_pos max_pos = llama_memory_seq_pos_max(mem, seq);
if (max_pos < 0) {
LLOYAL_LOG_DEBUG("[kv::state_save] WARNING: KV cache is empty (max_pos=%d) "
"- skipping save",
max_pos);
return 0;
}
size_t written = llama_state_seq_get_data(ctx, dst, size, seq);
if (written == 0) {
LLOYAL_LOG_DEBUG("[kv::state_save] Per-sequence save failed for seq=%d "
"(possible KV fragmentation)",
seq);
"[kv::state_save] Attempting global state save (fallback)");
written = llama_state_get_data(ctx, dst, size);
if (written > 0) {
"[kv::state_save] Global fallback succeeded: %zu bytes (%.1f MB)",
written, written / 1024.0 / 1024.0);
} else {
"[kv::state_save] ERROR: Both per-sequence and global save failed");
}
} else {
"[kv::state_save] Per-sequence saved %zu bytes (%.1f MB) for seq=%d",
written, written / 1024.0 / 1024.0, seq);
}
return written;
}
inline size_t state_load(llama_context *ctx, llama_seq_id seq,
const uint8_t *src, size_t size) {
if (!ctx || !src || size == 0) {
"[kv::state_load] ERROR: invalid parameters (ctx=%p, src=%p, size=%zu)",
ctx, src, size);
return 0;
}
llama_memory_t mem = llama_get_memory(ctx);
llama_pos max_pos = llama_memory_seq_pos_max(mem, seq);
if (max_pos < 0) {
LLOYAL_LOG_DEBUG("[kv::state_load] WARNING: KV cache is empty (max_pos=%d) "
"- loading may crash on recurrent models",
max_pos);
}
size_t read = llama_state_seq_set_data(ctx, src, size, seq);
if (read == 0) {
LLOYAL_LOG_DEBUG("[kv::state_load] Per-sequence restore failed for seq=%d "
"(possible fragmentation)",
seq);
"[kv::state_load] Attempting global state restore (fallback)");
read = llama_state_set_data(ctx, src, size);
if (read > 0) {
"[kv::state_load] Global fallback succeeded: %zu bytes (%.1f MB)",
read, read / 1024.0 / 1024.0);
} else {
LLOYAL_LOG_DEBUG("[kv::state_load] ERROR: Both per-sequence and global "
"restore failed");
}
} else {
"[kv::state_load] Per-sequence loaded %zu bytes (%.1f MB) for seq=%d",
read, read / 1024.0 / 1024.0, seq);
}
return read;
}
// ===== GLOBAL STATE OPERATIONS =====
inline size_t global_state_size(llama_context *ctx) {
if (!ctx) {
LLOYAL_LOG_DEBUG("[kv::global_state_size] ERROR: null context");
return 0;
}
size_t size = llama_state_get_size(ctx);
LLOYAL_LOG_DEBUG("[kv::global_state_size] %zu bytes (%.1f MB)", size,
size / 1024.0 / 1024.0);
return size;
}
inline size_t global_state_save(llama_context *ctx, uint8_t *dst, size_t size) {
if (!ctx || !dst || size == 0) {
LLOYAL_LOG_DEBUG("[kv::global_state_save] ERROR: invalid parameters");
return 0;
}
size_t written = llama_state_get_data(ctx, dst, size);
LLOYAL_LOG_DEBUG("[kv::global_state_save] %zu bytes written (%.1f MB)",
written, written / 1024.0 / 1024.0);
return written;
}
inline size_t global_state_load(llama_context *ctx, const uint8_t *src,
size_t size) {
if (!ctx || !src || size == 0) {
LLOYAL_LOG_DEBUG("[kv::global_state_load] ERROR: invalid parameters");
return 0;
}
size_t read = llama_state_set_data(ctx, src, size);
LLOYAL_LOG_DEBUG("[kv::global_state_load] %zu bytes read (%.1f MB)", read,
read / 1024.0 / 1024.0);
return read;
}
// ===== DIAGNOSTICS =====
inline void log_build_info(llama_context *ctx) {
"[kv::build_info] ============================================");
"[kv::build_info] llama.cpp KV Sequence Operations Configuration");
"[kv::build_info] ============================================");
LLOYAL_LOG_DEBUG("[kv::build_info] Version: b6870");
LLOYAL_LOG_DEBUG("[kv::build_info] API naming: llama_memory_seq_*");
"[kv::build_info] Current MVP: n_seq_max=1 (single sequence only)");
if (ctx) {
llama_pos max_pos = pos_max(ctx, 0);
if (max_pos >= 0) {
LLOYAL_LOG_DEBUG("[kv::build_info] Current KV cursor (seq 0): %d tokens",
max_pos);
} else {
LLOYAL_LOG_DEBUG("[kv::build_info] KV cache empty (seq 0)");
}
size_t snapshot_size = state_size(ctx, 0);
if (snapshot_size > 0) {
"[kv::build_info] Estimated snapshot size: %zu bytes (%.1f MB)",
snapshot_size, snapshot_size / 1024.0 / 1024.0);
}
}
"[kv::build_info] Fragmentation fallback: per-sequence → global state");
"[kv::build_info] Critical: Call remove_range() BEFORE llama_decode()");
"[kv::build_info] ============================================");
}
// ===== CACHE CLEARING =====
inline void clear_all(llama_context *ctx) {
if (!ctx) {
LLOYAL_LOG_DEBUG("[kv::clear_all] ERROR: NULL context");
throw std::runtime_error("kv::clear_all - NULL context");
}
LLOYAL_LOG_DEBUG("[kv::clear_all] Clearing KV cache (metadata + data)");
llama_memory_clear(llama_get_memory(ctx), true); // true = clear data buffers too
LLOYAL_LOG_DEBUG("[kv::clear_all] KV cache cleared");
}
inline void clear_metadata(llama_context *ctx) {
if (!ctx) {
LLOYAL_LOG_DEBUG("[kv::clear_metadata] ERROR: NULL context");
throw std::runtime_error("kv::clear_metadata - NULL context");
}
LLOYAL_LOG_DEBUG("[kv::clear_metadata] Clearing KV cache metadata only");
llama_memory_clear(llama_get_memory(ctx), false); // false = keep data buffers
LLOYAL_LOG_DEBUG("[kv::clear_metadata] KV cache metadata cleared");
}
// ===== CONTEXT COMPRESSION =====
inline void clear_and_reseed(llama_context *ctx,
const std::vector<llama_token> &original_sinks,
const std::vector<llama_token> &tail,
int32_t n_batch) {
if (!ctx) {
LLOYAL_LOG_DEBUG("[kv::clear_and_reseed] ERROR: null context");
throw std::runtime_error("kv::clear_and_reseed - NULL context");
}
if (original_sinks.empty() && tail.empty()) {
LLOYAL_LOG_DEBUG("[kv::clear_and_reseed] ERROR: both sinks and tail are empty");
throw std::runtime_error("kv::clear_and_reseed - no tokens to reseed");
}
LLOYAL_LOG_DEBUG("[kv::clear_and_reseed] Starting reseed: %zu sinks + %zu tail = %zu total",
original_sinks.size(), tail.size(), original_sinks.size() + tail.size());
// Get memory handle
llama_memory_t mem = llama_get_memory(ctx);
// Log state before clear
llama_pos max_pos_before = llama_memory_seq_pos_max(mem, 0);
LLOYAL_LOG_DEBUG("[kv::clear_and_reseed] Before clear: KV cache max_pos=%d", max_pos_before);
// Clear entire KV cache (simple and reliable)
llama_memory_clear(mem, true);
llama_pos max_pos_after_clear = llama_memory_seq_pos_max(mem, 0);
if (max_pos_after_clear != -1) {
LLOYAL_LOG_DEBUG("[kv::clear_and_reseed] WARNING: KV cache not empty after clear (max_pos=%d)",
max_pos_after_clear);
}
// Re-decode sinks at position 0
if (!original_sinks.empty()) {
LLOYAL_LOG_DEBUG("[kv::clear_and_reseed] Re-decoding %zu sinks at position 0", original_sinks.size());
lloyal::decoder::decode_tokens(ctx, original_sinks, 0, n_batch);
}
// Re-decode tail at position sinks.size()
if (!tail.empty()) {
int32_t tail_start_pos = static_cast<int32_t>(original_sinks.size());
LLOYAL_LOG_DEBUG("[kv::clear_and_reseed] Re-decoding %zu tail tokens at position %d",
tail.size(), tail_start_pos);
lloyal::decoder::decode_tokens(ctx, tail, tail_start_pos, n_batch);
}
// Verify final state
llama_pos max_pos_after = llama_memory_seq_pos_max(mem, 0);
int32_t expected_pos = static_cast<int32_t>(original_sinks.size() + tail.size()) - 1;
LLOYAL_LOG_DEBUG("[kv::clear_and_reseed] After reseed: KV cache max_pos=%d (expected %d)",
max_pos_after, expected_pos);
if (max_pos_after != expected_pos) {
LLOYAL_LOG_DEBUG("[kv::clear_and_reseed] WARNING: Unexpected final position (got %d, expected %d)",
max_pos_after, expected_pos);
}
LLOYAL_LOG_DEBUG("[kv::clear_and_reseed] Reseed complete");
}
// ===== FILE PERSISTENCE =====
struct FileData {
std::vector<llama_token> tokens;
size_t bytes_read;
};
inline size_t write_file(llama_context *ctx, llama_seq_id seq,
const std::string &filepath,
const std::vector<llama_token> &tokens) {
if (!ctx) {
LLOYAL_LOG_DEBUG("[kv::write_file] ERROR: null context");
return 0;
}
if (filepath.empty()) {
LLOYAL_LOG_DEBUG("[kv::write_file] ERROR: empty filepath");
return 0;
}
// Guard: Don't write if KV cache is empty
llama_memory_t mem = llama_get_memory(ctx);
llama_pos max_pos = llama_memory_seq_pos_max(mem, seq);
if (max_pos < 0) {
"[kv::write_file] WARNING: KV cache is empty - skipping write");
return 0;
}
// Delegate to llama.cpp's session file writer
// Note: llama.cpp signature is (ctx, filepath, seq_id, tokens, n_tokens)
size_t bytes = llama_state_seq_save_file(ctx, filepath.c_str(), seq,
tokens.data(), tokens.size());
if (bytes > 0) {
LLOYAL_LOG_DEBUG("[kv::write_file] Wrote %s: %zu bytes (%.1f MB), %zu "
"tokens",
filepath.c_str(), bytes, bytes / 1024.0 / 1024.0,
tokens.size());
} else {
LLOYAL_LOG_DEBUG("[kv::write_file] FAILED to write %s", filepath.c_str());
}
return bytes;
}
inline FileData read_file(llama_context *ctx, llama_seq_id seq,
const std::string &filepath) {
if (!ctx) {
throw std::runtime_error("[kv::read_file] null context");
}
if (filepath.empty()) {
throw std::runtime_error("[kv::read_file] empty filepath");
}
// Get model's n_ctx to allocate token buffer
const uint32_t n_ctx = llama_n_ctx(ctx);
std::vector<llama_token> tokens;
tokens.resize(n_ctx); // Allocate buffer with capacity
size_t token_count = 0;
// Note: llama.cpp signature is (ctx, filepath, seq_id, tokens_out, capacity, count_out)
size_t bytes =
llama_state_seq_load_file(ctx, filepath.c_str(), seq, tokens.data(),
tokens.size(), &token_count);
if (bytes == 0) {
throw std::runtime_error("[kv::read_file] failed to load from " +
filepath);
}
tokens.resize(token_count);
LLOYAL_LOG_DEBUG("[kv::read_file] Loaded %s: %zu bytes (%.1f MB), %zu tokens",
filepath.c_str(), bytes, bytes / 1024.0 / 1024.0,
token_count);
return FileData{std::move(tokens), bytes};
}
} // namespace lloyal::kv
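
A usage sketch (not part of the header) showing how the snapshot helpers above compose into an in-memory save/restore round trip; targeting seq 0 matches the current single-sequence MVP, and the error handling here is illustrative only:

// Sketch: snapshot sequence 0 into a heap buffer, restore it later
std::vector<uint8_t> snapshot;
size_t need = lloyal::kv::state_size(ctx, 0);   // returns 0 if the KV cache is empty
if (need > 0) {
  snapshot.resize(need);
  size_t written = lloyal::kv::state_save(ctx, 0, snapshot.data(), snapshot.size());
  snapshot.resize(written);                     // written == 0 means both save paths failed
}
// ... generation continues, KV cache changes ...
if (!snapshot.empty()) {
  lloyal::kv::state_load(ctx, 0, snapshot.data(), snapshot.size());
}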