#include <llama/llama.h>

inline ggml_type from_str(const std::string& s) {
  if (s == "f32")    return GGML_TYPE_F32;
  if (s == "f16")    return GGML_TYPE_F16;
  if (s == "bf16")   return GGML_TYPE_BF16;
  if (s == "q8_0")   return GGML_TYPE_Q8_0;
  if (s == "q4_0")   return GGML_TYPE_Q4_0;
  if (s == "q4_1")   return GGML_TYPE_Q4_1;
  if (s == "iq4_nl") return GGML_TYPE_IQ4_NL;
  if (s == "q5_0")   return GGML_TYPE_Q5_0;
  if (s == "q5_1")   return GGML_TYPE_Q5_1;
  return GGML_TYPE_COUNT;  // unknown name
}
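// Usage sketch (illustrative, not part of this header): route CLI-style
// cache-type names into context creation, assuming a loaded llama_model*
// `model`. llama_context_params carries type_k/type_v for the K and V
// cache tensor types.
//
// @code
//   llama_context_params cparams = llama_context_default_params();
//   cparams.type_k = from_str("q8_0");  // quantized keys
//   cparams.type_v = from_str("f16");   // half-precision values
//   assert(cparams.type_k != GGML_TYPE_COUNT && "unknown cache type name");
//   llama_context* ctx = llama_init_from_model(model, cparams);
// @endcode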
inline bool remove_range(llama_context *ctx, llama_seq_id seq, llama_pos p0,
                         llama_pos p1) {
  llama_memory_t mem = llama_get_memory(ctx);
  bool success = llama_memory_seq_rm(mem, seq, p0, p1);
  // Contract: the removal must land before the next decode.
  LLOYAL_LOG_DEBUG(
      "remove_range called BEFORE next llama_decode()");
  return success;
}
inline llama_pos pos_max(llama_context *ctx, llama_seq_id seq) {
  llama_memory_t mem = llama_get_memory(ctx);
  llama_pos max_pos = llama_memory_seq_pos_max(mem, seq);
  return max_pos;  // -1 when the sequence is empty
}
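// Usage sketch (illustrative): rewind the last `n_drop` positions of seq 0
// before the next decode, e.g. to discard rejected draft tokens. `n_drop`
// is hypothetical; p1 = -1 means "to end of sequence".
//
// @code
//   llama_pos end = pos_max(ctx, 0);
//   if (end >= 0) {
//     llama_pos p0 = end + 1 - n_drop;
//     remove_range(ctx, 0, p0 < 0 ? 0 : p0, -1);
//   }
// @endcode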
inline void seq_cp(llama_context *ctx, llama_seq_id src, llama_seq_id dst,
                   llama_pos p0 = 0, llama_pos p1 = -1) {
  llama_memory_t mem = llama_get_memory(ctx);
  llama_memory_seq_cp(mem, src, dst, p0, p1);
  LLOYAL_LOG_DEBUG(
      "[kv::seq_cp] Copied seq %d → %d [%d, %d)", src, dst, p0, p1);
}
inline void seq_keep(llama_context *ctx, llama_seq_id seq) {
  llama_memory_t mem = llama_get_memory(ctx);
  llama_memory_seq_keep(mem, seq);
}
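// Usage sketch (illustrative): fork a branch off a shared prefix, decode a
// candidate continuation on it, then commit by dropping every other
// sequence. Assumes the context was created with n_seq_max >= 2.
//
// @code
//   seq_cp(ctx, /*src=*/0, /*dst=*/1);  // branch shares the prefix KV
//   // ... decode candidate tokens into seq 1 ...
//   seq_keep(ctx, 1);                   // keep the winner, drop the rest
// @endcode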
static_assert(std::is_signed_v<llama_seq_id>,
              "llama_seq_id must be signed for NO_LEASE sentinel");

/// Sentinel value indicating a branch has no KV residency.
constexpr llama_seq_id NO_LEASE = static_cast<llama_seq_id>(-1);

/// Tenancy state — tracks seq_id vacancy and leases.
struct State {
  llama_context* ctx = nullptr;      ///< Context for KV operations (nullptr after drain)
  llama_seq_id n_seq_max = 0;        ///< Total seq_id capacity (from llama_n_seq_max)
  std::vector<llama_seq_id> vacant;  ///< Available seq_ids (LIFO stack)
  std::vector<uint8_t> leased;       ///< Bitmap: leased[seq] = 1 if issued
};
inline State init(llama_context* ctx, llama_seq_id n_seq_max) {
  State s;
  s.ctx = ctx;
  s.n_seq_max = n_seq_max;
  s.leased.resize(static_cast<size_t>(n_seq_max), 0);
  s.vacant.reserve(static_cast<size_t>(n_seq_max));
  // Push in descending order so seq 0 is leased first (LIFO stack).
  for (llama_seq_id i = n_seq_max; i-- > 0; ) {
    s.vacant.push_back(i);
  }
  return s;
}
inline llama_seq_id acquire(State& s) {
  if (s.vacant.empty()) {
    return NO_LEASE;  // pool exhausted
  }
  llama_seq_id seq = s.vacant.back();
  s.vacant.pop_back();
  s.leased[static_cast<size_t>(seq)] = 1;
  return seq;
}
inline void release(State& s, llama_seq_id seq) {
  assert(seq >= 0 && seq < s.n_seq_max && "release: seq out of range");
  assert(s.leased[static_cast<size_t>(seq)] && "release: seq not leased");
  s.leased[static_cast<size_t>(seq)] = 0;
  s.vacant.push_back(seq);  // bookkeeping only, no KV calls
}
inline void evict(State& s, llama_seq_id seq) {
  assert(seq >= 0 && seq < s.n_seq_max && "evict: seq out of range");
  assert(s.leased[static_cast<size_t>(seq)] && "evict: seq not leased");
  remove_range(s.ctx, seq, 0, -1);  // strip all KV tags
  release(s, seq);
}
inline void retain(State& s, llama_seq_id keep) {
  assert(keep >= 0 && keep < s.n_seq_max && "retain: keep seq out of range");
  assert(s.leased[static_cast<size_t>(keep)] && "retain: keep seq not leased");
  seq_keep(s.ctx, keep);
  // Verify seq_keep left no residue on the other sequences.
  for (llama_seq_id i = 0; i < s.n_seq_max; ++i) {
    if (i != keep) assert(pos_max(s.ctx, i) < 0 &&
                          "retain: seq_keep left dirty tags");
  }
  // Rebuild vacancy from scratch.
  s.vacant.clear();
  for (llama_seq_id i = s.n_seq_max; i-- > 0; ) {
    if (i == keep) {
      s.leased[static_cast<size_t>(i)] = 1;
    } else {
      s.leased[static_cast<size_t>(i)] = 0;
      s.vacant.push_back(i);
    }
  }
}
inline void evict_all(State& s) {
  for (llama_seq_id i = 0; i < s.n_seq_max; ++i) {
    if (s.leased[static_cast<size_t>(i)]) {
      evict(s, i);
    }
  }
}
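// Usage sketch (illustrative): the full lease lifecycle, assuming these
// tenancy helpers are visible in the calling scope.
//
// @code
//   State pool = init(ctx, /*n_seq_max=*/4);
//   llama_seq_id seq = acquire(pool);
//   if (seq != NO_LEASE) {
//     // ... decode into `seq`, sample, branch ...
//     evict(pool, seq);  // strip KV tags, then return to the pool
//   }
// @endcode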
inline size_t state_size(llama_context *ctx, llama_seq_id seq) {
  llama_memory_t mem = llama_get_memory(ctx);
  llama_pos max_pos = llama_memory_seq_pos_max(mem, seq);
  if (max_pos < 0) {
    LLOYAL_LOG_DEBUG(
        "[kv::state_size] WARNING: KV cache is empty (max_pos=%d)", max_pos);
  }
  size_t size = llama_state_seq_get_size(ctx, seq);
  if (size == 0) {
    LLOYAL_LOG_DEBUG(
        "[kv::state_size] Per-sequence size query failed for seq=%d", seq);
    LLOYAL_LOG_DEBUG(
        "[kv::state_size] Attempting global state size (fallback)");
    size = llama_state_get_size(ctx);
    if (size == 0) {
      LLOYAL_LOG_DEBUG("[kv::state_size] ERROR: Both per-sequence and global "
                       "size queries failed");
      return 0;
    }
    return size;
  }
  LLOYAL_LOG_DEBUG(
      "[kv::state_size] Per-sequence size for seq=%d: %zu bytes (%.1f MB)",
      seq, size, size / 1024.0 / 1024.0);
  return size;
}
inline size_t state_save(llama_context *ctx, llama_seq_id seq, uint8_t *dst,
                         size_t size) {
  if (!ctx || !dst || size == 0) {
    LLOYAL_LOG_DEBUG(
        "[kv::state_save] ERROR: invalid parameters (ctx=%p, dst=%p, size=%zu)",
        (void*)ctx, (void*)dst, size);
    return 0;
  }
  llama_memory_t mem = llama_get_memory(ctx);
  llama_pos max_pos = llama_memory_seq_pos_max(mem, seq);
  if (max_pos < 0) {
    LLOYAL_LOG_DEBUG(
        "[kv::state_save] WARNING: KV cache is empty (max_pos=%d)", max_pos);
  }
  size_t written = llama_state_seq_get_data(ctx, dst, size, seq);
  if (written == 0) {
    LLOYAL_LOG_DEBUG("[kv::state_save] Per-sequence save failed for seq=%d "
                     "(possible KV fragmentation)",
                     seq);
    LLOYAL_LOG_DEBUG(
        "[kv::state_save] Attempting global state save (fallback)");
    written = llama_state_get_data(ctx, dst, size);
    if (written > 0) {
      LLOYAL_LOG_DEBUG(
          "[kv::state_save] Global fallback succeeded: %zu bytes (%.1f MB)",
          written, written / 1024.0 / 1024.0);
      return written;
    }
    LLOYAL_LOG_DEBUG(
        "[kv::state_save] ERROR: Both per-sequence and global save failed");
    return 0;
  }
  LLOYAL_LOG_DEBUG(
      "[kv::state_save] Per-sequence saved %zu bytes (%.1f MB) for seq=%d",
      written, written / 1024.0 / 1024.0, seq);
  return written;
}
inline size_t state_load(llama_context *ctx, llama_seq_id seq,
                         const uint8_t *src, size_t size) {
  if (!ctx || !src || size == 0) {
    LLOYAL_LOG_DEBUG(
        "[kv::state_load] ERROR: invalid parameters (ctx=%p, src=%p, size=%zu)",
        (void*)ctx, (const void*)src, size);
    return 0;
  }
  llama_memory_t mem = llama_get_memory(ctx);
  llama_pos max_pos = llama_memory_seq_pos_max(mem, seq);
  if (max_pos < 0) {
    LLOYAL_LOG_DEBUG(
        "[kv::state_load] WARNING: KV cache is empty (max_pos=%d) "
        "- loading may crash on recurrent models",
        max_pos);
  }
  size_t read = llama_state_seq_set_data(ctx, src, size, seq);
  if (read == 0) {
    LLOYAL_LOG_DEBUG("[kv::state_load] Per-sequence restore failed for seq=%d "
                     "(possible fragmentation)",
                     seq);
    LLOYAL_LOG_DEBUG(
        "[kv::state_load] Attempting global state restore (fallback)");
    read = llama_state_set_data(ctx, src, size);
    if (read > 0) {
      LLOYAL_LOG_DEBUG(
          "[kv::state_load] Global fallback succeeded: %zu bytes (%.1f MB)",
          read, read / 1024.0 / 1024.0);
    }
    return read;
  }
  LLOYAL_LOG_DEBUG(
      "[kv::state_load] Per-sequence loaded %zu bytes (%.1f MB) for seq=%d",
      read, read / 1024.0 / 1024.0, seq);
  return read;
}
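// Usage sketch (illustrative): snapshot seq 0 into a heap buffer and
// restore it later. Sizing comes from state_size(); both save and load
// fall back to the global-state path on per-sequence failure.
//
// @code
//   size_t need = state_size(ctx, 0);
//   std::vector<uint8_t> snapshot(need);
//   size_t written = state_save(ctx, 0, snapshot.data(), snapshot.size());
//   // ... later, e.g. after clear_all(ctx) ...
//   size_t read = state_load(ctx, 0, snapshot.data(), written);
// @endcode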
inline size_t global_state_size(llama_context *ctx) {
  if (!ctx) return 0;
  size_t size = llama_state_get_size(ctx);
  LLOYAL_LOG_DEBUG("[kv::global_state_size] %zu bytes (%.1f MB)", size,
                   size / 1024.0 / 1024.0);
  return size;
}

inline size_t global_state_save(llama_context *ctx, uint8_t *dst, size_t size) {
  if (!ctx || !dst || size == 0) {
    return 0;
  }
  size_t written = llama_state_get_data(ctx, dst, size);
  LLOYAL_LOG_DEBUG("[kv::global_state_save] Saved %zu bytes (%.1f MB)",
                   written, written / 1024.0 / 1024.0);
  return written;
}

inline size_t global_state_load(llama_context *ctx, const uint8_t *src,
                                size_t size) {
  if (!ctx || !src || size == 0) {
    return 0;
  }
  size_t read = llama_state_set_data(ctx, src, size);
  LLOYAL_LOG_DEBUG("[kv::global_state_load] Restored %zu bytes (%.1f MB)", read,
                   read / 1024.0 / 1024.0);
  return read;
}
inline void log_build_info(llama_context *ctx) {
  LLOYAL_LOG_DEBUG(
      "[kv::build_info] ============================================");
  LLOYAL_LOG_DEBUG(
      "[kv::build_info] llama.cpp KV Sequence Operations Configuration");
  LLOYAL_LOG_DEBUG(
      "[kv::build_info] ============================================");
  LLOYAL_LOG_DEBUG(
      "[kv::build_info] Current MVP: n_seq_max=1 (single sequence only)");

  llama_pos max_pos = pos_max(ctx, 0);
  LLOYAL_LOG_DEBUG("[kv::build_info] KV cache max_pos=%d", max_pos);

  size_t snapshot_size = state_size(ctx, 0);
  if (snapshot_size > 0) {
    LLOYAL_LOG_DEBUG(
        "[kv::build_info] Estimated snapshot size: %zu bytes (%.1f MB)",
        snapshot_size, snapshot_size / 1024.0 / 1024.0);
  }

  LLOYAL_LOG_DEBUG(
      "[kv::build_info] Fragmentation fallback: per-sequence → global state");
  LLOYAL_LOG_DEBUG(
      "[kv::build_info] Critical: Call remove_range() BEFORE llama_decode()");
  LLOYAL_LOG_DEBUG(
      "[kv::build_info] ============================================");
}
inline void clear_all(llama_context *ctx) {
  if (!ctx) {
    throw std::runtime_error("kv::clear_all - NULL context");
  }
  llama_memory_clear(llama_get_memory(ctx), /*data=*/true);
}

inline void clear_metadata(llama_context *ctx) {
  if (!ctx) {
    throw std::runtime_error("kv::clear_metadata - NULL context");
  }
  llama_memory_clear(llama_get_memory(ctx), /*data=*/false);
}
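// Usage sketch (illustrative): clear_metadata() drops the cell bookkeeping
// while leaving the data buffers untouched, which is enough between
// unrelated prompts; clear_all() also clears the buffers, e.g. before a
// state_load.
//
// @code
//   clear_metadata(ctx);  // fast reset between prompts
//   clear_all(ctx);       // complete reset
// @endcode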
inline void clear_and_reseed(llama_context *ctx,
                             const std::vector<llama_token> &original_sinks,
                             const std::vector<llama_token> &tail,
                             int32_t n_batch) {
  if (!ctx) {
    throw std::runtime_error("kv::clear_and_reseed - NULL context");
  }
  if (original_sinks.empty() && tail.empty()) {
    LLOYAL_LOG_DEBUG(
        "[kv::clear_and_reseed] ERROR: both sinks and tail are empty");
    throw std::runtime_error("kv::clear_and_reseed - no tokens to reseed");
  }

  LLOYAL_LOG_DEBUG(
      "[kv::clear_and_reseed] Starting reseed: %zu sinks + %zu tail = %zu total",
      original_sinks.size(), tail.size(), original_sinks.size() + tail.size());

  llama_memory_t mem = llama_get_memory(ctx);

  llama_pos max_pos_before = llama_memory_seq_pos_max(mem, 0);
  LLOYAL_LOG_DEBUG(
      "[kv::clear_and_reseed] Before clear: KV cache max_pos=%d", max_pos_before);

  llama_memory_clear(mem, /*data=*/true);

  llama_pos max_pos_after_clear = llama_memory_seq_pos_max(mem, 0);
  if (max_pos_after_clear != -1) {
    LLOYAL_LOG_DEBUG(
        "[kv::clear_and_reseed] WARNING: KV cache not empty after clear (max_pos=%d)",
        max_pos_after_clear);
  }

  // 1. Re-decode the attention sinks at position 0.
  if (!original_sinks.empty()) {
    LLOYAL_LOG_DEBUG(
        "[kv::clear_and_reseed] Re-decoding %zu sinks at position 0",
        original_sinks.size());
    if (decode::many(ctx, original_sinks.data(),
                     static_cast<int32_t>(original_sinks.size()),
                     /*n_past=*/0, n_batch) != 0) {
      throw std::runtime_error(
          "kv::clear_and_reseed - llama_decode failed on sinks");
    }
  }

  // 2. Re-decode the tail immediately after the sinks.
  if (!tail.empty()) {
    int32_t tail_start_pos = static_cast<int32_t>(original_sinks.size());
    LLOYAL_LOG_DEBUG(
        "[kv::clear_and_reseed] Re-decoding %zu tail tokens at position %d",
        tail.size(), tail_start_pos);
    if (decode::many(ctx, tail.data(), static_cast<int32_t>(tail.size()),
                     tail_start_pos, n_batch) != 0) {
      throw std::runtime_error(
          "kv::clear_and_reseed - llama_decode failed on tail");
    }
  }

  // 3. Sanity-check the final position.
  llama_pos max_pos_after = llama_memory_seq_pos_max(mem, 0);
  int32_t expected_pos =
      static_cast<int32_t>(original_sinks.size() + tail.size()) - 1;
  LLOYAL_LOG_DEBUG(
      "[kv::clear_and_reseed] After reseed: KV cache max_pos=%d (expected %d)",
      max_pos_after, expected_pos);
  if (max_pos_after != expected_pos) {
    LLOYAL_LOG_DEBUG(
        "[kv::clear_and_reseed] WARNING: Unexpected final position (got %d, expected %d)",
        max_pos_after, expected_pos);
  }
}
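// Usage sketch (illustrative): StreamingLLM-style compaction. Keep the
// first few "sink" tokens plus a recent tail from the full history
// (`history` is hypothetical), then reseed. Positions are compacted: the
// tail is re-decoded starting right after the sinks.
//
// @code
//   const size_t n_sink = 4, n_tail = 1024;
//   std::vector<llama_token> sinks(history.begin(), history.begin() + n_sink);
//   std::vector<llama_token> tail(history.end() - n_tail, history.end());
//   clear_and_reseed(ctx, sinks, tail, /*n_batch=*/512);
// @endcode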
inline size_t write_file(llama_context *ctx, llama_seq_id seq,
                         const std::string &filepath,
                         const std::vector<llama_token> &tokens) {
  if (!ctx) {
    throw std::runtime_error("[kv::write_file] null context");
  }
  if (filepath.empty()) {
    throw std::runtime_error("[kv::write_file] empty filepath");
  }

  llama_memory_t mem = llama_get_memory(ctx);
  llama_pos max_pos = llama_memory_seq_pos_max(mem, seq);
  if (max_pos < 0) {
    LLOYAL_LOG_DEBUG(
        "[kv::write_file] WARNING: KV cache is empty - skipping write");
    return 0;
  }

  size_t bytes = llama_state_seq_save_file(ctx, filepath.c_str(), seq,
                                           tokens.data(), tokens.size());
  LLOYAL_LOG_DEBUG(
      "[kv::write_file] Wrote %s: %zu bytes (%.1f MB), %zu tokens",
      filepath.c_str(), bytes, bytes / 1024.0 / 1024.0,
      tokens.size());
  return bytes;
}
/// Data structure returned by read_file.
struct FileData {
  std::vector<llama_token> tokens;  ///< Tokens restored from file
  size_t bytes_read;                ///< Total bytes read from file
};

inline FileData read_file(llama_context *ctx, llama_seq_id seq,
                          const std::string &filepath) {
  if (!ctx) {
    throw std::runtime_error("[kv::read_file] null context");
  }
  if (filepath.empty()) {
    throw std::runtime_error("[kv::read_file] empty filepath");
  }

  const uint32_t n_ctx = llama_n_ctx(ctx);
  std::vector<llama_token> tokens;
  tokens.resize(n_ctx);  // capacity for the worst case
  size_t token_count = 0;

  size_t bytes =
      llama_state_seq_load_file(ctx, filepath.c_str(), seq, tokens.data(),
                                tokens.size(), &token_count);
  if (bytes == 0) {
    throw std::runtime_error("[kv::read_file] failed to load from " + filepath);
  }
  tokens.resize(token_count);

  LLOYAL_LOG_DEBUG(
      "[kv::read_file] Loaded %s: %zu bytes (%.1f MB), %zu tokens",
      filepath.c_str(), bytes, bytes / 1024.0 / 1024.0, token_count);

  return FileData{std::move(tokens), bytes};
}
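// Usage sketch (illustrative): persist a session and resume it later.
// `prompt_tokens` and the path are hypothetical.
//
// @code
//   write_file(ctx, 0, "session.kv", prompt_tokens);
//   // ... later, possibly in a fresh process/context ...
//   FileData restored = read_file(ctx, 0, "session.kv");
//   // restored.tokens resumes sampling without re-decoding the prompt
// @endcode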
Symbol reference (briefs from the linked documentation):

  #define LLOYAL_LOG_DEBUG(...)
      liblloyal - common definitions and logging.

  int many(llama_context *ctx, const llama_token *tokens, int32_t n_tokens,
           int32_t n_past, int32_t n_batch, llama_seq_id seq_id = 0)
      Decode multiple tokens into the KV cache with auto-chunking
      (batch decoding operations).

  ggml_type from_str(const std::string &s)
      Map string name to ggml_type enum (matches llama.cpp CLI -ctk/-ctv flags).

  bool remove_range(llama_context *ctx, llama_seq_id seq, llama_pos p0, llama_pos p1)
      Remove token range from KV cache sequence.

  llama_pos pos_max(llama_context *ctx, llama_seq_id seq)
      Get maximum position in KV cache sequence.

  void seq_cp(llama_context *ctx, llama_seq_id src, llama_seq_id dst,
              llama_pos p0 = 0, llama_pos p1 = -1)
      Copy KV cache from one sequence to another.

  void seq_keep(llama_context *ctx, llama_seq_id seq)
      Keep only one sequence, removing all others.

  constexpr llama_seq_id NO_LEASE
      Sentinel value indicating a branch has no KV residency.

  State init(llama_context *ctx, llama_seq_id n_seq_max)
      Initialize tenancy with all seq_ids vacant.

  llama_seq_id acquire(State &s)
      Acquire a seq_id from the vacant pool.

  void release(State &s, llama_seq_id seq)
      Release a seq_id back to vacant — bookkeeping only, no KV calls.

  void evict(State &s, llama_seq_id seq)
      Evict a seq_id — strip all KV tags then release.

  void evict_all(State &s)
      Evict every leased seq_id.

  void retain(State &s, llama_seq_id keep)
      Nuclear retain — keep one seq, rebuild vacancy from scratch.

  size_t available(const State &s)
      Number of vacant seq_ids available for acquisition.

  size_t state_size(llama_context *ctx, llama_seq_id seq)
      Get size needed to serialize sequence state.

  size_t state_save(llama_context *ctx, llama_seq_id seq, uint8_t *dst, size_t size)
      Save sequence state to buffer.

  size_t state_load(llama_context *ctx, llama_seq_id seq, const uint8_t *src, size_t size)
      Restore sequence state from buffer.

  size_t global_state_size(llama_context *ctx)
      Get size needed to serialize global state.

  size_t global_state_save(llama_context *ctx, uint8_t *dst, size_t size)
      Save global state to buffer.

  size_t global_state_load(llama_context *ctx, const uint8_t *src, size_t size)
      Restore global state from buffer.

  void log_build_info(llama_context *ctx)
      Log KV cache build info and current state.

  void clear_all(llama_context *ctx)
      Clear all KV cache (complete reset).

  void clear_metadata(llama_context *ctx)
      Clear KV cache metadata only (fast reset).

  void clear_and_reseed(llama_context *ctx, const std::vector<llama_token> &original_sinks,
                        const std::vector<llama_token> &tail, int32_t n_batch)
      Clear the cache and re-decode sinks plus tail.

  size_t write_file(llama_context *ctx, llama_seq_id seq, const std::string &filepath,
                    const std::vector<llama_token> &tokens)
      Write KV state to file with self-describing format.

  FileData read_file(llama_context *ctx, llama_seq_id seq, const std::string &filepath)
      Read KV state from a file written by write_file.

  struct FileData: data structure returned by read_file.
      std::vector<llama_token> tokens: tokens restored from file.
      size_t bytes_read: total bytes read from file.

  struct State: tenancy state — tracks seq_id vacancy and leases.
      llama_context *ctx: context for KV operations (nullptr after drain).
      std::vector<llama_seq_id> vacant: available seq_ids (LIFO stack).
      std::vector<uint8_t> leased: bitmap, leased[seq] = 1 if issued.
      llama_seq_id n_seq_max: total seq_id capacity (from llama_n_seq_max).