liblloyal 1.0.0
Composable primitives for llama.cpp inference
kv.hpp
#pragma once

// SPDX-License-Identifier: Apache-2.0
// Copyright 2026 Lloyal Labs

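/**
 * @file kv.hpp
 * @brief KV cache sequence operations for llama.cpp: range removal, sequence
 *        copy, state snapshots, context compression, and file persistence.
 */
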
#include "common.hpp"
#include "decoder.hpp"
#include <cstdint>
#include <llama/llama.h>
#include <stdexcept>
#include <string>
#include <vector>

namespace lloyal::kv {

// ===== KV SEQUENCE OPERATIONS =====

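/**
 * @brief Remove token range from KV cache sequence.
 *
 * Removes tokens in [p0, p1) from sequence `seq`. Must be called BEFORE the
 * next llama_decode() so the decoder sees a consistent cache.
 *
 * @return true on success, false if removal failed
 */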
inline bool remove_range(llama_context *ctx, llama_seq_id seq, llama_pos p0,
                         llama_pos p1) {
  if (!ctx) {
    LLOYAL_LOG_DEBUG("[kv::remove_range] ERROR: null context");
    return false;
  }

  llama_memory_t mem = llama_get_memory(ctx);
  bool success = llama_memory_seq_rm(mem, seq, p0, p1);

  if (!success) {
    LLOYAL_LOG_DEBUG("[kv::remove_range] FAILED: seq=%d, p0=%d, p1=%d", seq, p0,
                     p1);
    LLOYAL_LOG_DEBUG("[kv::remove_range] Guard-rail reminder: Ensure "
                     "remove_range called BEFORE next llama_decode()");
  } else {
    LLOYAL_LOG_DEBUG("[kv::remove_range] OK: seq=%d, removed tokens [%d, %d)",
                     seq, p0, p1);
  }

  return success;
}

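/**
 * @brief Get maximum position in KV cache sequence.
 *
 * @return max position, or -1 if the sequence is empty or ctx is null
 */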
inline llama_pos pos_max(llama_context *ctx, llama_seq_id seq) {
  if (!ctx) {
    LLOYAL_LOG_DEBUG("[kv::pos_max] ERROR: null context");
    return -1;
  }

  llama_memory_t mem = llama_get_memory(ctx);
  llama_pos max_pos = llama_memory_seq_pos_max(mem, seq);

  LLOYAL_LOG_DEBUG("[kv::pos_max] seq=%d, max_pos=%d", seq, max_pos);
  return max_pos;
}

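/**
 * @brief Copy KV cache from one sequence to another.
 *
 * Copies positions [p0, p1) from `src` to `dst`; the defaults
 * (p0 = 0, p1 = -1) copy the entire sequence.
 */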
inline void seq_cp(llama_context *ctx, llama_seq_id src, llama_seq_id dst,
                   llama_pos p0 = 0, llama_pos p1 = -1) {
  if (!ctx) {
    LLOYAL_LOG_DEBUG("[kv::seq_cp] ERROR: null context");
    return;
  }

  llama_memory_t mem = llama_get_memory(ctx);
  llama_memory_seq_cp(mem, src, dst, p0, p1);

  LLOYAL_LOG_DEBUG("[kv::seq_cp] Copied seq %d → %d [%d, %d)", src, dst, p0, p1);
}

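/**
 * @brief Keep only one sequence, removing all others.
 */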
inline void seq_keep(llama_context *ctx, llama_seq_id seq) {
  if (!ctx) {
    LLOYAL_LOG_DEBUG("[kv::seq_keep] ERROR: null context");
    return;
  }

  llama_memory_t mem = llama_get_memory(ctx);
  llama_memory_seq_keep(mem, seq);

  LLOYAL_LOG_DEBUG("[kv::seq_keep] Kept only seq %d", seq);
}

// ===== STATE SNAPSHOT OPERATIONS =====

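/**
 * @brief Get size needed to serialize sequence state.
 *
 * Returns 0 if the KV cache is empty. Falls back to the global state size
 * when the per-sequence query fails.
 */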
inline size_t state_size(llama_context *ctx, llama_seq_id seq) {
  if (!ctx) {
    LLOYAL_LOG_DEBUG("[kv::state_size] ERROR: null context");
    return 0;
  }

  llama_memory_t mem = llama_get_memory(ctx);
  llama_pos max_pos = llama_memory_seq_pos_max(mem, seq);
  if (max_pos < 0) {
    LLOYAL_LOG_DEBUG("[kv::state_size] WARNING: KV cache is empty (max_pos=%d) "
                     "- returning 0",
                     max_pos);
    return 0;
  }

  size_t size = llama_state_seq_get_size(ctx, seq);

  if (size == 0) {
    LLOYAL_LOG_DEBUG(
        "[kv::state_size] Per-sequence size query failed for seq=%d", seq);
    LLOYAL_LOG_DEBUG(
        "[kv::state_size] Attempting global state size (fallback)");
    size = llama_state_get_size(ctx);

    if (size > 0) {
      LLOYAL_LOG_DEBUG("[kv::state_size] Global fallback size: %zu bytes",
                       size);
    } else {
      LLOYAL_LOG_DEBUG("[kv::state_size] ERROR: Both per-sequence and global "
                       "size queries failed");
    }
  } else {
    LLOYAL_LOG_DEBUG(
        "[kv::state_size] Per-sequence size for seq=%d: %zu bytes (%.1f MB)",
        seq, size, size / 1024.0 / 1024.0);
  }

  return size;
}

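/**
 * @brief Save sequence state to buffer.
 *
 * The buffer must be at least state_size(ctx, seq) bytes. Falls back to a
 * whole-context save if the per-sequence save fails.
 *
 * Example (illustrative sketch; assumes `ctx` holds a decoded sequence 0):
 * @code
 *   std::vector<uint8_t> buf(lloyal::kv::state_size(ctx, 0));
 *   size_t written = lloyal::kv::state_save(ctx, 0, buf.data(), buf.size());
 *   // ... later ...
 *   size_t read = lloyal::kv::state_load(ctx, 0, buf.data(), written);
 * @endcode
 *
 * @return bytes written, or 0 on failure or empty cache
 */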
inline size_t state_save(llama_context *ctx, llama_seq_id seq, uint8_t *dst,
                         size_t size) {
  if (!ctx || !dst || size == 0) {
    LLOYAL_LOG_DEBUG(
        "[kv::state_save] ERROR: invalid parameters (ctx=%p, dst=%p, size=%zu)",
        ctx, dst, size);
    return 0;
  }

  llama_memory_t mem = llama_get_memory(ctx);
  llama_pos max_pos = llama_memory_seq_pos_max(mem, seq);
  if (max_pos < 0) {
    LLOYAL_LOG_DEBUG("[kv::state_save] WARNING: KV cache is empty (max_pos=%d) "
                     "- skipping save",
                     max_pos);
    return 0;
  }

  size_t written = llama_state_seq_get_data(ctx, dst, size, seq);

  if (written == 0) {
    LLOYAL_LOG_DEBUG("[kv::state_save] Per-sequence save failed for seq=%d "
                     "(possible KV fragmentation)",
                     seq);
    LLOYAL_LOG_DEBUG(
        "[kv::state_save] Attempting global state save (fallback)");
    written = llama_state_get_data(ctx, dst, size);

    if (written > 0) {
      LLOYAL_LOG_DEBUG(
          "[kv::state_save] Global fallback succeeded: %zu bytes (%.1f MB)",
          written, written / 1024.0 / 1024.0);
    } else {
      LLOYAL_LOG_DEBUG(
          "[kv::state_save] ERROR: Both per-sequence and global save failed");
    }
  } else {
    LLOYAL_LOG_DEBUG(
        "[kv::state_save] Per-sequence saved %zu bytes (%.1f MB) for seq=%d",
        written, written / 1024.0 / 1024.0, seq);
  }

  return written;
}

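/**
 * @brief Restore sequence state from buffer.
 *
 * Falls back to a whole-context restore if the per-sequence restore fails.
 *
 * @return bytes read, or 0 on failure
 */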
inline size_t state_load(llama_context *ctx, llama_seq_id seq,
                         const uint8_t *src, size_t size) {
  if (!ctx || !src || size == 0) {
    LLOYAL_LOG_DEBUG(
        "[kv::state_load] ERROR: invalid parameters (ctx=%p, src=%p, size=%zu)",
        ctx, src, size);
    return 0;
  }

  llama_memory_t mem = llama_get_memory(ctx);
  llama_pos max_pos = llama_memory_seq_pos_max(mem, seq);
  if (max_pos < 0) {
    LLOYAL_LOG_DEBUG("[kv::state_load] WARNING: KV cache is empty (max_pos=%d) "
                     "- loading may crash on recurrent models",
                     max_pos);
  }

  size_t read = llama_state_seq_set_data(ctx, src, size, seq);

  if (read == 0) {
    LLOYAL_LOG_DEBUG("[kv::state_load] Per-sequence restore failed for seq=%d "
                     "(possible fragmentation)",
                     seq);
    LLOYAL_LOG_DEBUG(
        "[kv::state_load] Attempting global state restore (fallback)");
    read = llama_state_set_data(ctx, src, size);

    if (read > 0) {
      LLOYAL_LOG_DEBUG(
          "[kv::state_load] Global fallback succeeded: %zu bytes (%.1f MB)",
          read, read / 1024.0 / 1024.0);
    } else {
      LLOYAL_LOG_DEBUG("[kv::state_load] ERROR: Both per-sequence and global "
                       "restore failed");
    }
  } else {
    LLOYAL_LOG_DEBUG(
        "[kv::state_load] Per-sequence loaded %zu bytes (%.1f MB) for seq=%d",
        read, read / 1024.0 / 1024.0, seq);
  }

  return read;
}

// ===== GLOBAL STATE OPERATIONS =====

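/**
 * @brief Get size needed to serialize global state.
 */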
inline size_t global_state_size(llama_context *ctx) {
  if (!ctx) {
    LLOYAL_LOG_DEBUG("[kv::global_state_size] ERROR: null context");
    return 0;
  }

  size_t size = llama_state_get_size(ctx);
  LLOYAL_LOG_DEBUG("[kv::global_state_size] %zu bytes (%.1f MB)", size,
                   size / 1024.0 / 1024.0);
  return size;
}

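/**
 * @brief Save global state to buffer.
 *
 * @return bytes written, or 0 on invalid parameters
 */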
inline size_t global_state_save(llama_context *ctx, uint8_t *dst, size_t size) {
  if (!ctx || !dst || size == 0) {
    LLOYAL_LOG_DEBUG("[kv::global_state_save] ERROR: invalid parameters");
    return 0;
  }

  size_t written = llama_state_get_data(ctx, dst, size);
  LLOYAL_LOG_DEBUG("[kv::global_state_save] %zu bytes written (%.1f MB)",
                   written, written / 1024.0 / 1024.0);
  return written;
}

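/**
 * @brief Restore global state from buffer.
 *
 * @return bytes read, or 0 on invalid parameters
 */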
inline size_t global_state_load(llama_context *ctx, const uint8_t *src,
                                size_t size) {
  if (!ctx || !src || size == 0) {
    LLOYAL_LOG_DEBUG("[kv::global_state_load] ERROR: invalid parameters");
    return 0;
  }

  size_t read = llama_state_set_data(ctx, src, size);
  LLOYAL_LOG_DEBUG("[kv::global_state_load] %zu bytes read (%.1f MB)", read,
                   read / 1024.0 / 1024.0);
  return read;
}

// ===== DIAGNOSTICS =====

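/**
 * @brief Log KV cache build info and current state.
 */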
inline void log_build_info(llama_context *ctx) {
  LLOYAL_LOG_DEBUG(
      "[kv::build_info] ============================================");
  LLOYAL_LOG_DEBUG(
      "[kv::build_info] llama.cpp KV Sequence Operations Configuration");
  LLOYAL_LOG_DEBUG(
      "[kv::build_info] ============================================");
  LLOYAL_LOG_DEBUG("[kv::build_info] Version: b6870");
  LLOYAL_LOG_DEBUG("[kv::build_info] API naming: llama_memory_seq_*");
  LLOYAL_LOG_DEBUG(
      "[kv::build_info] Current MVP: n_seq_max=1 (single sequence only)");

  if (ctx) {
    llama_pos max_pos = pos_max(ctx, 0);
    if (max_pos >= 0) {
      LLOYAL_LOG_DEBUG("[kv::build_info] Current KV cursor (seq 0): %d tokens",
                       max_pos);
    } else {
      LLOYAL_LOG_DEBUG("[kv::build_info] KV cache empty (seq 0)");
    }

    size_t snapshot_size = state_size(ctx, 0);
    if (snapshot_size > 0) {
      LLOYAL_LOG_DEBUG(
          "[kv::build_info] Estimated snapshot size: %zu bytes (%.1f MB)",
          snapshot_size, snapshot_size / 1024.0 / 1024.0);
    }
  }

  LLOYAL_LOG_DEBUG(
      "[kv::build_info] Fragmentation fallback: per-sequence → global state");
  LLOYAL_LOG_DEBUG(
      "[kv::build_info] Critical: Call remove_range() BEFORE llama_decode()");
  LLOYAL_LOG_DEBUG(
      "[kv::build_info] ============================================");
}

// ===== CACHE CLEARING =====

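/**
 * @brief Clear all KV cache (complete reset).
 *
 * Clears both metadata and data buffers.
 *
 * @throws std::runtime_error if ctx is null
 */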
inline void clear_all(llama_context *ctx) {
  if (!ctx) {
    LLOYAL_LOG_DEBUG("[kv::clear_all] ERROR: NULL context");
    throw std::runtime_error("kv::clear_all - NULL context");
  }

  LLOYAL_LOG_DEBUG("[kv::clear_all] Clearing KV cache (metadata + data)");
  llama_memory_clear(llama_get_memory(ctx), true); // true = clear data buffers too
  LLOYAL_LOG_DEBUG("[kv::clear_all] KV cache cleared");
}

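/**
 * @brief Clear KV cache metadata only (fast reset).
 *
 * Keeps the underlying data buffers untouched.
 *
 * @throws std::runtime_error if ctx is null
 */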
inline void clear_metadata(llama_context *ctx) {
  if (!ctx) {
    LLOYAL_LOG_DEBUG("[kv::clear_metadata] ERROR: NULL context");
    throw std::runtime_error("kv::clear_metadata - NULL context");
  }

  LLOYAL_LOG_DEBUG("[kv::clear_metadata] Clearing KV cache metadata only");
  llama_memory_clear(llama_get_memory(ctx), false); // false = keep data buffers
  LLOYAL_LOG_DEBUG("[kv::clear_metadata] KV cache metadata cleared");
}

// ===== CONTEXT COMPRESSION =====

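/**
 * @brief Clear the KV cache and re-decode sink tokens at position 0 followed
 *        by the tail tokens (context compression).
 *
 * @throws std::runtime_error if ctx is null or both token vectors are empty
 */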
inline void clear_and_reseed(llama_context *ctx,
                             const std::vector<llama_token> &original_sinks,
                             const std::vector<llama_token> &tail,
                             int32_t n_batch) {
  if (!ctx) {
    LLOYAL_LOG_DEBUG("[kv::clear_and_reseed] ERROR: null context");
    throw std::runtime_error("kv::clear_and_reseed - NULL context");
  }

  if (original_sinks.empty() && tail.empty()) {
    LLOYAL_LOG_DEBUG("[kv::clear_and_reseed] ERROR: both sinks and tail are empty");
    throw std::runtime_error("kv::clear_and_reseed - no tokens to reseed");
  }

  LLOYAL_LOG_DEBUG("[kv::clear_and_reseed] Starting reseed: %zu sinks + %zu tail = %zu total",
                   original_sinks.size(), tail.size(), original_sinks.size() + tail.size());

  // Get memory handle
  llama_memory_t mem = llama_get_memory(ctx);

  // Log state before clear
  llama_pos max_pos_before = llama_memory_seq_pos_max(mem, 0);
  LLOYAL_LOG_DEBUG("[kv::clear_and_reseed] Before clear: KV cache max_pos=%d", max_pos_before);

  // Clear entire KV cache (simple and reliable)
  llama_memory_clear(mem, true);

  llama_pos max_pos_after_clear = llama_memory_seq_pos_max(mem, 0);
  if (max_pos_after_clear != -1) {
    LLOYAL_LOG_DEBUG("[kv::clear_and_reseed] WARNING: KV cache not empty after clear (max_pos=%d)",
                     max_pos_after_clear);
  }

  // Re-decode sinks at position 0
  if (!original_sinks.empty()) {
    LLOYAL_LOG_DEBUG("[kv::clear_and_reseed] Re-decoding %zu sinks at position 0", original_sinks.size());
    lloyal::decoder::decode_tokens(ctx, original_sinks, 0, n_batch);
  }

  // Re-decode tail at position sinks.size()
  if (!tail.empty()) {
    int32_t tail_start_pos = static_cast<int32_t>(original_sinks.size());
    LLOYAL_LOG_DEBUG("[kv::clear_and_reseed] Re-decoding %zu tail tokens at position %d",
                     tail.size(), tail_start_pos);
    lloyal::decoder::decode_tokens(ctx, tail, tail_start_pos, n_batch);
  }

  // Verify final state
  llama_pos max_pos_after = llama_memory_seq_pos_max(mem, 0);
  int32_t expected_pos = static_cast<int32_t>(original_sinks.size() + tail.size()) - 1;

  LLOYAL_LOG_DEBUG("[kv::clear_and_reseed] After reseed: KV cache max_pos=%d (expected %d)",
                   max_pos_after, expected_pos);

  if (max_pos_after != expected_pos) {
    LLOYAL_LOG_DEBUG("[kv::clear_and_reseed] WARNING: Unexpected final position (got %d, expected %d)",
                     max_pos_after, expected_pos);
  }

  LLOYAL_LOG_DEBUG("[kv::clear_and_reseed] Reseed complete");
}

// ===== FILE PERSISTENCE =====

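/**
 * @brief Data structure returned by read_file.
 */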
struct FileData {
  std::vector<llama_token> tokens; ///< Tokens restored from file
  size_t bytes_read;               ///< Total bytes read from file
};

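/**
 * @brief Write KV state to file with self-describing format.
 *
 * Delegates to llama_state_seq_save_file(), which stores the sequence state
 * together with the prompt tokens that produced it.
 *
 * Example (illustrative sketch; assumes `ctx` and `prompt_tokens` are valid):
 * @code
 *   size_t bytes = lloyal::kv::write_file(ctx, 0, "session.bin", prompt_tokens);
 *   lloyal::kv::FileData restored = lloyal::kv::read_file(ctx, 0, "session.bin");
 * @endcode
 *
 * @return bytes written, or 0 on failure or empty cache
 */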
inline size_t write_file(llama_context *ctx, llama_seq_id seq,
                         const std::string &filepath,
                         const std::vector<llama_token> &tokens) {
  if (!ctx) {
    LLOYAL_LOG_DEBUG("[kv::write_file] ERROR: null context");
    return 0;
  }

  if (filepath.empty()) {
    LLOYAL_LOG_DEBUG("[kv::write_file] ERROR: empty filepath");
    return 0;
  }

  // Guard: Don't write if KV cache is empty
  llama_memory_t mem = llama_get_memory(ctx);
  llama_pos max_pos = llama_memory_seq_pos_max(mem, seq);
  if (max_pos < 0) {
    LLOYAL_LOG_DEBUG(
        "[kv::write_file] WARNING: KV cache is empty - skipping write");
    return 0;
  }

  // Delegate to llama.cpp's session file writer
  // Note: llama.cpp signature is (ctx, filepath, seq_id, tokens, n_tokens)
  size_t bytes = llama_state_seq_save_file(ctx, filepath.c_str(), seq,
                                           tokens.data(), tokens.size());

  if (bytes > 0) {
    LLOYAL_LOG_DEBUG("[kv::write_file] Wrote %s: %zu bytes (%.1f MB), %zu "
                     "tokens",
                     filepath.c_str(), bytes, bytes / 1024.0 / 1024.0,
                     tokens.size());
  } else {
    LLOYAL_LOG_DEBUG("[kv::write_file] FAILED to write %s", filepath.c_str());
  }

  return bytes;
}

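/**
 * @brief Read KV state and prompt tokens from a session file.
 *
 * @return FileData with the restored tokens and total bytes read
 * @throws std::runtime_error on null context, empty filepath, or load failure
 */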
inline FileData read_file(llama_context *ctx, llama_seq_id seq,
                          const std::string &filepath) {
  if (!ctx) {
    throw std::runtime_error("[kv::read_file] null context");
  }

  if (filepath.empty()) {
    throw std::runtime_error("[kv::read_file] empty filepath");
  }

  // Get model's n_ctx to allocate token buffer
  const uint32_t n_ctx = llama_n_ctx(ctx);

  std::vector<llama_token> tokens;
  tokens.resize(n_ctx); // Allocate buffer with capacity

  size_t token_count = 0;
  // Note: llama.cpp signature is (ctx, filepath, seq_id, tokens_out, capacity, count_out)
  size_t bytes =
      llama_state_seq_load_file(ctx, filepath.c_str(), seq, tokens.data(),
                                tokens.size(), &token_count);

  if (bytes == 0) {
    throw std::runtime_error("[kv::read_file] failed to load from " +
                             filepath);
  }

  tokens.resize(token_count);

  LLOYAL_LOG_DEBUG("[kv::read_file] Loaded %s: %zu bytes (%.1f MB), %zu tokens",
                   filepath.c_str(), bytes, bytes / 1024.0 / 1024.0,
                   token_count);

  return FileData{std::move(tokens), bytes};
}

} // namespace lloyal::kv