liblloyal 1.0.0
Branched Inference for llama.cpp
Loading...
Searching...
No Matches
kv.hpp
Go to the documentation of this file.
1#pragma once
2
3// SPDX-License-Identifier: Apache-2.0
4// Copyright 2026 Lloyal Labs
5
6
29#include "common.hpp"
30#include "decode.hpp"
31#include <cassert>
32#include <cstdint>
33#include <llama/llama.h>
34#include <type_traits>
35#include <vector>
36
37namespace lloyal::kv {
38
39// ===== KV CACHE TYPE MAPPING =====
40
41namespace cache_type {
42
45inline ggml_type from_str(const std::string& s) {
46 if (s == "f32") return GGML_TYPE_F32;
47 if (s == "f16") return GGML_TYPE_F16;
48 if (s == "bf16") return GGML_TYPE_BF16;
49 if (s == "q8_0") return GGML_TYPE_Q8_0;
50 if (s == "q4_0") return GGML_TYPE_Q4_0;
51 if (s == "q4_1") return GGML_TYPE_Q4_1;
52 if (s == "iq4_nl") return GGML_TYPE_IQ4_NL;
53 if (s == "q5_0") return GGML_TYPE_Q5_0;
54 if (s == "q5_1") return GGML_TYPE_Q5_1;
55 return GGML_TYPE_COUNT;
56}
57
58} // namespace cache_type
59
60// ===== KV SEQUENCE OPERATIONS =====
61// Thin wrappers — tenancy is built on top of these.
62
78inline bool remove_range(llama_context *ctx, llama_seq_id seq, llama_pos p0,
79 llama_pos p1) {
80 if (!ctx) {
81 LLOYAL_LOG_DEBUG("[kv::remove_range] ERROR: null context");
82 return false;
83 }
84
85 llama_memory_t mem = llama_get_memory(ctx);
86 bool success = llama_memory_seq_rm(mem, seq, p0, p1);
87
88 if (!success) {
89 LLOYAL_LOG_DEBUG("[kv::remove_range] FAILED: seq=%d, p0=%d, p1=%d", seq, p0,
90 p1);
91 LLOYAL_LOG_DEBUG("[kv::remove_range] Guard-rail reminder: Ensure "
92 "remove_range called BEFORE next llama_decode()");
93 } else {
94 LLOYAL_LOG_DEBUG("[kv::remove_range] OK: seq=%d, removed tokens [%d, %d)",
95 seq, p0, p1);
96 }
97
98 return success;
99}
100
111inline llama_pos pos_max(llama_context *ctx, llama_seq_id seq) {
112 if (!ctx) {
113 LLOYAL_LOG_DEBUG("[kv::pos_max] ERROR: null context");
114 return -1;
115 }
116
117 llama_memory_t mem = llama_get_memory(ctx);
118 llama_pos max_pos = llama_memory_seq_pos_max(mem, seq);
119
120 LLOYAL_LOG_DEBUG("[kv::pos_max] seq=%d, max_pos=%d", seq, max_pos);
121 return max_pos;
122}
123
138inline void seq_cp(llama_context *ctx, llama_seq_id src, llama_seq_id dst,
139 llama_pos p0 = 0, llama_pos p1 = -1) {
140 if (!ctx) {
141 LLOYAL_LOG_DEBUG("[kv::seq_cp] ERROR: null context");
142 return;
143 }
144
145 llama_memory_t mem = llama_get_memory(ctx);
146 llama_memory_seq_cp(mem, src, dst, p0, p1);
147
148 LLOYAL_LOG_DEBUG("[kv::seq_cp] Copied seq %d → %d [%d, %d)", src, dst, p0, p1);
149}
150
162inline void seq_keep(llama_context *ctx, llama_seq_id seq) {
163 if (!ctx) {
164 LLOYAL_LOG_DEBUG("[kv::seq_keep] ERROR: null context");
165 return;
166 }
167
168 llama_memory_t mem = llama_get_memory(ctx);
169 llama_memory_seq_keep(mem, seq);
170
171 LLOYAL_LOG_DEBUG("[kv::seq_keep] Kept only seq %d", seq);
172}
173
174// ===== KV TENANCY =====
175
// NO_LEASE is encoded as -1, so the seq_id type must be signed for the
// sentinel to be distinguishable from valid (non-negative) seq_ids.
static_assert(std::is_signed_v<llama_seq_id>,
              "llama_seq_id must be signed for NO_LEASE sentinel");

/// Sentinel value: a branch that currently holds no KV sequence lease.
constexpr llama_seq_id NO_LEASE = static_cast<llama_seq_id>(-1);
208
209namespace tenancy {
210
/// Tenancy bookkeeping: tracks which seq_ids are vacant vs leased.
struct State {
  llama_context* ctx = nullptr;      // context used for KV operations
  llama_seq_id n_seq_max = 0;        // total seq_id capacity
  std::vector<llama_seq_id> vacant;  // available seq_ids (LIFO stack)
  std::vector<uint8_t> leased;       // leased[seq] == 1 iff seq is issued
};
223
234inline State init(llama_context* ctx, llama_seq_id n_seq_max) {
235 State s;
236 s.ctx = ctx;
237 s.n_seq_max = n_seq_max;
238 s.leased.resize(static_cast<size_t>(n_seq_max), 0);
239 s.vacant.reserve(static_cast<size_t>(n_seq_max));
240 for (llama_seq_id i = n_seq_max; i-- > 0; ) {
241 s.vacant.push_back(i);
242 }
243 return s;
244}
245
256inline llama_seq_id acquire(State& s) {
257 if (s.vacant.empty()) return NO_LEASE;
258 llama_seq_id seq = s.vacant.back();
259 s.vacant.pop_back();
260 s.leased[static_cast<size_t>(seq)] = 1;
261 return seq;
262}
263
275inline void release(State& s, llama_seq_id seq) {
276 assert(seq >= 0 && seq < s.n_seq_max && "release: seq out of range");
277 assert(s.leased[static_cast<size_t>(seq)] && "release: seq not leased");
278 s.leased[static_cast<size_t>(seq)] = 0;
279 s.vacant.push_back(seq);
280}
281
297inline void evict(State& s, llama_seq_id seq) {
298 assert(seq >= 0 && seq < s.n_seq_max && "evict: seq out of range");
299 assert(s.leased[static_cast<size_t>(seq)] && "evict: seq not leased");
300 remove_range(s.ctx, seq, 0, -1);
301 release(s, seq);
302}
303
/// @brief Nuclear retain: keep exactly one sequence, rebuild vacancy from
///        scratch. Every other sequence loses its KV tags (via seq_keep)
///        and its lease.
/// @param keep  The one seq_id that must stay leased and keep its KV state.
inline void retain(State& s, llama_seq_id keep) {
  assert(keep >= 0 && keep < s.n_seq_max && "retain: keep seq out of range");
  assert(s.leased[static_cast<size_t>(keep)] && "retain: keep seq not leased");
  seq_keep(s.ctx, keep);
#ifndef NDEBUG
  // Debug-only verification: every sequence other than `keep` must now be
  // empty (pos_max < 0), otherwise seq_keep left stale tags behind.
  for (llama_seq_id i = 0; i < s.n_seq_max; ++i) {
    if (i != keep) assert(pos_max(s.ctx, i) < 0 && "retain: seq_keep left dirty tags");
  }
#endif
  s.vacant.clear();
  // Reverse order matches init() — LIFO means lowest seq_id acquired first
  for (llama_seq_id i = s.n_seq_max; i-- > 0; ) {
    if (i == keep) {
      s.leased[static_cast<size_t>(i)] = 1;
    } else {
      s.leased[static_cast<size_t>(i)] = 0;
      s.vacant.push_back(i);
    }
  }
}
340
351inline void evict_all(State& s) {
352 for (llama_seq_id i = 0; i < s.n_seq_max; ++i) {
353 if (s.leased[static_cast<size_t>(i)]) {
354 evict(s, i);
355 }
356 }
357}
358
365inline size_t available(const State& s) {
366 return s.vacant.size();
367}
368
369} // namespace tenancy
370
// end of tenancy group
372
373// ===== STATE SNAPSHOT OPERATIONS =====
374
388inline size_t state_size(llama_context *ctx, llama_seq_id seq) {
389 if (!ctx) {
390 LLOYAL_LOG_DEBUG("[kv::state_size] ERROR: null context");
391 return 0;
392 }
393
394 llama_memory_t mem = llama_get_memory(ctx);
395 llama_pos max_pos = llama_memory_seq_pos_max(mem, seq);
396 if (max_pos < 0) {
397 LLOYAL_LOG_DEBUG("[kv::state_size] WARNING: KV cache is empty (max_pos=%d) "
398 "- returning 0",
399 max_pos);
400 return 0;
401 }
402
403 size_t size = llama_state_seq_get_size(ctx, seq);
404
405 if (size == 0) {
407 "[kv::state_size] Per-sequence size query failed for seq=%d", seq);
409 "[kv::state_size] Attempting global state size (fallback)");
410 size = llama_state_get_size(ctx);
411
412 if (size > 0) {
413 LLOYAL_LOG_DEBUG("[kv::state_size] Global fallback size: %zu bytes",
414 size);
415 } else {
416 LLOYAL_LOG_DEBUG("[kv::state_size] ERROR: Both per-sequence and global "
417 "size queries failed");
418 }
419 } else {
421 "[kv::state_size] Per-sequence size for seq=%d: %zu bytes (%.1f MB)",
422 seq, size, size / 1024.0 / 1024.0);
423 }
424
425 return size;
426}
427
443inline size_t state_save(llama_context *ctx, llama_seq_id seq, uint8_t *dst,
444 size_t size) {
445 if (!ctx || !dst || size == 0) {
447 "[kv::state_save] ERROR: invalid parameters (ctx=%p, dst=%p, size=%zu)",
448 ctx, dst, size);
449 return 0;
450 }
451
452 llama_memory_t mem = llama_get_memory(ctx);
453 llama_pos max_pos = llama_memory_seq_pos_max(mem, seq);
454 if (max_pos < 0) {
455 LLOYAL_LOG_DEBUG("[kv::state_save] WARNING: KV cache is empty (max_pos=%d) "
456 "- skipping save",
457 max_pos);
458 return 0;
459 }
460
461 size_t written = llama_state_seq_get_data(ctx, dst, size, seq);
462
463 if (written == 0) {
464 LLOYAL_LOG_DEBUG("[kv::state_save] Per-sequence save failed for seq=%d "
465 "(possible KV fragmentation)",
466 seq);
468 "[kv::state_save] Attempting global state save (fallback)");
469 written = llama_state_get_data(ctx, dst, size);
470
471 if (written > 0) {
473 "[kv::state_save] Global fallback succeeded: %zu bytes (%.1f MB)",
474 written, written / 1024.0 / 1024.0);
475 } else {
477 "[kv::state_save] ERROR: Both per-sequence and global save failed");
478 }
479 } else {
481 "[kv::state_save] Per-sequence saved %zu bytes (%.1f MB) for seq=%d",
482 written, written / 1024.0 / 1024.0, seq);
483 }
484
485 return written;
486}
487
504inline size_t state_load(llama_context *ctx, llama_seq_id seq,
505 const uint8_t *src, size_t size) {
506 if (!ctx || !src || size == 0) {
508 "[kv::state_load] ERROR: invalid parameters (ctx=%p, src=%p, size=%zu)",
509 ctx, src, size);
510 return 0;
511 }
512
513 llama_memory_t mem = llama_get_memory(ctx);
514 llama_pos max_pos = llama_memory_seq_pos_max(mem, seq);
515 if (max_pos < 0) {
516 LLOYAL_LOG_DEBUG("[kv::state_load] WARNING: KV cache is empty (max_pos=%d) "
517 "- loading may crash on recurrent models",
518 max_pos);
519 }
520
521 size_t read = llama_state_seq_set_data(ctx, src, size, seq);
522
523 if (read == 0) {
524 LLOYAL_LOG_DEBUG("[kv::state_load] Per-sequence restore failed for seq=%d "
525 "(possible fragmentation)",
526 seq);
528 "[kv::state_load] Attempting global state restore (fallback)");
529 read = llama_state_set_data(ctx, src, size);
530
531 if (read > 0) {
533 "[kv::state_load] Global fallback succeeded: %zu bytes (%.1f MB)",
534 read, read / 1024.0 / 1024.0);
535 } else {
536 LLOYAL_LOG_DEBUG("[kv::state_load] ERROR: Both per-sequence and global "
537 "restore failed");
538 }
539 } else {
541 "[kv::state_load] Per-sequence loaded %zu bytes (%.1f MB) for seq=%d",
542 read, read / 1024.0 / 1024.0, seq);
543 }
544
545 return read;
546}
547
548// ===== GLOBAL STATE OPERATIONS =====
549
559inline size_t global_state_size(llama_context *ctx) {
560 if (!ctx) {
561 LLOYAL_LOG_DEBUG("[kv::global_state_size] ERROR: null context");
562 return 0;
563 }
564
565 size_t size = llama_state_get_size(ctx);
566 LLOYAL_LOG_DEBUG("[kv::global_state_size] %zu bytes (%.1f MB)", size,
567 size / 1024.0 / 1024.0);
568 return size;
569}
570
581inline size_t global_state_save(llama_context *ctx, uint8_t *dst, size_t size) {
582 if (!ctx || !dst || size == 0) {
583 LLOYAL_LOG_DEBUG("[kv::global_state_save] ERROR: invalid parameters");
584 return 0;
585 }
586
587 size_t written = llama_state_get_data(ctx, dst, size);
588 LLOYAL_LOG_DEBUG("[kv::global_state_save] %zu bytes written (%.1f MB)",
589 written, written / 1024.0 / 1024.0);
590 return written;
591}
592
603inline size_t global_state_load(llama_context *ctx, const uint8_t *src,
604 size_t size) {
605 if (!ctx || !src || size == 0) {
606 LLOYAL_LOG_DEBUG("[kv::global_state_load] ERROR: invalid parameters");
607 return 0;
608 }
609
610 size_t read = llama_state_set_data(ctx, src, size);
611 LLOYAL_LOG_DEBUG("[kv::global_state_load] %zu bytes read (%.1f MB)", read,
612 read / 1024.0 / 1024.0);
613 return read;
614}
615
616// ===== DIAGNOSTICS =====
617
628inline void log_build_info(llama_context *ctx) {
630 "[kv::build_info] ============================================");
632 "[kv::build_info] llama.cpp KV Sequence Operations Configuration");
634 "[kv::build_info] ============================================");
635 LLOYAL_LOG_DEBUG("[kv::build_info] Version: b8087");
636 LLOYAL_LOG_DEBUG("[kv::build_info] API naming: llama_memory_seq_*");
638 "[kv::build_info] Current MVP: n_seq_max=1 (single sequence only)");
639
640 if (ctx) {
641 llama_pos max_pos = pos_max(ctx, 0);
642 if (max_pos >= 0) {
643 LLOYAL_LOG_DEBUG("[kv::build_info] Current KV cursor (seq 0): %d tokens",
644 max_pos);
645 } else {
646 LLOYAL_LOG_DEBUG("[kv::build_info] KV cache empty (seq 0)");
647 }
648
649 size_t snapshot_size = state_size(ctx, 0);
650 if (snapshot_size > 0) {
652 "[kv::build_info] Estimated snapshot size: %zu bytes (%.1f MB)",
653 snapshot_size, snapshot_size / 1024.0 / 1024.0);
654 }
655 }
656
658 "[kv::build_info] Fragmentation fallback: per-sequence → global state");
660 "[kv::build_info] Critical: Call remove_range() BEFORE llama_decode()");
662 "[kv::build_info] ============================================");
663}
664
665// ===== CACHE CLEARING =====
666
683inline void clear_all(llama_context *ctx) {
684 if (!ctx) {
685 LLOYAL_LOG_DEBUG("[kv::clear_all] ERROR: NULL context");
686 throw std::runtime_error("kv::clear_all - NULL context");
687 }
688
689 LLOYAL_LOG_DEBUG("[kv::clear_all] Clearing KV cache (metadata + data)");
690 llama_memory_clear(llama_get_memory(ctx), true); // true = clear data buffers too
691 LLOYAL_LOG_DEBUG("[kv::clear_all] KV cache cleared");
692}
693
708inline void clear_metadata(llama_context *ctx) {
709 if (!ctx) {
710 LLOYAL_LOG_DEBUG("[kv::clear_metadata] ERROR: NULL context");
711 throw std::runtime_error("kv::clear_metadata - NULL context");
712 }
713
714 LLOYAL_LOG_DEBUG("[kv::clear_metadata] Clearing KV cache metadata only");
715 llama_memory_clear(llama_get_memory(ctx), false); // false = keep data buffers
716 LLOYAL_LOG_DEBUG("[kv::clear_metadata] KV cache metadata cleared");
717}
718
719// ===== CONTEXT COMPRESSION =====
720
/// @brief Context compression: wipe the entire KV cache, then re-decode
///        attention sinks (at position 0) followed by the recent tail, so
///        generation can continue within a smaller window.
/// @param original_sinks  Tokens re-decoded starting at position 0.
/// @param tail            Tokens re-decoded starting at position sinks.size().
/// @param n_batch         Batch size passed to decode::many for chunking.
/// @throws std::runtime_error on null ctx, no tokens at all, or decode failure.
inline void clear_and_reseed(llama_context *ctx,
                             const std::vector<llama_token> &original_sinks,
                             const std::vector<llama_token> &tail,
                             int32_t n_batch) {
  if (!ctx) {
    LLOYAL_LOG_DEBUG("[kv::clear_and_reseed] ERROR: null context");
    throw std::runtime_error("kv::clear_and_reseed - NULL context");
  }

  if (original_sinks.empty() && tail.empty()) {
    LLOYAL_LOG_DEBUG("[kv::clear_and_reseed] ERROR: both sinks and tail are empty");
    throw std::runtime_error("kv::clear_and_reseed - no tokens to reseed");
  }

  LLOYAL_LOG_DEBUG("[kv::clear_and_reseed] Starting reseed: %zu sinks + %zu tail = %zu total",
                   original_sinks.size(), tail.size(), original_sinks.size() + tail.size());

  // Get memory handle
  llama_memory_t mem = llama_get_memory(ctx);

  // Log state before clear
  llama_pos max_pos_before = llama_memory_seq_pos_max(mem, 0);
  LLOYAL_LOG_DEBUG("[kv::clear_and_reseed] Before clear: KV cache max_pos=%d", max_pos_before);

  // Clear entire KV cache (simple and reliable)
  llama_memory_clear(mem, true);

  // Sanity check: after a full clear, seq 0 should report an empty cache (-1).
  llama_pos max_pos_after_clear = llama_memory_seq_pos_max(mem, 0);
  if (max_pos_after_clear != -1) {
    LLOYAL_LOG_DEBUG("[kv::clear_and_reseed] WARNING: KV cache not empty after clear (max_pos=%d)",
                     max_pos_after_clear);
  }

  // Re-decode sinks at position 0
  if (!original_sinks.empty()) {
    LLOYAL_LOG_DEBUG("[kv::clear_and_reseed] Re-decoding %zu sinks at position 0", original_sinks.size());
    if (lloyal::decode::many(ctx, original_sinks, 0, n_batch) != 0) {
      throw std::runtime_error("kv::clear_and_reseed - llama_decode failed on sinks");
    }
  }

  // Re-decode tail at position sinks.size()
  if (!tail.empty()) {
    int32_t tail_start_pos = static_cast<int32_t>(original_sinks.size());
    LLOYAL_LOG_DEBUG("[kv::clear_and_reseed] Re-decoding %zu tail tokens at position %d",
                     tail.size(), tail_start_pos);
    if (lloyal::decode::many(ctx, tail, tail_start_pos, n_batch) != 0) {
      throw std::runtime_error("kv::clear_and_reseed - llama_decode failed on tail");
    }
  }

  // Verify final state: max_pos is 0-based, hence total token count minus one.
  llama_pos max_pos_after = llama_memory_seq_pos_max(mem, 0);
  int32_t expected_pos = static_cast<int32_t>(original_sinks.size() + tail.size()) - 1;

  LLOYAL_LOG_DEBUG("[kv::clear_and_reseed] After reseed: KV cache max_pos=%d (expected %d)",
                   max_pos_after, expected_pos);

  if (max_pos_after != expected_pos) {
    LLOYAL_LOG_DEBUG("[kv::clear_and_reseed] WARNING: Unexpected final position (got %d, expected %d)",
                     max_pos_after, expected_pos);
  }

  LLOYAL_LOG_DEBUG("[kv::clear_and_reseed] Reseed complete");
}
818
819// ===== FILE PERSISTENCE =====
820
/// Result of read_file: restored token history plus bytes consumed.
struct FileData {
  std::vector<llama_token> tokens;  // tokens restored from the session file
  size_t bytes_read;                // total bytes read from the file
};
830
852inline size_t write_file(llama_context *ctx, llama_seq_id seq,
853 const std::string &filepath,
854 const std::vector<llama_token> &tokens) {
855 if (!ctx) {
856 LLOYAL_LOG_DEBUG("[kv::write_file] ERROR: null context");
857 return 0;
858 }
859
860 if (filepath.empty()) {
861 LLOYAL_LOG_DEBUG("[kv::write_file] ERROR: empty filepath");
862 return 0;
863 }
864
865 // Guard: Don't write if KV cache is empty
866 llama_memory_t mem = llama_get_memory(ctx);
867 llama_pos max_pos = llama_memory_seq_pos_max(mem, seq);
868 if (max_pos < 0) {
870 "[kv::write_file] WARNING: KV cache is empty - skipping write");
871 return 0;
872 }
873
874 // Delegate to llama.cpp's session file writer
875 // Note: llama.cpp signature is (ctx, filepath, seq_id, tokens, n_tokens)
876 size_t bytes = llama_state_seq_save_file(ctx, filepath.c_str(), seq,
877 tokens.data(), tokens.size());
878
879 if (bytes > 0) {
880 LLOYAL_LOG_DEBUG("[kv::write_file] Wrote %s: %zu bytes (%.1f MB), %zu "
881 "tokens",
882 filepath.c_str(), bytes, bytes / 1024.0 / 1024.0,
883 tokens.size());
884 } else {
885 LLOYAL_LOG_DEBUG("[kv::write_file] FAILED to write %s", filepath.c_str());
886 }
887
888 return bytes;
889}
890
911inline FileData read_file(llama_context *ctx, llama_seq_id seq,
912 const std::string &filepath) {
913 if (!ctx) {
914 throw std::runtime_error("[kv::read_file] null context");
915 }
916
917 if (filepath.empty()) {
918 throw std::runtime_error("[kv::read_file] empty filepath");
919 }
920
921 // Get model's n_ctx to allocate token buffer
922 const uint32_t n_ctx = llama_n_ctx(ctx);
923
924 std::vector<llama_token> tokens;
925 tokens.resize(n_ctx); // Allocate buffer with capacity
926
927 size_t token_count = 0;
928 // Note: llama.cpp signature is (ctx, filepath, seq_id, tokens_out, capacity, count_out)
929 size_t bytes =
930 llama_state_seq_load_file(ctx, filepath.c_str(), seq, tokens.data(),
931 tokens.size(), &token_count);
932
933 if (bytes == 0) {
934 throw std::runtime_error("[kv::read_file] failed to load from " +
935 filepath);
936 }
937
938 tokens.resize(token_count);
939
940 LLOYAL_LOG_DEBUG("[kv::read_file] Loaded %s: %zu bytes (%.1f MB), %zu tokens",
941 filepath.c_str(), bytes, bytes / 1024.0 / 1024.0,
942 token_count);
943
944 return FileData{std::move(tokens), bytes};
945}
946
947} // namespace lloyal::kv
#define LLOYAL_LOG_DEBUG(...)
liblloyal - Common definitions and logging
Definition common.hpp:48
Batch Decoding Operations.
constexpr llama_seq_id NO_LEASE
Sentinel value indicating a branch has no KV residency.
Definition kv.hpp:207
int many(llama_context *ctx, const llama_token *tokens, int32_t n_tokens, int32_t n_past, int32_t n_batch, llama_seq_id seq_id=0)
Decode multiple tokens into the KV cache with auto-chunking.
Definition decode.hpp:125
ggml_type from_str(const std::string &s)
Map string name to ggml_type enum (matches llama.cpp CLI -ctk/-ctv flags).
Definition kv.hpp:45
void evict_all(State &s)
Evict every leased seq_id.
Definition kv.hpp:351
llama_seq_id acquire(State &s)
Acquire a seq_id from the vacant pool.
Definition kv.hpp:256
size_t available(const State &s)
Number of vacant seq_ids available for acquisition.
Definition kv.hpp:365
void evict(State &s, llama_seq_id seq)
Evict a seq_id — strip all KV tags then release.
Definition kv.hpp:297
void retain(State &s, llama_seq_id keep)
Nuclear retain — keep one seq, rebuild vacancy from scratch.
Definition kv.hpp:320
State init(llama_context *ctx, llama_seq_id n_seq_max)
Initialize tenancy with all seq_ids vacant.
Definition kv.hpp:234
void release(State &s, llama_seq_id seq)
Release a seq_id back to vacant — bookkeeping only, no KV calls.
Definition kv.hpp:275
void seq_keep(llama_context *ctx, llama_seq_id seq)
Keep only one sequence, removing all others.
Definition kv.hpp:162
FileData read_file(llama_context *ctx, llama_seq_id seq, const std::string &filepath)
Definition kv.hpp:911
void log_build_info(llama_context *ctx)
Log KV cache build info and current state.
Definition kv.hpp:628
size_t state_size(llama_context *ctx, llama_seq_id seq)
Get size needed to serialize sequence state.
Definition kv.hpp:388
void seq_cp(llama_context *ctx, llama_seq_id src, llama_seq_id dst, llama_pos p0=0, llama_pos p1=-1)
Copy KV cache from one sequence to another.
Definition kv.hpp:138
void clear_and_reseed(llama_context *ctx, const std::vector< llama_token > &original_sinks, const std::vector< llama_token > &tail, int32_t n_batch)
Definition kv.hpp:753
size_t state_save(llama_context *ctx, llama_seq_id seq, uint8_t *dst, size_t size)
Save sequence state to buffer.
Definition kv.hpp:443
void clear_metadata(llama_context *ctx)
Clear KV cache metadata only (fast reset)
Definition kv.hpp:708
size_t global_state_load(llama_context *ctx, const uint8_t *src, size_t size)
Restore global state from buffer.
Definition kv.hpp:603
size_t state_load(llama_context *ctx, llama_seq_id seq, const uint8_t *src, size_t size)
Restore sequence state from buffer.
Definition kv.hpp:504
llama_pos pos_max(llama_context *ctx, llama_seq_id seq)
Get maximum position in KV cache sequence.
Definition kv.hpp:111
void clear_all(llama_context *ctx)
Clear all KV cache (complete reset)
Definition kv.hpp:683
size_t global_state_size(llama_context *ctx)
Get size needed to serialize global state.
Definition kv.hpp:559
bool remove_range(llama_context *ctx, llama_seq_id seq, llama_pos p0, llama_pos p1)
Remove token range from KV cache sequence.
Definition kv.hpp:78
size_t write_file(llama_context *ctx, llama_seq_id seq, const std::string &filepath, const std::vector< llama_token > &tokens)
Write KV state to file with self-describing format.
Definition kv.hpp:852
size_t global_state_save(llama_context *ctx, uint8_t *dst, size_t size)
Save global state to buffer.
Definition kv.hpp:581
Data structure returned by read_file.
Definition kv.hpp:826
std::vector< llama_token > tokens
Tokens restored from file.
Definition kv.hpp:827
size_t bytes_read
Total bytes read from file.
Definition kv.hpp:828
Tenancy state — tracks seq_id vacancy and leases.
Definition kv.hpp:217
llama_context * ctx
Context for KV operations (nullptr after drain)
Definition kv.hpp:218
std::vector< llama_seq_id > vacant
Available seq_ids (LIFO stack)
Definition kv.hpp:220
std::vector< uint8_t > leased
Bitmap: leased[seq] = 1 if issued.
Definition kv.hpp:221
llama_seq_id n_seq_max
Total seq_id capacity (from llama_n_seq_max)
Definition kv.hpp:219