liblloyal 1.0.0
Branched Inference for llama.cpp
Loading...
Searching...
No Matches
kv.hpp
Go to the documentation of this file.
1#pragma once
2
3// SPDX-License-Identifier: Apache-2.0
4// Copyright 2026 Lloyal Labs
5
#include "common.hpp"
#include "decode.hpp"
#include <cassert>
#include <cstdint>
#include <llama/llama.h>
#include <stdexcept>
#include <string>
#include <type_traits>
#include <vector>
35
36namespace lloyal::kv {
37
38// ===== KV CACHE TYPE MAPPING =====
39
40namespace cache_type {
41
44inline ggml_type from_str(const std::string& s) {
45 if (s == "f32") return GGML_TYPE_F32;
46 if (s == "f16") return GGML_TYPE_F16;
47 if (s == "bf16") return GGML_TYPE_BF16;
48 if (s == "q8_0") return GGML_TYPE_Q8_0;
49 if (s == "q4_0") return GGML_TYPE_Q4_0;
50 if (s == "q4_1") return GGML_TYPE_Q4_1;
51 if (s == "iq4_nl") return GGML_TYPE_IQ4_NL;
52 if (s == "q5_0") return GGML_TYPE_Q5_0;
53 if (s == "q5_1") return GGML_TYPE_Q5_1;
54 return GGML_TYPE_COUNT;
55}
56
57} // namespace cache_type
58
59// ===== KV SEQUENCE OPERATIONS =====
60// Thin wrappers — tenancy is built on top of these.
61
77inline bool remove_range(llama_context *ctx, llama_seq_id seq, llama_pos p0,
78 llama_pos p1) {
79 if (!ctx) {
80 LLOYAL_LOG_DEBUG("[kv::remove_range] ERROR: null context");
81 return false;
82 }
83
84 llama_memory_t mem = llama_get_memory(ctx);
85 bool success = llama_memory_seq_rm(mem, seq, p0, p1);
86
87 if (!success) {
88 LLOYAL_LOG_DEBUG("[kv::remove_range] FAILED: seq=%d, p0=%d, p1=%d", seq, p0,
89 p1);
90 LLOYAL_LOG_DEBUG("[kv::remove_range] Guard-rail reminder: Ensure "
91 "remove_range called BEFORE next llama_decode()");
92 } else {
93 LLOYAL_LOG_DEBUG("[kv::remove_range] OK: seq=%d, removed tokens [%d, %d)",
94 seq, p0, p1);
95 }
96
97 return success;
98}
99
110inline llama_pos pos_max(llama_context *ctx, llama_seq_id seq) {
111 if (!ctx) {
112 LLOYAL_LOG_DEBUG("[kv::pos_max] ERROR: null context");
113 return -1;
114 }
115
116 llama_memory_t mem = llama_get_memory(ctx);
117 llama_pos max_pos = llama_memory_seq_pos_max(mem, seq);
118
119 LLOYAL_LOG_DEBUG("[kv::pos_max] seq=%d, max_pos=%d", seq, max_pos);
120 return max_pos;
121}
122
137inline void seq_cp(llama_context *ctx, llama_seq_id src, llama_seq_id dst,
138 llama_pos p0 = 0, llama_pos p1 = -1) {
139 if (!ctx) {
140 LLOYAL_LOG_DEBUG("[kv::seq_cp] ERROR: null context");
141 return;
142 }
143
144 llama_memory_t mem = llama_get_memory(ctx);
145 llama_memory_seq_cp(mem, src, dst, p0, p1);
146
147 LLOYAL_LOG_DEBUG("[kv::seq_cp] Copied seq %d → %d [%d, %d)", src, dst, p0, p1);
148}
149
161inline void seq_keep(llama_context *ctx, llama_seq_id seq) {
162 if (!ctx) {
163 LLOYAL_LOG_DEBUG("[kv::seq_keep] ERROR: null context");
164 return;
165 }
166
167 llama_memory_t mem = llama_get_memory(ctx);
168 llama_memory_seq_keep(mem, seq);
169
170 LLOYAL_LOG_DEBUG("[kv::seq_keep] Kept only seq %d", seq);
171}
172
173// ===== KV TENANCY =====
174
// NO_LEASE is -1, so llama_seq_id must be a signed type for the sentinel
// to be representable and distinct from every valid (non-negative) seq id.
static_assert(std::is_signed_v<llama_seq_id>,
              "llama_seq_id must be signed for NO_LEASE sentinel");

/// Sentinel value indicating a branch has no KV residency (no leased seq id).
constexpr llama_seq_id NO_LEASE = static_cast<llama_seq_id>(-1);
207
208namespace tenancy {
209
/// Tenancy state — tracks seq_id vacancy and leases.
struct State {
  llama_context* ctx = nullptr;      ///< Context for KV operations (nullptr after drain)
  llama_seq_id n_seq_max = 0;        ///< Total seq_id capacity (from llama_n_seq_max)
  std::vector<llama_seq_id> vacant;  ///< Available seq_ids (LIFO stack)
  std::vector<uint8_t> leased;       ///< Bitmap: leased[seq] = 1 if issued
};
222
233inline State init(llama_context* ctx, llama_seq_id n_seq_max) {
234 State s;
235 s.ctx = ctx;
236 s.n_seq_max = n_seq_max;
237 s.leased.resize(static_cast<size_t>(n_seq_max), 0);
238 s.vacant.reserve(static_cast<size_t>(n_seq_max));
239 for (llama_seq_id i = n_seq_max; i-- > 0; ) {
240 s.vacant.push_back(i);
241 }
242 return s;
243}
244
255inline llama_seq_id acquire(State& s) {
256 if (s.vacant.empty()) return NO_LEASE;
257 llama_seq_id seq = s.vacant.back();
258 s.vacant.pop_back();
259 s.leased[static_cast<size_t>(seq)] = 1;
260 return seq;
261}
262
274inline void release(State& s, llama_seq_id seq) {
275 assert(seq >= 0 && seq < s.n_seq_max && "release: seq out of range");
276 assert(s.leased[static_cast<size_t>(seq)] && "release: seq not leased");
277 s.leased[static_cast<size_t>(seq)] = 0;
278 s.vacant.push_back(seq);
279}
280
296inline void evict(State& s, llama_seq_id seq) {
297 assert(seq >= 0 && seq < s.n_seq_max && "evict: seq out of range");
298 assert(s.leased[static_cast<size_t>(seq)] && "evict: seq not leased");
299 remove_range(s.ctx, seq, 0, -1);
300 release(s, seq);
301}
302
319inline void retain(State& s, llama_seq_id keep) {
320 assert(keep >= 0 && keep < s.n_seq_max && "retain: keep seq out of range");
321 assert(s.leased[static_cast<size_t>(keep)] && "retain: keep seq not leased");
322 seq_keep(s.ctx, keep);
323#ifndef NDEBUG
324 for (llama_seq_id i = 0; i < s.n_seq_max; ++i) {
325 if (i != keep) assert(pos_max(s.ctx, i) < 0 && "retain: seq_keep left dirty tags");
326 }
327#endif
328 s.vacant.clear();
329 // Reverse order matches init() — LIFO means lowest seq_id acquired first
330 for (llama_seq_id i = s.n_seq_max; i-- > 0; ) {
331 if (i == keep) {
332 s.leased[static_cast<size_t>(i)] = 1;
333 } else {
334 s.leased[static_cast<size_t>(i)] = 0;
335 s.vacant.push_back(i);
336 }
337 }
338}
339
350inline void evict_all(State& s) {
351 for (llama_seq_id i = 0; i < s.n_seq_max; ++i) {
352 if (s.leased[static_cast<size_t>(i)]) {
353 evict(s, i);
354 }
355 }
356}
357
364inline size_t available(const State& s) {
365 return s.vacant.size();
366}
367
368} // namespace tenancy
369
// end of tenancy group
371
372// ===== STATE SNAPSHOT OPERATIONS =====
373
387inline size_t state_size(llama_context *ctx, llama_seq_id seq) {
388 if (!ctx) {
389 LLOYAL_LOG_DEBUG("[kv::state_size] ERROR: null context");
390 return 0;
391 }
392
393 llama_memory_t mem = llama_get_memory(ctx);
394 llama_pos max_pos = llama_memory_seq_pos_max(mem, seq);
395 if (max_pos < 0) {
396 LLOYAL_LOG_DEBUG("[kv::state_size] WARNING: KV cache is empty (max_pos=%d) "
397 "- returning 0",
398 max_pos);
399 return 0;
400 }
401
402 size_t size = llama_state_seq_get_size(ctx, seq);
403
404 if (size == 0) {
406 "[kv::state_size] Per-sequence size query failed for seq=%d", seq);
408 "[kv::state_size] Attempting global state size (fallback)");
409 size = llama_state_get_size(ctx);
410
411 if (size > 0) {
412 LLOYAL_LOG_DEBUG("[kv::state_size] Global fallback size: %zu bytes",
413 size);
414 } else {
415 LLOYAL_LOG_DEBUG("[kv::state_size] ERROR: Both per-sequence and global "
416 "size queries failed");
417 }
418 } else {
420 "[kv::state_size] Per-sequence size for seq=%d: %zu bytes (%.1f MB)",
421 seq, size, size / 1024.0 / 1024.0);
422 }
423
424 return size;
425}
426
442inline size_t state_save(llama_context *ctx, llama_seq_id seq, uint8_t *dst,
443 size_t size) {
444 if (!ctx || !dst || size == 0) {
446 "[kv::state_save] ERROR: invalid parameters (ctx=%p, dst=%p, size=%zu)",
447 ctx, dst, size);
448 return 0;
449 }
450
451 llama_memory_t mem = llama_get_memory(ctx);
452 llama_pos max_pos = llama_memory_seq_pos_max(mem, seq);
453 if (max_pos < 0) {
454 LLOYAL_LOG_DEBUG("[kv::state_save] WARNING: KV cache is empty (max_pos=%d) "
455 "- skipping save",
456 max_pos);
457 return 0;
458 }
459
460 size_t written = llama_state_seq_get_data(ctx, dst, size, seq);
461
462 if (written == 0) {
463 LLOYAL_LOG_DEBUG("[kv::state_save] Per-sequence save failed for seq=%d "
464 "(possible KV fragmentation)",
465 seq);
467 "[kv::state_save] Attempting global state save (fallback)");
468 written = llama_state_get_data(ctx, dst, size);
469
470 if (written > 0) {
472 "[kv::state_save] Global fallback succeeded: %zu bytes (%.1f MB)",
473 written, written / 1024.0 / 1024.0);
474 } else {
476 "[kv::state_save] ERROR: Both per-sequence and global save failed");
477 }
478 } else {
480 "[kv::state_save] Per-sequence saved %zu bytes (%.1f MB) for seq=%d",
481 written, written / 1024.0 / 1024.0, seq);
482 }
483
484 return written;
485}
486
503inline size_t state_load(llama_context *ctx, llama_seq_id seq,
504 const uint8_t *src, size_t size) {
505 if (!ctx || !src || size == 0) {
507 "[kv::state_load] ERROR: invalid parameters (ctx=%p, src=%p, size=%zu)",
508 ctx, src, size);
509 return 0;
510 }
511
512 llama_memory_t mem = llama_get_memory(ctx);
513 llama_pos max_pos = llama_memory_seq_pos_max(mem, seq);
514 if (max_pos < 0) {
515 LLOYAL_LOG_DEBUG("[kv::state_load] WARNING: KV cache is empty (max_pos=%d) "
516 "- loading may crash on recurrent models",
517 max_pos);
518 }
519
520 size_t read = llama_state_seq_set_data(ctx, src, size, seq);
521
522 if (read == 0) {
523 LLOYAL_LOG_DEBUG("[kv::state_load] Per-sequence restore failed for seq=%d "
524 "(possible fragmentation)",
525 seq);
527 "[kv::state_load] Attempting global state restore (fallback)");
528 read = llama_state_set_data(ctx, src, size);
529
530 if (read > 0) {
532 "[kv::state_load] Global fallback succeeded: %zu bytes (%.1f MB)",
533 read, read / 1024.0 / 1024.0);
534 } else {
535 LLOYAL_LOG_DEBUG("[kv::state_load] ERROR: Both per-sequence and global "
536 "restore failed");
537 }
538 } else {
540 "[kv::state_load] Per-sequence loaded %zu bytes (%.1f MB) for seq=%d",
541 read, read / 1024.0 / 1024.0, seq);
542 }
543
544 return read;
545}
546
547// ===== GLOBAL STATE OPERATIONS =====
548
558inline size_t global_state_size(llama_context *ctx) {
559 if (!ctx) {
560 LLOYAL_LOG_DEBUG("[kv::global_state_size] ERROR: null context");
561 return 0;
562 }
563
564 size_t size = llama_state_get_size(ctx);
565 LLOYAL_LOG_DEBUG("[kv::global_state_size] %zu bytes (%.1f MB)", size,
566 size / 1024.0 / 1024.0);
567 return size;
568}
569
580inline size_t global_state_save(llama_context *ctx, uint8_t *dst, size_t size) {
581 if (!ctx || !dst || size == 0) {
582 LLOYAL_LOG_DEBUG("[kv::global_state_save] ERROR: invalid parameters");
583 return 0;
584 }
585
586 size_t written = llama_state_get_data(ctx, dst, size);
587 LLOYAL_LOG_DEBUG("[kv::global_state_save] %zu bytes written (%.1f MB)",
588 written, written / 1024.0 / 1024.0);
589 return written;
590}
591
602inline size_t global_state_load(llama_context *ctx, const uint8_t *src,
603 size_t size) {
604 if (!ctx || !src || size == 0) {
605 LLOYAL_LOG_DEBUG("[kv::global_state_load] ERROR: invalid parameters");
606 return 0;
607 }
608
609 size_t read = llama_state_set_data(ctx, src, size);
610 LLOYAL_LOG_DEBUG("[kv::global_state_load] %zu bytes read (%.1f MB)", read,
611 read / 1024.0 / 1024.0);
612 return read;
613}
614
615// ===== DIAGNOSTICS =====
616
627inline void log_build_info(llama_context *ctx) {
629 "[kv::build_info] ============================================");
631 "[kv::build_info] llama.cpp KV Sequence Operations Configuration");
633 "[kv::build_info] ============================================");
634 LLOYAL_LOG_DEBUG("[kv::build_info] Version: b8087");
635 LLOYAL_LOG_DEBUG("[kv::build_info] API naming: llama_memory_seq_*");
637 "[kv::build_info] Current MVP: n_seq_max=1 (single sequence only)");
638
639 if (ctx) {
640 llama_pos max_pos = pos_max(ctx, 0);
641 if (max_pos >= 0) {
642 LLOYAL_LOG_DEBUG("[kv::build_info] Current KV cursor (seq 0): %d tokens",
643 max_pos);
644 } else {
645 LLOYAL_LOG_DEBUG("[kv::build_info] KV cache empty (seq 0)");
646 }
647
648 size_t snapshot_size = state_size(ctx, 0);
649 if (snapshot_size > 0) {
651 "[kv::build_info] Estimated snapshot size: %zu bytes (%.1f MB)",
652 snapshot_size, snapshot_size / 1024.0 / 1024.0);
653 }
654 }
655
657 "[kv::build_info] Fragmentation fallback: per-sequence → global state");
659 "[kv::build_info] Critical: Call remove_range() BEFORE llama_decode()");
661 "[kv::build_info] ============================================");
662}
663
664// ===== CACHE CLEARING =====
665
682inline void clear_all(llama_context *ctx) {
683 if (!ctx) {
684 LLOYAL_LOG_DEBUG("[kv::clear_all] ERROR: NULL context");
685 throw std::runtime_error("kv::clear_all - NULL context");
686 }
687
688 LLOYAL_LOG_DEBUG("[kv::clear_all] Clearing KV cache (metadata + data)");
689 llama_memory_clear(llama_get_memory(ctx), true); // true = clear data buffers too
690 LLOYAL_LOG_DEBUG("[kv::clear_all] KV cache cleared");
691}
692
707inline void clear_metadata(llama_context *ctx) {
708 if (!ctx) {
709 LLOYAL_LOG_DEBUG("[kv::clear_metadata] ERROR: NULL context");
710 throw std::runtime_error("kv::clear_metadata - NULL context");
711 }
712
713 LLOYAL_LOG_DEBUG("[kv::clear_metadata] Clearing KV cache metadata only");
714 llama_memory_clear(llama_get_memory(ctx), false); // false = keep data buffers
715 LLOYAL_LOG_DEBUG("[kv::clear_metadata] KV cache metadata cleared");
716}
717
718// ===== CONTEXT COMPRESSION =====
719
/// Context compression: wipe the entire KV cache, then re-decode attention
/// sinks (at position 0) followed by the retained tail (at position
/// sinks.size()), all into sequence 0.
///
/// @param ctx            llama context (null rejected)
/// @param original_sinks tokens re-decoded first, anchored at position 0
/// @param tail           tokens re-decoded after the sinks
/// @param n_batch        decode chunk size forwarded to decode::many
/// @throws std::runtime_error on null context, on both inputs empty, or if
///         either decode pass fails.
///
/// NOTE(review): calls decode::many(ctx, vector, pos, n_batch); the pointer
/// overload declared elsewhere takes (ctx, tokens, n_tokens, n_past,
/// n_batch, seq) — presumably a vector convenience overload exists in
/// decode.hpp; verify.
inline void clear_and_reseed(llama_context *ctx,
                             const std::vector<llama_token> &original_sinks,
                             const std::vector<llama_token> &tail,
                             int32_t n_batch) {
  if (!ctx) {
    LLOYAL_LOG_DEBUG("[kv::clear_and_reseed] ERROR: null context");
    throw std::runtime_error("kv::clear_and_reseed - NULL context");
  }

  if (original_sinks.empty() && tail.empty()) {
    LLOYAL_LOG_DEBUG("[kv::clear_and_reseed] ERROR: both sinks and tail are empty");
    throw std::runtime_error("kv::clear_and_reseed - no tokens to reseed");
  }

  LLOYAL_LOG_DEBUG("[kv::clear_and_reseed] Starting reseed: %zu sinks + %zu tail = %zu total",
                   original_sinks.size(), tail.size(), original_sinks.size() + tail.size());

  // Get memory handle
  llama_memory_t mem = llama_get_memory(ctx);

  // Log state before clear
  llama_pos max_pos_before = llama_memory_seq_pos_max(mem, 0);
  LLOYAL_LOG_DEBUG("[kv::clear_and_reseed] Before clear: KV cache max_pos=%d", max_pos_before);

  // Clear entire KV cache (simple and reliable); 'true' wipes data buffers too
  llama_memory_clear(mem, true);

  // Sanity: a successful clear must leave seq 0 empty (max_pos == -1)
  llama_pos max_pos_after_clear = llama_memory_seq_pos_max(mem, 0);
  if (max_pos_after_clear != -1) {
    LLOYAL_LOG_DEBUG("[kv::clear_and_reseed] WARNING: KV cache not empty after clear (max_pos=%d)",
                     max_pos_after_clear);
  }

  // Re-decode sinks at position 0
  if (!original_sinks.empty()) {
    LLOYAL_LOG_DEBUG("[kv::clear_and_reseed] Re-decoding %zu sinks at position 0", original_sinks.size());
    if (lloyal::decode::many(ctx, original_sinks, 0, n_batch) != 0) {
      throw std::runtime_error("kv::clear_and_reseed - llama_decode failed on sinks");
    }
  }

  // Re-decode tail at position sinks.size()
  if (!tail.empty()) {
    int32_t tail_start_pos = static_cast<int32_t>(original_sinks.size());
    LLOYAL_LOG_DEBUG("[kv::clear_and_reseed] Re-decoding %zu tail tokens at position %d",
                     tail.size(), tail_start_pos);
    if (lloyal::decode::many(ctx, tail, tail_start_pos, n_batch) != 0) {
      throw std::runtime_error("kv::clear_and_reseed - llama_decode failed on tail");
    }
  }

  // Verify final state: cursor should sit on the last reseeded token
  llama_pos max_pos_after = llama_memory_seq_pos_max(mem, 0);
  int32_t expected_pos = static_cast<int32_t>(original_sinks.size() + tail.size()) - 1;

  LLOYAL_LOG_DEBUG("[kv::clear_and_reseed] After reseed: KV cache max_pos=%d (expected %d)",
                   max_pos_after, expected_pos);

  if (max_pos_after != expected_pos) {
    LLOYAL_LOG_DEBUG("[kv::clear_and_reseed] WARNING: Unexpected final position (got %d, expected %d)",
                     max_pos_after, expected_pos);
  }

  LLOYAL_LOG_DEBUG("[kv::clear_and_reseed] Reseed complete");
}
817
818// ===== FILE PERSISTENCE =====
819
/// Data structure returned by read_file.
struct FileData {
  std::vector<llama_token> tokens;  ///< Tokens restored from file
  size_t bytes_read;                ///< Total bytes read from file
};
829
851inline size_t write_file(llama_context *ctx, llama_seq_id seq,
852 const std::string &filepath,
853 const std::vector<llama_token> &tokens) {
854 if (!ctx) {
855 LLOYAL_LOG_DEBUG("[kv::write_file] ERROR: null context");
856 return 0;
857 }
858
859 if (filepath.empty()) {
860 LLOYAL_LOG_DEBUG("[kv::write_file] ERROR: empty filepath");
861 return 0;
862 }
863
864 // Guard: Don't write if KV cache is empty
865 llama_memory_t mem = llama_get_memory(ctx);
866 llama_pos max_pos = llama_memory_seq_pos_max(mem, seq);
867 if (max_pos < 0) {
869 "[kv::write_file] WARNING: KV cache is empty - skipping write");
870 return 0;
871 }
872
873 // Delegate to llama.cpp's session file writer
874 // Note: llama.cpp signature is (ctx, filepath, seq_id, tokens, n_tokens)
875 size_t bytes = llama_state_seq_save_file(ctx, filepath.c_str(), seq,
876 tokens.data(), tokens.size());
877
878 if (bytes > 0) {
879 LLOYAL_LOG_DEBUG("[kv::write_file] Wrote %s: %zu bytes (%.1f MB), %zu "
880 "tokens",
881 filepath.c_str(), bytes, bytes / 1024.0 / 1024.0,
882 tokens.size());
883 } else {
884 LLOYAL_LOG_DEBUG("[kv::write_file] FAILED to write %s", filepath.c_str());
885 }
886
887 return bytes;
888}
889
910inline FileData read_file(llama_context *ctx, llama_seq_id seq,
911 const std::string &filepath) {
912 if (!ctx) {
913 throw std::runtime_error("[kv::read_file] null context");
914 }
915
916 if (filepath.empty()) {
917 throw std::runtime_error("[kv::read_file] empty filepath");
918 }
919
920 // Get model's n_ctx to allocate token buffer
921 const uint32_t n_ctx = llama_n_ctx(ctx);
922
923 std::vector<llama_token> tokens;
924 tokens.resize(n_ctx); // Allocate buffer with capacity
925
926 size_t token_count = 0;
927 // Note: llama.cpp signature is (ctx, filepath, seq_id, tokens_out, capacity, count_out)
928 size_t bytes =
929 llama_state_seq_load_file(ctx, filepath.c_str(), seq, tokens.data(),
930 tokens.size(), &token_count);
931
932 if (bytes == 0) {
933 throw std::runtime_error("[kv::read_file] failed to load from " +
934 filepath);
935 }
936
937 tokens.resize(token_count);
938
939 LLOYAL_LOG_DEBUG("[kv::read_file] Loaded %s: %zu bytes (%.1f MB), %zu tokens",
940 filepath.c_str(), bytes, bytes / 1024.0 / 1024.0,
941 token_count);
942
943 return FileData{std::move(tokens), bytes};
944}
945
946} // namespace lloyal::kv
#define LLOYAL_LOG_DEBUG(...)
liblloyal - Common definitions and logging
Definition common.hpp:47
Batch Decoding Operations.
constexpr llama_seq_id NO_LEASE
Sentinel value indicating a branch has no KV residency.
Definition kv.hpp:206
int many(llama_context *ctx, const llama_token *tokens, int32_t n_tokens, int32_t n_past, int32_t n_batch, llama_seq_id seq_id=0)
Decode multiple tokens into the KV cache with auto-chunking.
Definition decode.hpp:124
ggml_type from_str(const std::string &s)
Map string name to ggml_type enum (matches llama.cpp CLI -ctk/-ctv flags).
Definition kv.hpp:44
void evict_all(State &s)
Evict every leased seq_id.
Definition kv.hpp:350
llama_seq_id acquire(State &s)
Acquire a seq_id from the vacant pool.
Definition kv.hpp:255
size_t available(const State &s)
Number of vacant seq_ids available for acquisition.
Definition kv.hpp:364
void evict(State &s, llama_seq_id seq)
Evict a seq_id — strip all KV tags then release.
Definition kv.hpp:296
void retain(State &s, llama_seq_id keep)
Nuclear retain — keep one seq, rebuild vacancy from scratch.
Definition kv.hpp:319
State init(llama_context *ctx, llama_seq_id n_seq_max)
Initialize tenancy with all seq_ids vacant.
Definition kv.hpp:233
void release(State &s, llama_seq_id seq)
Release a seq_id back to vacant — bookkeeping only, no KV calls.
Definition kv.hpp:274
void seq_keep(llama_context *ctx, llama_seq_id seq)
Keep only one sequence, removing all others.
Definition kv.hpp:161
FileData read_file(llama_context *ctx, llama_seq_id seq, const std::string &filepath)
Definition kv.hpp:910
void log_build_info(llama_context *ctx)
Log KV cache build info and current state.
Definition kv.hpp:627
size_t state_size(llama_context *ctx, llama_seq_id seq)
Get size needed to serialize sequence state.
Definition kv.hpp:387
void seq_cp(llama_context *ctx, llama_seq_id src, llama_seq_id dst, llama_pos p0=0, llama_pos p1=-1)
Copy KV cache from one sequence to another.
Definition kv.hpp:137
void clear_and_reseed(llama_context *ctx, const std::vector< llama_token > &original_sinks, const std::vector< llama_token > &tail, int32_t n_batch)
Definition kv.hpp:752
size_t state_save(llama_context *ctx, llama_seq_id seq, uint8_t *dst, size_t size)
Save sequence state to buffer.
Definition kv.hpp:442
void clear_metadata(llama_context *ctx)
Clear KV cache metadata only (fast reset)
Definition kv.hpp:707
size_t global_state_load(llama_context *ctx, const uint8_t *src, size_t size)
Restore global state from buffer.
Definition kv.hpp:602
size_t state_load(llama_context *ctx, llama_seq_id seq, const uint8_t *src, size_t size)
Restore sequence state from buffer.
Definition kv.hpp:503
llama_pos pos_max(llama_context *ctx, llama_seq_id seq)
Get maximum position in KV cache sequence.
Definition kv.hpp:110
void clear_all(llama_context *ctx)
Clear all KV cache (complete reset)
Definition kv.hpp:682
size_t global_state_size(llama_context *ctx)
Get size needed to serialize global state.
Definition kv.hpp:558
bool remove_range(llama_context *ctx, llama_seq_id seq, llama_pos p0, llama_pos p1)
Remove token range from KV cache sequence.
Definition kv.hpp:77
size_t write_file(llama_context *ctx, llama_seq_id seq, const std::string &filepath, const std::vector< llama_token > &tokens)
Write KV state to file with self-describing format.
Definition kv.hpp:851
size_t global_state_save(llama_context *ctx, uint8_t *dst, size_t size)
Save global state to buffer.
Definition kv.hpp:580
Data structure returned by read_file.
Definition kv.hpp:825
std::vector< llama_token > tokens
Tokens restored from file.
Definition kv.hpp:826
size_t bytes_read
Total bytes read from file.
Definition kv.hpp:827
Tenancy state — tracks seq_id vacancy and leases.
Definition kv.hpp:216
llama_context * ctx
Context for KV operations (nullptr after drain)
Definition kv.hpp:217
std::vector< llama_seq_id > vacant
Available seq_ids (LIFO stack)
Definition kv.hpp:219
std::vector< uint8_t > leased
Bitmap: leased[seq] = 1 if issued.
Definition kv.hpp:220
llama_seq_id n_seq_max
Total seq_id capacity (from llama_n_seq_max)
Definition kv.hpp:218