liblloyal 1.0.0
Composable primitives for llama.cpp inference

model_registry.hpp
#pragma once

// SPDX-License-Identifier: Apache-2.0
// Copyright 2026 Lloyal Labs

#include "common.hpp"
#include <functional>
#include <llama/llama.h>
#include <memory>
#include <mutex>
#include <string>
#include <unordered_map>

namespace lloyal {

/**
 * Model cache key combining file path and GPU configuration.
 */
struct ModelKey {
  std::string canonPath; ///< Normalized file path (file:// prefix removed)
  int n_gpu_layers;      ///< Number of layers offloaded to GPU (-1 = all)
  bool use_mmap;         ///< Whether to use memory mapping

  bool operator==(const ModelKey &o) const {
    return n_gpu_layers == o.n_gpu_layers && use_mmap == o.use_mmap &&
           canonPath == o.canonPath;
  }
};

/**
 * Hash functor for ModelKey.
 */
struct ModelKeyHash {
  /// Compute hash for ModelKey.
  size_t operator()(const ModelKey &k) const {
    std::hash<std::string> Hs;
    std::hash<int> Hi;
    std::hash<bool> Hb;
    return Hs(k.canonPath) ^
           (Hi(k.n_gpu_layers) + 0x9e3779b9 + (Hb(k.use_mmap) << 6));
  }
};
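
// Note: the mix above is a variant of the boost::hash_combine recipe, whose
// golden-ratio constant 0x9e3779b9 and shifted terms break up symmetry between
// the fields. For reference, a minimal standalone sketch of the general
// pattern (hash_combine is an illustrative helper, not part of this header):
//
//   template <typename T>
//   inline void hash_combine(std::size_t &seed, const T &v) {
//     seed ^= std::hash<T>{}(v) + 0x9e3779b9 + (seed << 6) + (seed >> 2);
//   }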

/**
 * Thread-safe registry for sharing llama_model instances.
 */
class ModelRegistry {
public:
  /// Acquire a model from cache or load if not present.
  static std::shared_ptr<llama_model> acquire(const std::string &fsPath,
                                              const llama_model_params &params);

private:
  /// Guards all access to cache_.
  inline static std::mutex mu_;

  /// Weak references let the cache observe models without keeping them alive.
  inline static std::unordered_map<ModelKey, std::weak_ptr<llama_model>,
                                   ModelKeyHash>
      cache_;

  /// Build a normalized cache key from a filesystem path and load parameters.
  static ModelKey makeKey(const std::string &fsPath,
                          const llama_model_params &params);
};
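
// Example usage (illustrative sketch; the model path below is hypothetical):
//
//   llama_model_params params = llama_model_default_params();
//   params.n_gpu_layers = -1; // offload all layers
//   auto model = ModelRegistry::acquire("/models/llama.gguf", params);
//   if (!model) { /* load failed; acquire() returns nullptr */ }
//
//   // Same path and params (even via a file:// URI) reuse the cached model:
//   auto same = ModelRegistry::acquire("file:///models/llama.gguf", params);
//   // model.get() == same.get()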

} // namespace lloyal

namespace lloyal::detail {

/// Custom deleter for ModelRegistry's shared_ptr: frees the llama_model when
/// the last reference is released.
inline void freeModel(llama_model *model) {
  LLOYAL_LOG_DEBUG(
      "[ModelRegistry] Freeing model: ptr=%p (last reference released)",
      (void *)model);
  llama_model_free(model);
  LLOYAL_LOG_DEBUG("[ModelRegistry] Model freed: ptr=%p", (void *)model);
}

} // namespace lloyal::detail
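
// Installing detail::freeModel as the shared_ptr deleter (see acquire() below)
// keeps teardown out of callers: destruction of the last shared_ptr, wherever
// it happens, routes through llama_model_free exactly once.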

namespace lloyal {

// ===== IMPLEMENTATION =====

// Normalize path to ensure "file:///path" and "/path" map to same key
inline ModelKey ModelRegistry::makeKey(const std::string &fsPath,
                                       const llama_model_params &params) {
  // Inline path normalization (removes file:// prefix if present)
  std::string canonPath = fsPath;
  const std::string filePrefix = "file://";
  if (canonPath.substr(0, filePrefix.length()) == filePrefix) {
    canonPath = canonPath.substr(filePrefix.length());
  }

  return {canonPath, params.n_gpu_layers, params.use_mmap};
}
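
// Normalization example: "file:///tmp/model.gguf" and "/tmp/model.gguf"
// (hypothetical paths) both yield canonPath == "/tmp/model.gguf", so they
// share a single cache entry when n_gpu_layers and use_mmap also match.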

// Acquire model from cache or load new:
// 1. Check cache (thread-safe)
// 2. Return existing if found (cache hit)
// 3. Load new if absent or expired (cache miss)
// 4. Store as weak_ptr, return shared_ptr
inline std::shared_ptr<llama_model>
ModelRegistry::acquire(const std::string &fsPath,
                       const llama_model_params &params) {
  ModelKey key = makeKey(fsPath, params);

  LLOYAL_LOG_DEBUG("[ModelRegistry] Acquiring model: path='%s', "
                   "n_gpu_layers=%d, use_mmap=%s",
                   key.canonPath.c_str(), key.n_gpu_layers,
                   key.use_mmap ? "true" : "false");

  std::lock_guard<std::mutex> lock(mu_);

  auto cacheEntry = cache_.find(key);
  if (cacheEntry != cache_.end()) {
    // Try to upgrade weak_ptr to shared_ptr
    if (auto existingModel = cacheEntry->second.lock()) {
      long refCount = existingModel.use_count();
      LLOYAL_LOG_DEBUG(
          "[ModelRegistry] Cache HIT - Reusing model: ptr=%p, refcount=%ld",
          (void *)existingModel.get(), refCount);
      return existingModel;
    } else {
      LLOYAL_LOG_DEBUG("[ModelRegistry] Cache entry expired (model was freed), "
                       "removing stale entry");
      cache_.erase(cacheEntry);
    }
  }

  LLOYAL_LOG_DEBUG("[ModelRegistry] Cache MISS - Loading NEW model from disk");
  LLOYAL_LOG_DEBUG("[ModelRegistry] Path: %s", key.canonPath.c_str());
  LLOYAL_LOG_DEBUG("[ModelRegistry] GPU layers: %d", key.n_gpu_layers);
  LLOYAL_LOG_DEBUG("[ModelRegistry] Memory mapping: %s",
                   key.use_mmap ? "enabled" : "disabled");

  llama_model *rawModel =
      llama_model_load_from_file(key.canonPath.c_str(), params);

  if (!rawModel) {
    // Let caller handle error (will throw structured error)
    LLOYAL_LOG_DEBUG(
        "[ModelRegistry] ERROR: llama_model_load_from_file returned NULL");
    return nullptr;
  }

  size_t modelSize = llama_model_size(rawModel);
  LLOYAL_LOG_DEBUG("[ModelRegistry] Model loaded:");
  LLOYAL_LOG_DEBUG("[ModelRegistry] Pointer: %p", (void *)rawModel);
  LLOYAL_LOG_DEBUG("[ModelRegistry] Size: %zu bytes (%.2f MB)", modelSize,
                   modelSize / (1024.0 * 1024.0));

  auto sharedModel = std::shared_ptr<llama_model>(rawModel, detail::freeModel);

  // Store as weak_ptr (allows automatic cleanup when all contexts release the
  // model)
  cache_[key] = sharedModel;
  LLOYAL_LOG_DEBUG("[ModelRegistry] Model cached as weak_ptr, returning "
                   "shared_ptr (refcount=1)");

  return sharedModel;
}
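
// Lifecycle sketch (illustrative; the path is hypothetical):
//
//   auto a = ModelRegistry::acquire("/tmp/m.gguf", params); // MISS: loads from disk
//   auto b = ModelRegistry::acquire("/tmp/m.gguf", params); // HIT: b == a
//   a.reset();
//   b.reset(); // last reference released -> detail::freeModel runs
//   auto c = ModelRegistry::acquire("/tmp/m.gguf", params); // entry expired: reloads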

} // namespace lloyal