liblloyal 1.0.0
Branched Inference for llama.cpp
Loading...
Searching...
No Matches
model_registry.hpp
Go to the documentation of this file.
1#pragma once
2
3// SPDX-License-Identifier: Apache-2.0
4// Copyright 2026 Lloyal Labs
5
6
7#include "common.hpp"
8#include <functional>
9#include <llama/llama.h>
10#include <memory>
11#include <mutex>
12#include <string>
13#include <unordered_map>
14
28namespace lloyal {
29
/// Model cache key combining file path and the GPU/memory configuration
/// that affects how a model is loaded. Two acquire() calls share a cached
/// model only when all three fields match.
struct ModelKey {
  std::string canonPath; ///< Normalized file path (file:// prefix removed)
  int n_gpu_layers;      ///< Number of layers offloaded to GPU (-1 = all)
  bool use_mmap;         ///< Whether to use memory mapping

  /// Equality over all fields; the cheap integral comparisons run first so
  /// mismatches short-circuit before the string comparison.
  bool operator==(const ModelKey &o) const {
    return n_gpu_layers == o.n_gpu_layers && use_mmap == o.use_mmap &&
           canonPath == o.canonPath;
  }
};
46
53// Compiler-specific sanitizer suppression (GCC/Clang only; no-op on MSVC)
54#if defined(__GNUC__) || defined(__clang__)
55 #define LLOYAL_NO_SANITIZE_OVERFLOW __attribute__((no_sanitize("unsigned-integer-overflow")))
56#else
57 #define LLOYAL_NO_SANITIZE_OVERFLOW
58#endif
59
70 size_t operator()(const ModelKey &k) const {
71 std::hash<std::string> Hs;
72 std::hash<int> Hi;
73 std::hash<bool> Hb;
74 return Hs(k.canonPath) ^
75 (Hi(k.n_gpu_layers) + 0x9e3779b9 + (Hb(k.use_mmap) << 6));
76 }
77};
78
79#undef LLOYAL_NO_SANITIZE_OVERFLOW
80
117public:
129 static std::shared_ptr<llama_model> acquire(const std::string &fsPath,
130 const llama_model_params &params);
131
132private:
134 inline static std::mutex mu_;
135
137 inline static std::unordered_map<ModelKey, std::weak_ptr<llama_model>,
139 cache_;
140
151 static ModelKey makeKey(const std::string &fsPath,
152 const llama_model_params &params);
153};
154
155} // namespace lloyal
156
157namespace lloyal::detail {
158
167inline void freeModel(llama_model *model) {
169 "[ModelRegistry] Freeing model: ptr=%p (last reference released)",
170 (void *)model);
171 llama_model_free(model);
172 LLOYAL_LOG_DEBUG("[ModelRegistry] Model freed: ptr=%p", (void *)model);
173}
174
175} // namespace lloyal::detail
176
177namespace lloyal {
178
179// ===== IMPLEMENTATION =====
180
181// Normalize path to ensure "file:///path" and "/path" map to same key
182inline ModelKey ModelRegistry::makeKey(const std::string &fsPath,
183 const llama_model_params &params) {
184 // Inline path normalization (removes file:// prefix if present)
185 std::string canonPath = fsPath;
186 const std::string filePrefix = "file://";
187 if (canonPath.substr(0, filePrefix.length()) == filePrefix) {
188 canonPath = canonPath.substr(filePrefix.length());
189 }
190
191 return {canonPath, params.n_gpu_layers, params.use_mmap};
192}
193
194// Acquire model from cache or load new
195// 1. Check cache (thread-safe)
196// 2. Return existing if found (cache hit)
197// 3. Load new if expired (cache miss)
198// 4. Store as weak_ptr, return shared_ptr
199inline std::shared_ptr<llama_model>
200ModelRegistry::acquire(const std::string &fsPath,
201 const llama_model_params &params) {
202 ModelKey key = makeKey(fsPath, params);
203
204 LLOYAL_LOG_DEBUG("[ModelRegistry] Acquiring model: path='%s', "
205 "n_gpu_layers=%d, use_mmap=%s",
206 key.canonPath.c_str(), key.n_gpu_layers,
207 key.use_mmap ? "true" : "false");
208
209 std::lock_guard<std::mutex> lock(mu_);
210
211 auto cacheEntry = cache_.find(key);
212 if (cacheEntry != cache_.end()) {
213 // Try to upgrade weak_ptr to shared_ptr
214 if (auto existingModel = cacheEntry->second.lock()) {
215 long refCount = existingModel.use_count();
217 "[ModelRegistry] Cache HIT - Reusing model: ptr=%p, refcount=%ld",
218 (void *)existingModel.get(), refCount);
219 return existingModel;
220 } else {
221 LLOYAL_LOG_DEBUG("[ModelRegistry] Cache entry expired (model was freed), "
222 "removing stale entry");
223 cache_.erase(cacheEntry);
224 }
225 }
226
227 LLOYAL_LOG_DEBUG("[ModelRegistry] Cache MISS - Loading NEW model from disk");
228 LLOYAL_LOG_DEBUG("[ModelRegistry] Path: %s", key.canonPath.c_str());
229 LLOYAL_LOG_DEBUG("[ModelRegistry] GPU layers: %d", key.n_gpu_layers);
230 LLOYAL_LOG_DEBUG("[ModelRegistry] Memory mapping: %s",
231 key.use_mmap ? "enabled" : "disabled");
232
233 llama_model *rawModel =
234 llama_model_load_from_file(key.canonPath.c_str(), params);
235
236 if (!rawModel) {
237 // Let caller handle error (will throw structured error)
239 "[ModelRegistry] ERROR: llama_model_load_from_file returned NULL");
240 return nullptr;
241 }
242
243 size_t modelSize = llama_model_size(rawModel);
244 LLOYAL_LOG_DEBUG("[ModelRegistry] Model loaded:");
245 LLOYAL_LOG_DEBUG("[ModelRegistry] Pointer: %p", (void *)rawModel);
246 LLOYAL_LOG_DEBUG("[ModelRegistry] Size: %zu bytes (%.2f MB)", modelSize,
247 modelSize / (1024.0 * 1024.0));
248
249 auto sharedModel = std::shared_ptr<llama_model>(rawModel, detail::freeModel);
250
251 // Store as weak_ptr (allows automatic cleanup when all contexts release the
252 // model)
253 cache_[key] = sharedModel;
254 LLOYAL_LOG_DEBUG("[ModelRegistry] Model cached as weak_ptr, returning "
255 "shared_ptr (refcount=1)");
256
257 return sharedModel;
258}
259
260} // namespace lloyal
static std::shared_ptr< llama_model > acquire(const std::string &fsPath, const llama_model_params &params)
Acquire a model from cache, or load from disk on cache miss.
#define LLOYAL_LOG_DEBUG(...)
liblloyal - Common definitions and logging
Definition common.hpp:48
#define LLOYAL_NO_SANITIZE_OVERFLOW
Hash functor for ModelKey.
Boundary Tracker Stub for OSS liblloyal.
size_t operator()(const ModelKey &k) const
Compute hash for ModelKey.
Model cache key combining file path and GPU configuration.
std::string canonPath
Normalized file path (file:// prefix removed)
bool use_mmap
Whether to use memory mapping.
int n_gpu_layers
Number of layers offloaded to GPU (-1 = all)
bool operator==(const ModelKey &o) const