#include <llama/llama.h>

#include <cstddef>
#include <functional>
#include <memory>
#include <mutex>
#include <string>
#include <unordered_map>

// LLOYAL_LOG_DEBUG normally comes from liblloyal's common definitions and
// logging header, which this listing elides; a no-op fallback keeps the
// snippet self-contained.
#ifndef LLOYAL_LOG_DEBUG
#define LLOYAL_LOG_DEBUG(...)
#endif
// Hashing below deliberately relies on unsigned wraparound; suppress UBSan's
// unsigned-integer-overflow check for the mixing routine.
#if defined(__GNUC__) || defined(__clang__)
  #define LLOYAL_NO_SANITIZE_OVERFLOW __attribute__((no_sanitize("unsigned-integer-overflow")))
#else
  #define LLOYAL_NO_SANITIZE_OVERFLOW
#endif
/// Model cache key combining file path and GPU configuration.
struct ModelKey {
  std::string canonPath; ///< Normalized file path (file:// prefix removed)
  int n_gpu_layers;      ///< Number of layers offloaded to GPU (-1 = all)
  bool use_mmap;         ///< Whether to use memory mapping.

  bool operator==(const ModelKey &o) const {
    return canonPath == o.canonPath && n_gpu_layers == o.n_gpu_layers &&
           use_mmap == o.use_mmap;
  }
};

/// Hash functor for ModelKey. (Functor name reconstructed; the original
/// listing truncates it.)
struct ModelKeyHash {
  /// Compute hash for ModelKey. The listing only shows the string hasher;
  /// the exact mixing of the remaining fields is a plausible reconstruction.
  LLOYAL_NO_SANITIZE_OVERFLOW
  size_t operator()(const ModelKey &k) const {
    std::hash<std::string> Hs;
    size_t h = Hs(k.canonPath);
    h = h * 31u + static_cast<size_t>(k.n_gpu_layers); // wraparound intended
    h = h * 31u + static_cast<size_t>(k.use_mmap);
    return h;
  }
};

#undef LLOYAL_NO_SANITIZE_OVERFLOW
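// Quick illustration (assumption, not in the original listing; the helper
// name and model path are placeholders): equal keys must produce equal
// hashes for the unordered_map cache below to deduplicate models.
inline bool modelKeyHashAgrees() {
  ModelKey a{"/models/example.gguf", /*n_gpu_layers=*/-1, /*use_mmap=*/true};
  ModelKey b{"/models/example.gguf", -1, true};
  return a == b && ModelKeyHash{}(a) == ModelKeyHash{}(b); // always true
}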
/// Process-wide registry that deduplicates loaded models. The cache maps a
/// ModelKey to a weak_ptr, so the registry itself never keeps a model alive:
/// once every caller drops its shared_ptr, the model is freed and the cache
/// entry goes stale.
class ModelRegistry {
public:
  /// Acquire a model from cache, or load from disk on cache miss.
  static std::shared_ptr<llama_model>
  acquire(const std::string &fsPath, const llama_model_params &params);

private:
  inline static std::mutex mu_;
  inline static std::unordered_map<ModelKey, std::weak_ptr<llama_model>,
                                   ModelKeyHash>
      cache_;

  /// Build the normalized cache key for a path/parameter combination.
  static ModelKey makeKey(const std::string &fsPath,
                          const llama_model_params &params);
};
namespace detail {

/// Deleter installed on every cached shared_ptr: runs when the last
/// reference to the model is released.
inline void freeModel(llama_model *model) {
  LLOYAL_LOG_DEBUG(
      "[ModelRegistry] Freeing model: ptr=%p (last reference released)",
      (void *)model);
  llama_model_free(model);
}

} // namespace detail
inline ModelKey ModelRegistry::makeKey(const std::string &fsPath,
                                       const llama_model_params &params) {
  // Canonicalize the path: strip a leading file:// scheme so the URL and
  // plain-path spellings of the same file share one cache entry.
  std::string canonPath = fsPath;
  const std::string filePrefix = "file://";
  if (canonPath.substr(0, filePrefix.length()) == filePrefix) {
    canonPath = canonPath.substr(filePrefix.length());
  }
  return {canonPath, params.n_gpu_layers, params.use_mmap};
}
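// Example of the canonicalization (illustrative; the model path is a
// placeholder): both spellings below map to the same key, and therefore to
// the same cache entry in acquire():
//
//   makeKey("file:///models/example.gguf", p) -> {"/models/example.gguf", ...}
//   makeKey("/models/example.gguf", p)        -> {"/models/example.gguf", ...}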
inline std::shared_ptr<llama_model>
ModelRegistry::acquire(const std::string &fsPath,
                       const llama_model_params &params) {
  ModelKey key = makeKey(fsPath, params);
  LLOYAL_LOG_DEBUG("[ModelRegistry] acquire: path=%s, "
                   "n_gpu_layers=%d, use_mmap=%s",
                   key.canonPath.c_str(), key.n_gpu_layers,
                   key.use_mmap ? "enabled" : "disabled");

  std::lock_guard<std::mutex> lock(mu_);

  auto cacheEntry = cache_.find(key);
  if (cacheEntry != cache_.end()) {
    // An entry exists, but it only helps if some shared_ptr still owns the
    // model; lock() promotes the weak_ptr or returns null.
    if (auto existingModel = cacheEntry->second.lock()) {
      long refCount = existingModel.use_count();
      LLOYAL_LOG_DEBUG(
          "[ModelRegistry] Cache HIT - Reusing model: ptr=%p, refcount=%ld",
          (void *)existingModel.get(), refCount);
      return existingModel;
    }
    // The last shared_ptr died since this entry was cached.
    LLOYAL_LOG_DEBUG("[ModelRegistry] Cache entry expired, "
                     "removing stale entry");
    cache_.erase(cacheEntry);
  }

  LLOYAL_LOG_DEBUG(
      "[ModelRegistry] Cache MISS - Loading NEW model from disk");
  LLOYAL_LOG_DEBUG("[ModelRegistry] path=%s, n_gpu_layers=%d, mmap=%s",
                   key.canonPath.c_str(), key.n_gpu_layers,
                   key.use_mmap ? "enabled" : "disabled");

  llama_model *rawModel =
      llama_model_load_from_file(key.canonPath.c_str(), params);
  if (rawModel == nullptr) {
    LLOYAL_LOG_DEBUG(
        "[ModelRegistry] ERROR: llama_model_load_from_file returned NULL");
    return nullptr;
  }

  size_t modelSize = llama_model_size(rawModel);
  LLOYAL_LOG_DEBUG("[ModelRegistry] Model loaded: size=%.1f MiB",
                   modelSize / (1024.0 * 1024.0));

  // Hand ownership to a shared_ptr whose deleter frees the model, and cache
  // only a weak_ptr so the registry never extends the model's lifetime.
  auto sharedModel = std::shared_ptr<llama_model>(rawModel, detail::freeModel);
  cache_[key] = sharedModel;
  LLOYAL_LOG_DEBUG("[ModelRegistry] Model cached, returning new "
                   "shared_ptr (refcount=1)");
  return sharedModel;
}
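// ---------------------------------------------------------------------------
// Usage sketch (not part of the original listing; the helper name and model
// path are placeholders). Two acquire() calls whose paths canonicalize to
// the same key share one llama_model; releasing the last shared_ptr runs
// detail::freeModel via the custom deleter.
// ---------------------------------------------------------------------------
inline void modelRegistryUsageExample() {
  llama_model_params params = llama_model_default_params();
  params.n_gpu_layers = -1; // offload all layers (per ModelKey docs above)

  auto a = ModelRegistry::acquire("file:///models/example.gguf", params);
  auto b = ModelRegistry::acquire("/models/example.gguf", params);
  // Cache HIT: both handles own the same model (a.get() == b.get()).

  a.reset();
  b.reset(); // last reference released -> model freed, cache entry goes stale
}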