// Thread-safe weak-pointer cache for sharing llama_model instances.
//
// Avoids redundant model loads when multiple contexts use the same GGUF file.
// The cache stores weak_ptrs keyed by (path, n_gpu_layers, use_mmap).
#pragma once
#include <functional>
#include <llama/llama.h>
#include <memory>
#include <mutex>
#include <string>
#include <unordered_map>
/// Cache key identifying one loaded model configuration.
///
/// Member order matters: ModelRegistry::makeKey builds the key with aggregate
/// initialization as {canonPath, n_gpu_layers, use_mmap}.
struct ModelKey {
  std::string canonPath; ///< Normalized file path (file:// prefix removed)
  int n_gpu_layers;      ///< Number of layers offloaded to GPU (-1 = all)
  bool use_mmap;         ///< Whether to use memory mapping

  /// Two keys are equal when every field matches. This must stay consistent
  /// with ModelKeyHash, which mixes the same three fields.
  bool operator==(const ModelKey &o) const {
    // Compare the cheap scalar fields first; the string compare runs last.
    return n_gpu_layers == o.n_gpu_layers && use_mmap == o.use_mmap &&
           canonPath == o.canonPath;
  }
};
#if defined(__GNUC__) || defined(__clang__)
#define LLOYAL_NO_SANITIZE_OVERFLOW __attribute__((no_sanitize("unsigned-integer-overflow")))
#else
#define LLOYAL_NO_SANITIZE_OVERFLOW
#endif
struct ModelKeyHash {
std::hash<std::string> Hs;
std::hash<int> Hi;
std::hash<bool> Hb;
return Hs(k.canonPath) ^
(Hi(k.n_gpu_layers) + 0x9e3779b9 + (Hb(k.use_mmap) << 6));
}
};
#undef LLOYAL_NO_SANITIZE_OVERFLOW
/// Process-wide registry that shares llama_model instances between callers.
///
/// All state is static. The cache holds weak_ptrs, so the registry itself
/// never keeps a model alive: a model is freed when the last shared_ptr
/// returned by acquire() is released.
class ModelRegistry {
public:
  /// Acquire a model from cache, or load from disk on cache miss.
  ///
  /// @param fsPath Path to the model file; a leading "file://" is stripped
  ///               before the path is used as a cache key or passed on.
  /// @param params Load parameters; n_gpu_layers and use_mmap take part in
  ///               the cache key.
  /// @return Shared handle to the model, or nullptr if loading failed.
  static std::shared_ptr<llama_model>
  acquire(const std::string &fsPath, const llama_model_params &params);

private:
  /// Guards cache_; acquire() holds it for its whole lookup/load sequence.
  inline static std::mutex mu_;

  /// Weak references to models handed out so far, keyed by ModelKey.
  inline static std::unordered_map<ModelKey, std::weak_ptr<llama_model>,
                                   ModelKeyHash>
      cache_;

  /// Build the cache key for (fsPath, params), normalizing the path.
  static ModelKey makeKey(const std::string &fsPath,
                          const llama_model_params &params);
};
}
/// Custom deleter installed on every shared_ptr<llama_model> created by
/// ModelRegistry::acquire. Runs when the last shared reference is released.
///
/// @param model Raw model handle to release via llama_model_free.
inline void freeModel(llama_model *model) {
  // Log before freeing so the pointer value is printed while still valid.
  LLOYAL_LOG_DEBUG(
      "[ModelRegistry] Freeing model: ptr=%p (last reference released)",
      static_cast<void *>(model));
  llama_model_free(model);
}
}
/// Build the cache key for a model request.
///
/// The path is normalized by removing a leading "file://" scheme so that
/// "file:///x.gguf" and "/x.gguf" map to the same cache entry. No other
/// normalization (symlinks, relative paths, case) is performed.
///
/// @param fsPath Model path as supplied by the caller.
/// @param params Load parameters; only n_gpu_layers and use_mmap are read.
/// @return Key of the form {normalized path, n_gpu_layers, use_mmap}.
inline ModelKey ModelRegistry::makeKey(const std::string &fsPath,
                                       const llama_model_params &params) {
  const std::string filePrefix = "file://";

  std::string canonPath = fsPath;
  // compare() checks the prefix in place (no temporary substring); it only
  // matches when the path is at least as long as the prefix.
  if (canonPath.compare(0, filePrefix.length(), filePrefix) == 0) {
    canonPath.erase(0, filePrefix.length());
  }

  return {canonPath, params.n_gpu_layers, params.use_mmap};
}
// Returns a shared handle to the model identified by (fsPath, params): a live
// cached instance is reused, otherwise the model is loaded from disk and
// registered in the cache. Yields nullptr when the load fails.
//
// NOTE(review): this listing appears to be missing several lines — the
// qualified function name with its first parameter, and the opening of each
// logging call (only trailing format-string/argument fragments remain). The
// fragments are left exactly as found; restore them from the upstream header.
inline std::shared_ptr<llama_model>
const llama_model_params ¶ms) {
// Normalize the path and combine it with the relevant load parameters.
ModelKey key = makeKey(fsPath, params);
"n_gpu_layers=%d, use_mmap=%s",
key.canonPath.c_str(), key.n_gpu_layers,
key.use_mmap ? "true" : "false");
// The lock is held until the function returns, which includes the disk load
// below; concurrent acquire() calls therefore run one at a time.
std::lock_guard<std::mutex> lock(mu_);
auto cacheEntry = cache_.find(key);
if (cacheEntry != cache_.end()) {
// lock() yields a non-null shared_ptr only while some owner still exists.
if (auto existingModel = cacheEntry->second.lock()) {
// use_count() here already includes the reference just taken by lock().
long refCount = existingModel.use_count();
"[ModelRegistry] Cache HIT - Reusing model: ptr=%p, refcount=%ld",
(void *)existingModel.get(), refCount);
return existingModel;
} else {
// Every owner has gone away: drop the expired entry before reloading.
"removing stale entry");
cache_.erase(cacheEntry);
}
}
key.use_mmap ? "enabled" : "disabled");
// Cache miss (or stale entry): load using the normalized path.
llama_model *rawModel =
llama_model_load_from_file(key.canonPath.c_str(), params);
if (!rawModel) {
"[ModelRegistry] ERROR: llama_model_load_from_file returned NULL");
// Nothing is cached on failure, so a later call tries the load again.
return nullptr;
}
size_t modelSize = llama_model_size(rawModel);
// Size in bytes divided down to MiB (the consumer of this value is missing
// from the listing — presumably a log call).
modelSize / (1024.0 * 1024.0));
// detail::freeModel becomes the deleter, invoked when the last owner releases.
auto sharedModel = std::shared_ptr<llama_model>(rawModel, detail::freeModel);
// cache_ holds weak_ptrs, so this assignment does not extend the lifetime.
cache_[key] = sharedModel;
"shared_ptr (refcount=1)");
return sharedModel;
}
}
static std::shared_ptr< llama_model > acquire(const std::string &fsPath, const llama_model_params ¶ms)
Acquire a model from cache, or load from disk on cache miss.
#define LLOYAL_LOG_DEBUG(...)
liblloyal - Common definitions and logging
#define LLOYAL_NO_SANITIZE_OVERFLOW
Hash functor for ModelKey.
Boundary Tracker Stub for OSS liblloyal.
size_t operator()(const ModelKey &k) const
Compute hash for ModelKey.
std::string canonPath
Normalized file path (file:// prefix removed)
bool use_mmap
Whether to use memory mapping.
int n_gpu_layers
Number of layers offloaded to GPU (-1 = all)
bool operator==(const ModelKey &o) const