Compare commits

...

18 Commits

Author SHA1 Message Date
Ruben Ortlam 5a7462237e remove duplicated init calls 2026-06-19 11:07:38 +02:00
Ruben Ortlam 79210e3046 cleanup unused variable 2026-06-19 11:07:38 +02:00
Ruben Ortlam 84c4214b39 precompute name->buft map, map GPU host types to CPU buft 2026-06-19 11:07:38 +02:00
Ruben Ortlam dbc5f7ec82 move model memory estimation to subprocess 2026-06-19 11:07:38 +02:00
Ruben Ortlam 384a495a00 extract duplicated check into helper function 2026-06-19 11:07:38 +02:00
Ruben Ortlam 997491a644 replace device memory map with buft memory map. Use llama_get_memory_breakdown 2026-06-19 11:07:38 +02:00
Georgi Gerganov a35afd504f cont : clean-up 2026-06-19 11:07:38 +02:00
Ruben Ortlam 3046b8853a also strip models memory margin from child processes 2026-06-19 11:05:24 +02:00
Ruben Ortlam 216aaf1ad6 improve variable naming, fix style 2026-06-19 11:05:24 +02:00
Ruben Ortlam ff41b3dbf7 improve memory_per_device map naming 2026-06-19 11:05:24 +02:00
Ruben Ortlam 0e2f08a535 fix model count exceeded check 2026-06-19 11:05:24 +02:00
Ruben Ortlam 669948ce12 move llama_context_device_memory function to llama-ext.h 2026-06-19 11:05:24 +02:00
Ruben Ortlam 09d8eb95a4 add server memory debug logging 2026-06-19 11:05:24 +02:00
Ruben Ortlam c749b6882c use memory margin instead of total size limit, apply to each device separately 2026-06-19 11:05:24 +02:00
Ruben Ortlam 4ed48154b0 only set model memory_mb if not previously calculated 2026-06-19 11:05:01 +02:00
Ruben Ortlam 6178b8755d use no_alloc to get memory requirements for model load 2026-06-19 11:05:01 +02:00
Ruben Ortlam 340c867179 estimate with to-be-loaded model size included 2026-06-19 11:05:01 +02:00
Ruben Ortlam f38c4f9419 server: add --models-memory-max parameter to allow dynamically unloading models when they exceed a memory size threshold 2026-06-19 11:05:01 +02:00
6 changed files with 331 additions and 35 deletions
+14
View File
@@ -3099,6 +3099,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.models_max = value;
}
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODELS_MAX"));
// --models-memory-margin: per-device amount of memory (MiB) the router server keeps free.
// A value of 0 turns the memory check off, matching the "(0 = disabled)" note on
// common_params::models_memory_margin.
add_opt(common_arg(
    {"--models-memory-margin"}, "N",
    string_format("for router server, MiB of memory to leave free, per device (default: %d, 0 = disabled)", params.models_memory_margin),
    [](common_params & params, int value) {
        params.models_memory_margin = value;
    }
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODELS_MEMORY_MARGIN"));
add_opt(common_arg(
{"--models-autoload"},
{"--no-models-autoload"},
@@ -3335,6 +3342,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.offline = true;
}
).set_env("LLAMA_ARG_OFFLINE"));
// --measure-only: load the model only to report its memory requirements, then exit.
// The flag is consumed by llama-server alone (it is appended by the router when it
// spawns a measurement child), so it is registered for the server example only,
// like the router options above.
add_opt(common_arg(
    {"--measure-only"},
    "Load the model to measure memory requirements, print to stdout, then exit",
    [](common_params & params) {
        params.measure_only = true;
    }
).set_examples({LLAMA_EXAMPLE_SERVER}));
add_opt(common_arg(
{"-lv", "--verbosity", "--log-verbosity"}, "N",
string_format("Set the verbosity threshold. Messages with a higher verbosity will be ignored. Values:\n"
+2
View File
@@ -511,6 +511,7 @@ struct common_params {
int32_t control_vector_layer_end = -1; // layer range for control vector
bool offline = false;
bool skip_download = false; // skip model file downloading
bool measure_only = false; // load model with no_alloc to measure memory, print to stdout, then exit
int32_t ppl_stride = 0; // stride for perplexity calculations. If left at 0, the pre-existing approach will be used.
int32_t ppl_output_type = 0; // = 0 -> ppl output is as usual, = 1 -> ppl output is num_tokens, ppl, one per line
@@ -645,6 +646,7 @@ struct common_params {
std::string models_dir = ""; // directory containing models for the router server
std::string models_preset = ""; // directory containing model presets for the router server
int models_max = 4; // maximum number of models to load simultaneously
int models_memory_margin = 1024; // MiB of free memory to preserve per device (0 = disabled)
bool models_autoload = true; // automatically load models when requested via the router server
std::string models_preset_hf = ""; // show a warning about remote presets on router loaded (if not empty)
+11 -1
View File
@@ -90,7 +90,17 @@ LLAMA_API ggml_backend_dev_t llama_model_get_device(const struct llama_model * m
LLAMA_API llama_memory_breakdown llama_get_memory_breakdown(const struct llama_context * ctx);
// Set whether the context outputs nextn embeddings or not
// Returns the projected memory use (model + context + compute) in bytes
// for the given device within this context. Returns 0 if the device is not used.
LLAMA_API uint64_t llama_context_device_memory(
const struct llama_context * ctx,
ggml_backend_dev_t device);
//
// pre-norm embeddings (hidden state before the final output norm)
//
// Set whether the context outputs pre-norm embeddings or not
// If masked == true, output the embeddings only for the tokens with batch.logits != 0
// If masked == false, output the embeddings for all tokens in the batch regardless of batch.logits
LLAMA_API void llama_set_embeddings_nextn(struct llama_context * ctx, bool value, bool masked);
+230 -33
View File
@@ -130,6 +130,7 @@ static void unset_reserved_args(common_preset & preset, bool unset_model_args) {
preset.unset_option("LLAMA_API_KEY");
preset.unset_option("LLAMA_ARG_MODELS_DIR");
preset.unset_option("LLAMA_ARG_MODELS_MAX");
preset.unset_option("LLAMA_ARG_MODELS_MEMORY_MARGIN");
preset.unset_option("LLAMA_ARG_MODELS_PRESET");
preset.unset_option("LLAMA_ARG_MODELS_AUTOLOAD");
if (unset_model_args) {
@@ -245,9 +246,39 @@ server_models::server_models(
bin_path = get_server_exec_path().string();
} catch (const std::exception & e) {
bin_path = argv[0];
LOG_WRN("failed to get server executable path: %s\n", e.what());
LOG_WRN("using original argv[0] as fallback: %s\n", argv[0]);
SRV_WRN("failed to get server executable path: %s\n", e.what());
SRV_WRN("using original argv[0] as fallback: %s\n", argv[0]);
}
// Memory budget setup for the router: for every backend device, record how much memory
// may be used after reserving the configured margin, and build the buft-name lookup
// used to interpret the output of measurement child processes.
//
// A non-positive margin disables the feature. Casting a negative value straight to
// size_t would wrap to a huge margin, which disagrees with the `> 0` checks used by
// limits_exceeded() and unload_lru().
const size_t memory_margin = base_params.models_memory_margin > 0
    ? (size_t) base_params.models_memory_margin * 1024 * 1024
    : 0;
if (memory_margin > 0) {
    ggml_backend_dev_t cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
    ggml_backend_buffer_type_t cpu_buft = cpu_dev ? ggml_backend_dev_buffer_type(cpu_dev) : nullptr;

    const size_t n_devs = ggml_backend_dev_count();
    for (size_t i = 0; i < n_devs; i++) {
        ggml_backend_dev_t dev = ggml_backend_dev_get(i);

        // device-local buffer type: looked up by its own name
        ggml_backend_buffer_type_t dev_buft = ggml_backend_dev_buffer_type(dev);
        if (dev_buft) {
            buft_by_name[ggml_backend_buft_name(dev_buft)] = dev_buft;
        }

        // host buffer types of a device are accounted against the CPU buffer type
        ggml_backend_buffer_type_t host_buft = ggml_backend_dev_host_buffer_type(dev);
        if (host_buft && cpu_buft) {
            buft_by_name[ggml_backend_buft_name(host_buft)] = cpu_buft;
        }

        // zero-initialized so a backend that does not fill them in is skipped below
        size_t free  = 0;
        size_t total = 0;
        ggml_backend_dev_memory(dev, &free, &total);

        if (total > 0 && dev_buft) {
            // clamp at 0 when the device has less free memory than the margin
            const size_t available = (free > memory_margin) ? free - memory_margin : 0;
            bmm_available[dev_buft] = available;
            SRV_DBG("buft %s: available memory after margin=%zu MiB\n",
                    ggml_backend_buft_name(dev_buft), available / (1024 * 1024));
        }
    }
}
load_models();
}
@@ -441,6 +472,7 @@ void server_models::load_models() {
/* port */ 0,
/* status */ SERVER_MODEL_STATUS_UNLOADED,
/* last_used */ 0,
/* bmm_req */ {},
/* args */ std::vector<std::string>(),
/* loaded_info */ {},
/* exit_code */ 0,
@@ -607,6 +639,7 @@ void server_models::load_models() {
/* port */ 0,
/* status */ SERVER_MODEL_STATUS_UNLOADED,
/* last_used */ 0,
/* bmm_req */ {},
/* args */ std::vector<std::string>(),
/* loaded_info */ {},
/* exit_code */ 0,
@@ -780,30 +813,87 @@ std::vector<server_model_meta> server_models::get_all_meta() {
return result;
}
void server_models::unload_lru() {
if (base_params.models_max <= 0) {
return; // no limit
}
// remove one of the servers if we passed the models_max (least recently used - LRU)
std::string lru_model_name = "";
int64_t lru_last_used = ggml_time_ms();
size_t count_active = 0;
{
std::unique_lock<std::mutex> lk(mutex);
for (const auto & m : mapping) {
if (m.second.meta.is_running()) {
count_active++;
if (m.second.meta.last_used < lru_last_used) {
lru_model_name = m.first;
lru_last_used = m.second.meta.last_used;
}
// Count the buffer types whose budget in bmm_available would be exceeded if a model
// requiring `bmm_req` were added on top of the requirements recorded for all running models.
// Returns 0 when the request fits within every tracked buffer type.
// Reads `mapping` without taking the mutex (the header declares: caller must hold mutex).
int server_models::can_fit(const buft_memory_map & bmm_req) const {
// sum the recorded per-buft requirements of every running model
buft_memory_map bmm_total;
for (const auto & m : mapping) {
if (m.second.meta.is_running()) {
for (const auto & [buft, mem] : m.second.meta.bmm_req) {
bmm_total[buft] += mem;
}
}
}
// NOTE(review): the next two lines use lru_model_name / count_active, which are not declared
// in this function; they look like removed-side lines of the old unload_lru() left over
// from the diff rendering - confirm
if (!lru_model_name.empty() && count_active >= (size_t)base_params.models_max) {
SRV_INF("models_max limit reached, removing LRU name=%s\n", lru_model_name.c_str());
// lookup helper: bytes recorded for `buft`, or 0 when the map has no entry for it
auto get = [](const buft_memory_map & dmm, ggml_backend_buffer_type_t buft) -> size_t {
auto it = dmm.find(buft);
return it != dmm.end() ? it->second : 0;
};
int res = 0;
// only buffer types present in bmm_available are checked
for (const auto & [buft, limit] : bmm_available) {
const size_t mem_total = get(bmm_total, buft);
const size_t mem_new = get(bmm_req, buft);
SRV_DBG("buft %s: total=%zu MiB, new=%zu MiB, limit=%zu MiB\n",
ggml_backend_buft_name(buft),
mem_total / (1024 * 1024), mem_new / (1024 * 1024), limit / (1024 * 1024));
// over budget once the running total plus the new request passes the limit
if (mem_total + mem_new > limit) {
res++;
}
}
return res;
}
// Report whether starting a model that needs `bmm_req` would break a configured limit:
// either the number of running models has reached models_max, or can_fit() reports at
// least one buffer type over budget. Each check only applies when its setting is > 0.
// Not thread-safe: reads `mapping`, so the caller must hold the mutex.
bool server_models::limits_exceeded(const buft_memory_map & bmm_req) const {
    const bool count_limited  = base_params.models_max > 0;
    const bool memory_limited = base_params.models_memory_margin > 0;
    if (!(count_limited || memory_limited)) {
        return false;
    }

    // number of model instances currently running
    int n_running = 0;
    for (const auto & entry : mapping) {
        n_running += entry.second.meta.is_running() ? 1 : 0;
    }

    bool exceeded = count_limited && n_running >= base_params.models_max;

    // the memory check is evaluated whenever it is enabled, independent of the count check
    if (memory_limited && can_fit(bmm_req) > 0) {
        exceeded = true;
    }
    return exceeded;
}
void server_models::unload_lru(const buft_memory_map & bmm_req) {
if (base_params.models_memory_margin > 0) {
GGML_ASSERT(!bmm_available.empty());
}
while (true) {
std::string lru_model_name;
{
std::unique_lock<std::mutex> lk(mutex);
if (!limits_exceeded(bmm_req)) {
break;
}
int64_t lru_last_used = ggml_time_ms();
for (const auto & m : mapping) {
if (m.second.meta.is_running() && m.second.meta.last_used < lru_last_used) {
lru_model_name = m.first;
lru_last_used = m.second.meta.last_used;
}
}
}
if (lru_model_name.empty()) {
break;
}
SRV_INF("limits exceeded, removing LRU name=%s\n", lru_model_name.c_str());
unload(lru_model_name);
// wait for unload to complete
{
std::unique_lock<std::mutex> lk(mutex);
cv.wait(lk, [this, &lru_model_name]() {
@@ -813,11 +903,114 @@ void server_models::unload_lru() {
}
}
// Estimate the memory a model needs, per backend buffer type, using a child process.
//
// Spawns the server binary with the model's preset arguments plus `--measure-only --offline`,
// reads the child's combined stdout/stderr line by line, and sums every
// `measure:<buft name> <bytes>` line into the result (names are resolved via buft_by_name;
// unknown names are logged and ignored).
//
// Returns an empty map when the model is unknown, the child cannot be spawned or joined,
// or the child exits with a non-zero code.
// Takes `mutex` internally, so the caller must NOT hold it.
buft_memory_map server_models::estimate_model_memory(const std::string & name) {
    std::vector<std::string> child_args;
    std::vector<std::string> child_env;
    {
        std::lock_guard<std::mutex> lk(mutex);
        // find() instead of operator[]: operator[] would silently insert a default entry
        // for a name that is no longer (or never was) part of the mapping
        auto model_it = mapping.find(name);
        if (model_it == mapping.end()) {
            SRV_ERR("cannot estimate memory, model name=%s is not found\n", name.c_str());
            return {};
        }
        child_args = model_it->second.meta.preset.to_args(bin_path);
        child_env  = base_env;
    }

    child_args.push_back("--measure-only");
    child_args.push_back("--offline");

    SRV_INF("estimating memory for model name=%s\n", name.c_str());

    std::vector<char *> argv = to_char_ptr_array(child_args);
    std::vector<char *> envp = to_char_ptr_array(child_env);

    subprocess_s proc;
    int options = subprocess_option_no_window | subprocess_option_combined_stdout_stderr;
    if (subprocess_create_ex(argv.data(), options, envp.data(), &proc) != 0) {
        SRV_ERR("failed to spawn measure process for model name=%s\n", name.c_str());
        return {};
    }

    buft_memory_map result;

    FILE * out = subprocess_stdout(&proc);
    if (out) {
        char buffer[4096];
        while (fgets(buffer, sizeof(buffer), out) != nullptr) {
            // forward everything the child prints to our own log
            LOG("[measure:%s] %s", name.c_str(), buffer);

            std::string line(buffer);
            if (string_starts_with(line, "measure:")) {
                // expected payload: "<buft name> <size in bytes>"
                std::istringstream iss(line.substr(strlen("measure:")));
                std::string buft_name;
                size_t size = 0;
                if (iss >> buft_name >> size) {
                    auto buft_it = buft_by_name.find(buft_name);
                    if (buft_it != buft_by_name.end()) {
                        result[buft_it->second] += size;
                    } else {
                        SRV_WRN("unknown buft name '%s' from measure child for model name=%s\n",
                                buft_name.c_str(), name.c_str());
                    }
                }
            }
        }
    }

    int exit_code = 0;
    const int join_res = subprocess_join(&proc, &exit_code);
    subprocess_destroy(&proc);

    // without a successful join the exit code is meaningless, so do not trust the result
    if (join_res != 0) {
        SRV_ERR("failed to join measure process for model name=%s\n", name.c_str());
        return {};
    }

    if (exit_code != 0) {
        SRV_ERR("measure process for model name=%s exited with code %d\n", name.c_str(), exit_code);
        return {};
    }

    SRV_INF("memory estimation complete for model name=%s\n", name.c_str());
    return result;
}
void server_models::join_completed_bg_tasks() {
std::vector<std::unique_ptr<bg_task>> to_join;
{
std::lock_guard<std::mutex> lk(mutex);
for (auto it = bg_tasks.begin(); it != bg_tasks.end(); ) {
if (it->second->done.load()) {
to_join.push_back(std::move(it->second));
it = bg_tasks.erase(it);
} else {
++it;
}
}
}
for (auto & task : to_join) {
if (task->th.joinable()) {
task->th.join();
}
}
}
void server_models::load(const std::string & name) {
if (!has_model(name)) {
throw std::runtime_error("model name=" + name + " is not found");
}
unload_lru();
join_completed_bg_tasks();
buft_memory_map bmm_req;
if (base_params.models_memory_margin > 0) {
{
std::lock_guard<std::mutex> lk(mutex);
bmm_req = mapping[name].meta.bmm_req;
}
if (bmm_req.empty()) {
bmm_req = estimate_model_memory(name);
if (bmm_req.empty()) {
SRV_WRN("failed to estimate memory for model %s, memory limits will not apply\n", name.c_str());
}
{
std::lock_guard<std::mutex> lk(mutex);
mapping[name].meta.bmm_req = bmm_req;
}
}
}
unload_lru(bmm_req);
std::unique_lock<std::mutex> lk(mutex);
// edge case: block until any in-progress reload has finished so we always load
@@ -834,16 +1027,8 @@ void server_models::load(const std::string & name) {
// exceeding models_max. Without this, the window between unload_lru()
// releasing its lock and this lock_guard acquiring allows multiple
// threads to each observe capacity and all proceed to load.
if (base_params.models_max > 0) {
size_t count_active = 0;
for (const auto & m : mapping) {
if (m.second.meta.is_running()) {
count_active++;
}
}
if (count_active >= (size_t)base_params.models_max) {
throw std::runtime_error("model limit reached, try again later");
}
if (limits_exceeded(bmm_req)) {
throw std::runtime_error("model limit reached, try again later");
}
// prepare new instance info
@@ -1107,6 +1292,7 @@ void server_models::unload(const std::string & name) {
void server_models::unload_all() {
std::vector<std::thread> to_join;
std::vector<std::unique_ptr<bg_task>> bg_to_join;
{
std::lock_guard<std::mutex> lk(mutex);
for (auto & [name, inst] : mapping) {
@@ -1122,15 +1308,26 @@ void server_models::unload_all() {
// moving the thread to join list to avoid deadlock
to_join.push_back(std::move(inst.th));
}
for (auto & [name, task] : bg_tasks) {
bg_to_join.push_back(std::move(task));
}
bg_tasks.clear();
}
for (auto & th : to_join) {
if (th.joinable()) {
th.join();
}
}
for (auto & task : bg_to_join) {
if (task && task->th.joinable()) {
task->th.join();
}
}
}
void server_models::update_status(const std::string & name, server_model_status status, int exit_code) {
join_completed_bg_tasks();
std::unique_lock<std::mutex> lk(mutex);
auto it = mapping.find(name);
if (it != mapping.end()) {
+34 -1
View File
@@ -7,11 +7,13 @@
#include "server-http.h"
#include "server-queue.h"
#include <atomic>
#include <mutex>
#include <condition_variable>
#include <functional>
#include <memory>
#include <set>
#include <unordered_map>
/**
* state diagram:
@@ -61,6 +63,8 @@ static std::string server_model_source_to_string(server_model_source source) {
}
}
using buft_memory_map = std::map<ggml_backend_buffer_type_t, size_t>;
struct server_model_meta {
server_model_source source = SERVER_MODEL_SOURCE_CACHE;
common_preset preset;
@@ -70,6 +74,7 @@ struct server_model_meta {
int port = 0;
server_model_status status = SERVER_MODEL_STATUS_UNLOADED;
int64_t last_used = 0; // for LRU unloading
buft_memory_map bmm_req; // bytes required per buffer type
std::vector<std::string> args; // args passed to the model instance, will be populated by render_args()
json loaded_info; // info to be reflected via /v1/models endpoint ; if in DOWNLOADING state, it should contain download progress info
int exit_code = 0; // exit code of the model instance process (only valid if status == FAILED)
@@ -115,6 +120,13 @@ private:
std::condition_variable cv_stop;
std::set<std::string> stopping_models;
// background tasks for download/estimate/load pipelines, keyed by model name
// (entries are reaped by join_completed_bg_tasks() and drained by unload_all())
struct bg_task {
// thread running the task; joined (if joinable) when the task is reaped
std::thread th;
// completion flag, read with done.load() to decide whether the task can be reaped
// NOTE(review): presumably set by the task itself when it finishes - the writer is not visible here, confirm
std::atomic<bool> done{false};
};
std::map<std::string, std::unique_ptr<bg_task>> bg_tasks;
// set to true while load_models() is executing a reload; load() will wait until clear
bool is_reloading = false;
@@ -128,10 +140,16 @@ private:
std::vector<std::string> base_env;
common_preset base_preset; // base preset from llama-server CLI args
// available memory per buffer type
buft_memory_map bmm_available;
// buft name -> buft lookup (host buffer types map to CPU buft)
std::unordered_map<std::string, ggml_backend_buffer_type_t> buft_by_name;
void update_meta(const std::string & name, const server_model_meta & meta);
// unload least recently used models if the limit is reached
void unload_lru();
void unload_lru(const buft_memory_map & bmm_req);
// not thread-safe, caller must hold mutex
void add_model(server_model_meta && meta);
@@ -139,6 +157,21 @@ private:
// notify SSE clients
void notify_sse(const std::string & event, const std::string & model_id, const json & data = nullptr);
// return number of buffer types where the memory limit would be exceeded
// return 0 if the new model would fit
// not thread-safe, caller must hold mutex
int can_fit(const buft_memory_map & bmm_req) const;
// check if active model count or memory limits would be exceeded
// not thread-safe, caller must hold mutex
bool limits_exceeded(const buft_memory_map & bmm_req) const;
// estimate model memory by spawning a child process with --measure-only
// returns the buft memory map, or empty map on failure (caller must NOT hold mutex)
buft_memory_map estimate_model_memory(const std::string & name);
// join and remove completed background tasks
void join_completed_bg_tasks();
public:
server_models(const common_params & params, int argc, char ** argv);
+40
View File
@@ -11,6 +11,8 @@
#include "llama.h"
#include "log.h"
#include "../../src/llama-ext.h"
#include <atomic>
#include <clocale>
#include <exception>
@@ -120,6 +122,44 @@ int llama_server(int argc, char ** argv) {
// struct that contains llama context and inference
server_context ctx_server;
// --measure-only: report the model's memory requirements and exit without serving.
//
// The model is loaded with no_alloc (mmap/mlock off) and a context is created from it;
// every non-empty entry of llama_get_memory_breakdown() is then printed to stdout as
// `measure:<buft name> <bytes>`.
// Returns 0 on success, 1 when the model or the context could not be created.
//
// The model and context live in an inner scope so that both are released BEFORE
// llama_backend_free() runs, on the success path as well as on the failure paths.
if (params.measure_only) {
    int measure_res = 1;
    {
        llama_model_params mparams = common_model_params_to_llama(params);
        mparams.no_alloc  = true;
        mparams.use_mmap  = false;
        mparams.use_mlock = false;

        // declared in this order so the context is destroyed before the model
        llama_model_ptr   model{llama_model_load_from_file(params.model.path.c_str(), mparams)};
        llama_context_ptr ctx;

        if (!model) {
            LOG_ERR("%s: failed to load model for measurement\n", __func__);
        } else {
            llama_context_params cparams = common_context_params_to_llama(params);
            ctx.reset(llama_init_from_model(model.get(), cparams));
            if (!ctx) {
                LOG_ERR("%s: failed to create context for measurement\n", __func__);
            }
        }

        if (ctx) {
            // logging is paused while the measure lines are written
            // (presumably to keep log output from mixing into them)
            common_log_pause(common_log_main());
            for (const auto & [buft, data] : llama_get_memory_breakdown(ctx.get())) {
                size_t total = data.total();
                if (total > 0) {
                    fprintf(stdout, "measure:%s %zu\n", ggml_backend_buft_name(buft), total);
                }
            }
            fflush(stdout);
            common_log_resume(common_log_main());

            measure_res = 0;
        }
    }

    llama_backend_free();
    return measure_res;
}
LOG_INF("build_info: %s\n", llama_build_info());
LOG_INF("%s\n", common_params_get_system_info(params).c_str());
server_http_context ctx_http;
if (!ctx_http.init(params)) {
SRV_ERR("%s", "failed to initialize HTTP server\n");