Compare commits

..

3 Commits

Author SHA1 Message Date
Charles Xu
eff06702b2 kleidiai : update to v1.24.0 and use release archive (#22549) 2026-05-04 22:13:31 +03:00
leonardHONG
e77056f9b2 CUDA: use fastdiv for batch index split in get_rows (#22650) 2026-05-04 16:24:05 +02:00
Xuan-Son Nguyen
935a340292 server: implement /models?reload=1 (#21848) 2026-05-04 16:23:26 +02:00
7 changed files with 337 additions and 102 deletions

View File

@@ -578,13 +578,13 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
# Fetch KleidiAI sources:
include(FetchContent)
set(KLEIDIAI_COMMIT_TAG "v1.22.0")
set(KLEIDIAI_DOWNLOAD_URL "https://github.com/ARM-software/kleidiai/archive/refs/tags/${KLEIDIAI_COMMIT_TAG}.tar.gz")
set(KLEIDIAI_ARCHIVE_MD5 "54049037570ab0ee0a0d126b2ba5ece1")
set(KLEIDIAI_COMMIT_TAG "v1.24.0")
set(KLEIDIAI_DOWNLOAD_URL "https://github.com/ARM-software/kleidiai/releases/download/${KLEIDIAI_COMMIT_TAG}/kleidiai-${KLEIDIAI_COMMIT_TAG}-src.tar.gz")
set(KLEIDIAI_RELEASE_ARCHIVE_MD5 "2f02ebe29573d45813e671eb304f2a00")
set(KLEIDIAI_FETCH_ARGS
URL ${KLEIDIAI_DOWNLOAD_URL}
URL_HASH MD5=${KLEIDIAI_ARCHIVE_MD5}
URL_HASH MD5=${KLEIDIAI_RELEASE_ARCHIVE_MD5}
)
if (CMAKE_VERSION VERSION_GREATER_EQUAL "3.24")
list(APPEND KLEIDIAI_FETCH_ARGS DOWNLOAD_EXTRACT_TIMESTAMP NEW)

View File

@@ -6,17 +6,18 @@ template<int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
static __global__ void k_get_rows(
const void * __restrict__ src0, const int32_t * __restrict__ src1, dst_t * __restrict__ dst,
const int64_t ne00, /*const int64_t ne01, const int64_t ne02, const int64_t ne03,*/
/*const int64_t ne10,*/ const int64_t ne11, const int64_t ne12, /*const int64_t ne13,*/
/*const int64_t ne10,*/ const int64_t ne11, const uint3 ne12_fdv, /*const int64_t ne13,*/
/*const size_t s0,*/ const size_t s1, const size_t s2, const size_t s3,
/*const size_t nb00,*/ const size_t nb01, const size_t nb02, const size_t nb03,
const size_t s10, const size_t s11, const size_t s12/*, const size_t s13*/) {
for (int64_t z = blockIdx.z; z < ne11*ne12; z += gridDim.z) {
for (int64_t z = blockIdx.z; z < ne11*(int64_t)ne12_fdv.z; z += gridDim.z) {
for (int64_t i00 = 2*(blockIdx.y*blockDim.x + threadIdx.x); i00 < ne00; i00 += gridDim.y*blockDim.x) {
// The x and y dimensions of the grid are swapped because the maximum allowed grid size for x is higher.
const int i10 = blockIdx.x;
const int i11 = z / ne12; // TODO fastdiv
const int i12 = z % ne12;
const uint2 dm = fast_div_modulo((uint32_t)z, ne12_fdv);
const int i11 = dm.x;
const int i12 = dm.y;
const int i01 = src1[i10*s10 + i11*s11 + i12*s12];
@@ -42,17 +43,18 @@ template<typename src0_t, typename dst_t>
static __global__ void k_get_rows_float(
const src0_t * __restrict__ src0, const int32_t * __restrict__ src1, dst_t * __restrict__ dst,
const int64_t ne00, /*const int64_t ne01, const int64_t ne02, const int64_t ne03,*/
/*const int64_t ne10,*/ const int64_t ne11, const int64_t ne12, /*const int64_t ne13,*/
/*const int64_t ne10,*/ const int64_t ne11, const uint3 ne12_fdv, /*const int64_t ne13,*/
/*const size_t s0,*/ const size_t s1, const size_t s2, const size_t s3,
/*const size_t nb00,*/ const size_t nb01, const size_t nb02, const size_t nb03,
const size_t s10, const size_t s11, const size_t s12/*, const size_t s13*/) {
for (int64_t z = blockIdx.z; z < ne11*ne12; z += gridDim.z) {
for (int64_t z = blockIdx.z; z < ne11*(int64_t)ne12_fdv.z; z += gridDim.z) {
for (int64_t i00 = blockIdx.y*blockDim.x + threadIdx.x; i00 < ne00; i00 += gridDim.y*blockDim.x) {
// The x and y dimensions of the grid are swapped because the maximum allowed grid size for x is higher.
const int i10 = blockIdx.x;
const int i11 = z / ne12; // TODO fastdiv
const int i12 = z % ne12;
const uint2 dm = fast_div_modulo((uint32_t)z, ne12_fdv);
const int i11 = dm.x;
const int i12 = dm.y;
if (i00 >= ne00) {
return;
@@ -115,10 +117,14 @@ static void get_rows_cuda_q(
GGML_ASSERT(ne00 % 2 == 0);
GGML_ASSERT(ne12 > 0);
GGML_ASSERT(ne11 <= std::numeric_limits<uint32_t>::max() / ne12);
const uint3 ne12_fdv = init_fastdiv_values(ne12);
k_get_rows<qk, qr, dq><<<block_nums, block_dims, 0, stream>>>(
src0_d, src1_d, dst_d,
ne00, /*ne01, ne02, ne03,*/
/*ne10,*/ ne11, ne12, /*ne13,*/
/*ne10,*/ ne11, ne12_fdv, /*ne13,*/
/* s0,*/ s1, s2, s3,
/* nb00,*/ nb01, nb02, nb03,
s10, s11, s12/*, s13*/);
@@ -146,10 +152,14 @@ static void get_rows_cuda_float(
const size_t s12 = nb12 / sizeof(int32_t);
// const size_t s13 = nb13 / sizeof(int32_t);
GGML_ASSERT(ne12 > 0);
GGML_ASSERT(ne11 <= std::numeric_limits<uint32_t>::max() / ne12);
const uint3 ne12_fdv = init_fastdiv_values(ne12);
k_get_rows_float<<<block_nums, block_dims, 0, stream>>>(
src0_d, src1_d, dst_d,
ne00, /*ne01, ne02, ne03,*/
/*ne10,*/ ne11, ne12, /*ne13,*/
/*ne10,*/ ne11, ne12_fdv, /*ne13,*/
/* s0,*/ s1, s2, s3,
/* nb00,*/ nb01, nb02, nb03,
s10, s11, s12/*, s13*/);

View File

@@ -1646,7 +1646,11 @@ Listing all models in cache. The model metadata will also include a field to ind
}
```
Note: For a local GGUF (stored offline in a custom directory), the model object will have `"in_cache": false`.
Note:
1. For a local GGUF (stored offline in a custom directory), the model object will have `"in_cache": false`.
2. Adding `?reload=1` to the query params will refresh the list of models. The behavior is as follows:
- If a model is running but updated or removed from the source, it will be unloaded
- If a model is not running, it will be added or updated according to the source
The `status` object can be:

View File

@@ -243,9 +243,8 @@ void server_models::add_model(server_model_meta && meta) {
};
}
// TODO: allow refreshing cached model list
void server_models::load_models() {
// loading models from 3 sources:
// Phase 1: load presets from all sources — pure I/O, no lock needed
// 1. cached models
common_presets cached_models = ctx_preset.load_from_cache();
SRV_INF("Loaded %zu cached model presets\n", cached_models.size());
@@ -270,112 +269,266 @@ void server_models::load_models() {
// note: if a model exists in both cached and local, local takes precedence
common_presets final_presets;
for (const auto & [name, preset] : cached_models) {
final_presets[name] = preset;
}
for (const auto & [name, preset] : local_models) {
final_presets[name] = preset;
}
// process custom presets from INI
for (const auto & [name, preset] : cached_models) final_presets[name] = preset;
for (const auto & [name, preset] : local_models) final_presets[name] = preset;
for (const auto & [name, custom] : custom_presets) {
if (final_presets.find(name) != final_presets.end()) {
// apply custom config if exists
common_preset & target = final_presets[name];
target.merge(custom);
final_presets[name].merge(custom);
} else {
// otherwise add directly
final_presets[name] = custom;
}
}
// server base preset from CLI args take highest precedence
// server base preset from CLI args takes highest precedence
for (auto & [name, preset] : final_presets) {
preset.merge(base_preset);
}
// convert presets to server_model_meta and add to mapping
for (const auto & preset : final_presets) {
server_model_meta meta{
/* preset */ preset.second,
/* name */ preset.first,
/* aliases */ {},
/* tags */ {},
/* port */ 0,
/* status */ SERVER_MODEL_STATUS_UNLOADED,
/* last_used */ 0,
/* args */ std::vector<std::string>(),
/* exit_code */ 0,
/* stop_timeout */ DEFAULT_STOP_TIMEOUT,
};
add_model(std::move(meta));
}
// log available models
{
std::unordered_set<std::string> custom_names;
for (const auto & [name, preset] : custom_presets) {
custom_names.insert(name);
// Helpers that read `mapping` — must be called while holding the lock.
std::unordered_set<std::string> custom_names;
for (const auto & [name, preset] : custom_presets) custom_names.insert(name);
auto join_set = [](const std::set<std::string> & s) {
std::string result;
for (const auto & v : s) {
if (!result.empty()) result += ", ";
result += v;
}
auto join_set = [](const std::set<std::string> & s) {
std::string result;
for (const auto & v : s) {
if (!result.empty()) {
result += ", ";
}
result += v;
}
return result;
};
return result;
};
auto log_available_models = [&]() {
SRV_INF("Available models (%zu) (*: custom preset)\n", mapping.size());
for (const auto & [name, inst] : mapping) {
bool has_custom = custom_names.find(name) != custom_names.end();
std::string info;
if (!inst.meta.aliases.empty()) {
info += " (aliases: " + join_set(inst.meta.aliases) + ")";
}
if (!inst.meta.tags.empty()) {
info += " [tags: " + join_set(inst.meta.tags) + "]";
}
if (!inst.meta.aliases.empty()) info += " (aliases: " + join_set(inst.meta.aliases) + ")";
if (!inst.meta.tags.empty()) info += " [tags: " + join_set(inst.meta.tags) + "]";
SRV_INF(" %c %s%s\n", has_custom ? '*' : ' ', name.c_str(), info.c_str());
}
}
// handle custom stop-timeout option
for (auto & [name, inst] : mapping) {
std::string val;
if (inst.meta.preset.get_option(COMMON_ARG_PRESET_STOP_TIMEOUT, val)) {
try {
inst.meta.stop_timeout = std::stoi(val);
} catch (...) {
SRV_WRN("invalid stop-timeout value '%s' for model '%s', using default %d seconds\n",
val.c_str(), name.c_str(), DEFAULT_STOP_TIMEOUT);
inst.meta.stop_timeout = DEFAULT_STOP_TIMEOUT;
};
auto apply_stop_timeout = [&]() {
for (auto & [name, inst] : mapping) {
std::string val;
if (inst.meta.preset.get_option(COMMON_ARG_PRESET_STOP_TIMEOUT, val)) {
try {
inst.meta.stop_timeout = std::stoi(val);
} catch (...) {
SRV_WRN("invalid stop-timeout value '%s' for model '%s', using default %d seconds\n",
val.c_str(), name.c_str(), DEFAULT_STOP_TIMEOUT);
inst.meta.stop_timeout = DEFAULT_STOP_TIMEOUT;
}
}
}
}
};
// update_args() injects HOST/PORT/ALIAS, so strip them before comparing presets
auto preset_options_for_compare = [](common_preset p) {
p.unset_option("LLAMA_ARG_HOST");
p.unset_option("LLAMA_ARG_PORT");
p.unset_option("LLAMA_ARG_ALIAS");
return p.options;
};
// load any autoload models
std::vector<std::string> models_to_load;
for (const auto & [name, inst] : mapping) {
std::string val;
if (inst.meta.preset.get_option(COMMON_ARG_PRESET_LOAD_ON_STARTUP, val)) {
if (common_arg_utils::is_truthy(val)) {
// Phase 2: acquire the lock once for all mapping mutations.
// We temporarily release it only when calling functions that acquire it internally
// (unload, load) or when joining threads (the monitoring thread calls update_status
// which locks the mutex, so joining while holding it would deadlock).
std::unique_lock<std::mutex> lk(mutex);
bool is_first_load = mapping.empty();
if (is_first_load) {
// FIRST LOAD: add all models, then unlock for autoloading
for (const auto & [name, preset] : final_presets) {
server_model_meta meta{
/* preset */ preset,
/* name */ name,
/* aliases */ {},
/* tags */ {},
/* port */ 0,
/* status */ SERVER_MODEL_STATUS_UNLOADED,
/* last_used */ 0,
/* args */ std::vector<std::string>(),
/* exit_code */ 0,
/* stop_timeout */ DEFAULT_STOP_TIMEOUT,
};
add_model(std::move(meta));
}
apply_stop_timeout();
log_available_models();
std::vector<std::string> models_to_load;
for (const auto & [name, inst] : mapping) {
std::string val;
if (inst.meta.preset.get_option(COMMON_ARG_PRESET_LOAD_ON_STARTUP, val) && common_arg_utils::is_truthy(val)) {
models_to_load.push_back(name);
}
}
}
if ((int)models_to_load.size() > base_params.models_max) {
throw std::runtime_error(string_format(
"number of models to load on startup (%zu) exceeds models_max (%d)",
models_to_load.size(),
base_params.models_max
));
}
for (const auto & name : models_to_load) {
SRV_INF("(startup) loading model %s\n", name.c_str());
load(name);
if ((int)models_to_load.size() > base_params.models_max) {
throw std::runtime_error(string_format(
"number of models to load on startup (%zu) exceeds models_max (%d)",
models_to_load.size(), base_params.models_max));
}
lk.unlock();
for (const auto & name : models_to_load) {
SRV_INF("(startup) loading model %s\n", name.c_str());
load(name);
}
} else {
// RELOAD: diff the new preset list against the current mapping and reconcile
is_reloading = true;
// find running models whose source was removed or whose preset changed
std::vector<std::string> to_unload;
for (const auto & [name, inst] : mapping) {
if (!inst.meta.is_running()) continue;
auto it = final_presets.find(name);
if (it == final_presets.end()) {
to_unload.push_back(name); // removed from source
} else if (preset_options_for_compare(inst.meta.preset) != preset_options_for_compare(it->second)) {
to_unload.push_back(name); // preset changed
}
}
// unload() acquires the lock internally, so release before each call
for (const auto & name : to_unload) {
SRV_INF("(reload) unloading model name=%s (source updated or removed)\n", name.c_str());
lk.unlock();
unload(name);
lk.lock();
}
// wait for all targeted models to reach UNLOADED; cv.wait handles unlock/relock
cv.wait(lk, [&]() {
for (const auto & name : to_unload) {
auto it = mapping.find(name);
if (it != mapping.end() && it->second.meta.is_running()) return false;
}
return true;
});
// collect all threads to join in one pass while the lock is held:
// - monitoring threads from just-unloaded models (to_unload)
// - threads of already-UNLOADED models that are being removed from source
std::vector<std::thread> threads_to_join;
for (const auto & name : to_unload) {
auto it = mapping.find(name);
if (it != mapping.end() && it->second.th.joinable()) {
threads_to_join.push_back(std::move(it->second.th));
}
}
for (auto & [name, inst] : mapping) {
if (final_presets.find(name) == final_presets.end() && !inst.meta.is_running() && inst.th.joinable()) {
threads_to_join.push_back(std::move(inst.th));
}
}
// join outside the lock — monitoring thread calls update_status (needs lock)
lk.unlock();
for (auto & th : threads_to_join) th.join();
lk.lock();
// erase models no longer in any source
for (auto it = mapping.begin(); it != mapping.end(); ) {
if (final_presets.find(it->first) == final_presets.end()) {
SRV_INF("(reload) removing model name=%s (no longer in source)\n", it->first.c_str());
GGML_ASSERT(!it->second.th.joinable()); // must have been joined above
it = mapping.erase(it);
} else {
++it;
}
}
// update presets for non-running models still in source
for (auto & [name, inst] : mapping) {
if (inst.meta.is_running()) continue;
auto it = final_presets.find(name);
if (it == final_presets.end()) continue; // erased above
inst.meta.preset = it->second;
// re-parse aliases, then validate against other models
std::set<std::string> new_aliases;
std::string alias_str;
if (inst.meta.preset.get_option("LLAMA_ARG_ALIAS", alias_str) && !alias_str.empty()) {
for (auto & alias : string_split<std::string>(alias_str, ',')) {
alias = string_strip(alias);
if (!alias.empty()) new_aliases.insert(alias);
}
}
inst.meta.aliases.clear();
for (const auto & alias : new_aliases) {
bool conflict = false;
for (const auto & [other_name, other_inst] : mapping) {
if (other_name == name) continue;
if (other_name == alias || other_inst.meta.aliases.count(alias)) {
SRV_WRN("(reload) alias '%s' for model '%s' conflicts with model '%s', skipping\n",
alias.c_str(), name.c_str(), other_name.c_str());
conflict = true;
break;
}
}
if (!conflict) inst.meta.aliases.insert(alias);
}
// re-parse tags
inst.meta.tags.clear();
std::string tags_str;
if (inst.meta.preset.get_option("LLAMA_ARG_TAGS", tags_str) && !tags_str.empty()) {
for (auto & tag : string_split<std::string>(tags_str, ',')) {
tag = string_strip(tag);
if (!tag.empty()) inst.meta.tags.insert(tag);
}
}
inst.meta.exit_code = 0; // clear failed state so the model can be reloaded
inst.meta.update_args(ctx_preset, bin_path);
}
// add models that are new in this reload
std::vector<std::string> newly_added;
for (const auto & [name, preset] : final_presets) {
if (mapping.find(name) == mapping.end()) {
server_model_meta meta{
/* preset */ preset,
/* name */ name,
/* aliases */ {},
/* tags */ {},
/* port */ 0,
/* status */ SERVER_MODEL_STATUS_UNLOADED,
/* last_used */ 0,
/* args */ std::vector<std::string>(),
/* exit_code */ 0,
/* stop_timeout */ DEFAULT_STOP_TIMEOUT,
};
add_model(std::move(meta));
newly_added.push_back(name);
}
}
apply_stop_timeout();
// clear reload flag before unlocking for autoload — load() blocks on !is_reloading,
// so clearing it here (while still locked) prevents a deadlock in the autoload calls below
is_reloading = false;
cv.notify_all();
log_available_models();
// collect autoload candidates while still under the lock
std::vector<std::string> to_autoload;
for (const auto & name : newly_added) {
auto it = mapping.find(name);
if (it != mapping.end()) {
std::string val;
if (it->second.meta.preset.get_option(COMMON_ARG_PRESET_LOAD_ON_STARTUP, val) && common_arg_utils::is_truthy(val)) {
to_autoload.push_back(name);
}
}
}
lk.unlock();
for (const auto & name : to_autoload) {
SRV_INF("(reload) loading new model %s\n", name.c_str());
load(name);
}
}
}
@@ -536,7 +689,10 @@ void server_models::load(const std::string & name) {
}
unload_lru();
std::lock_guard<std::mutex> lk(mutex);
std::unique_lock<std::mutex> lk(mutex);
// edge case: block until any in-progress reload has finished so we always load
// against the freshest preset and a consistent mapping state
cv.wait(lk, [this]() { return !is_reloading; });
auto meta = mapping[name].meta;
if (meta.status != SERVER_MODEL_STATUS_UNLOADED) {
@@ -993,7 +1149,11 @@ void server_models_routes::init_routes() {
return res;
};
this->get_router_models = [this](const server_http_req &) {
this->get_router_models = [this](const server_http_req & req) {
bool reload = !req.get_param("reload", "").empty();
if (reload) {
models.load_models();
}
auto res = std::make_unique<server_http_res>();
json models_json = json::array();
auto all_models = models.get_all_meta();

View File

@@ -100,6 +100,9 @@ private:
std::condition_variable cv_stop;
std::set<std::string> stopping_models;
// set to true while load_models() is executing a reload; load() will wait until clear
bool is_reloading = false;
common_preset_context ctx_preset;
common_params base_params;
@@ -118,6 +121,11 @@ private:
public:
server_models(const common_params & params, int argc, char ** argv);
// (re-)load the list of models from various sources and prepare the metadata mapping
// - if this is called the first time, simply populate the metadata
// - if this is called subsequently (e.g. when refreshing from disk):
// - if a model is running but updated or removed from the source, it will be unloaded
// - if a model is not running, it will be added or updated according to the source
void load_models();
// check if a model instance exists (thread-safe)

View File

@@ -62,6 +62,12 @@ def test_router_chat_completion_stream(model: str, success: bool):
assert content == ""
def _get_model_ids(is_reload: bool) -> set[str]:
    # Query GET /models (with ?reload=1 when is_reload is set, which makes the
    # server re-read the model sources) and return the set of model ids listed.
    res = server.make_request("GET", "/models" + ("?reload=1" if is_reload else ""))
    assert res.status_code == 200
    return {item["id"] for item in res.body.get("data", [])}
def _get_model_status(model_id: str) -> str:
res = server.make_request("GET", "/models")
assert res.status_code == 200
@@ -205,3 +211,45 @@ def test_router_api_key_required():
)
assert authed.status_code == 200
assert "error" not in authed.body
def test_router_reload_models():
    """GET /models?reload=1 re-reads the INI preset and updates the model list."""
    global server
    preset_path = os.path.join(TMP_DIR, "test_reload.ini")
    # Initial preset: two models (a and b)
    with open(preset_path, "w") as f:
        f.write(
            "[model-reload-a]\n"
            "hf-repo = ggml-org/test-model-stories260K\n"
            "\n"
            "[model-reload-b]\n"
            "hf-repo = ggml-org/test-model-stories260K-infill\n"
        )
    server.models_preset = preset_path
    server.start()
    # Sanity check: both initial models are listed without a reload
    ids = _get_model_ids(is_reload=False)
    assert "model-reload-a" in ids
    assert "model-reload-b" in ids
    # Updated preset: remove a, keep b unchanged, add c
    with open(preset_path, "w") as f:
        f.write(
            "[model-reload-b]\n"
            "hf-repo = ggml-org/test-model-stories260K-infill\n"
            "\n"
            "[model-reload-c]\n"
            "hf-repo = ggml-org/test-model-stories260K\n"
        )
    # try/finally so the temporary preset file is removed even if an assert fails
    try:
        ids = _get_model_ids(is_reload=True)
        assert "model-reload-a" not in ids, "removed model should no longer appear"
        assert "model-reload-b" in ids, "unchanged model should still appear"
        assert "model-reload-c" in ids, "newly added model should appear"
    finally:
        os.remove(preset_path)

View File

@@ -5,6 +5,8 @@
import subprocess
import os
TMP_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "tmp")
import re
import json
from json import JSONDecodeError
@@ -86,6 +88,7 @@ class ServerProcess:
api_key: str | None = None
models_dir: str | None = None
models_max: int | None = None
models_preset: str | None = None
no_models_autoload: bool | None = None
lora_files: List[str] | None = None
enable_ctx_shift: int | None = False
@@ -156,6 +159,8 @@ class ServerProcess:
server_args.extend(["--models-dir", self.models_dir])
if self.models_max is not None:
server_args.extend(["--models-max", self.models_max])
if self.models_preset:
server_args.extend(["--models-preset", self.models_preset])
if self.n_batch:
server_args.extend(["--batch-size", self.n_batch])
if self.n_ubatch: