Compare commits

..

10 Commits

Author SHA1 Message Date
Adrien Gallouët
84de01a1f1 llama : use LLM_KV for quantization_version & file_type (#24802)
Signed-off-by: Adrien Gallouët <angt@huggingface.co>
2026-06-20 20:07:01 +02:00
Xuan-Son Nguyen
75f460ac28 arg: try fixing test-args-parser randomly fails (#24826)
* arg: try fixing test-args-parser randomly fails

* return ref

* try triggering the workflow

* exception wrapper

* wip

* test

* test 2

* arg: guard win32 utf8 argv override

make_utf8_argv rebuilds argv from GetCommandLineW to fix utf8 handling of
non ascii arguments on windows. the override runs unconditionally inside
common_params_parse, so it also clobbers a programmatic argv passed by a
caller. test-arg-parser builds a synthetic argv but then sees the real
process command line instead, the model argument is never parsed, and the
assert that expects success aborts via fastfail (0xC0000409). this shows up
as a random failure in the openvino windows workflow.

only override argv when its length matches the caller argc, so the utf8
repair still applies to real binaries while a programmatic argv stays intact.

---------

Co-authored-by: Pascal <admin@serveurperso.com>
2026-06-20 19:45:27 +02:00
Muhammad Salem
8452824611 release: add missing link for win opencl adreno arm64 (#24809) 2026-06-20 23:08:59 +08:00
Matti4
e27f308597 server: avoid forwarding auth headers in CORS proxy (#24373)
* server: avoid forwarding auth headers in CORS proxy

* format

* fix test

* fix e2e test

---------

Co-authored-by: Xuan Son Nguyen <son@huggingface.co>
2026-06-20 15:34:47 +02:00
Aldehir Rojas
67e9fd3b74 docker : prebuild web UI for s390x build [no release] (#24829) 2026-06-20 05:54:42 -05:00
davidrhodus
796f41bedc model : glm-dsa load DSA indexer tensors as optional (#24770)
GLM-5.2 ships the DSA "lightning indexer" on only a subset of layers (the
"full" layers; others omit it), but the GLM_DSA loader created the five
indexer tensors on every layer as required, so loading any GLM-5.2 GGUF
failed with e.g. `missing tensor 'blk.3.indexer.k_norm.weight'`.

GLM_DSA's graph is llama_model_deepseek2::graph (plain MLA) and does not use
the indexer tensors (indexer runtime not yet implemented), so they are
loaded-but-unused. Marking them TENSOR_NOT_REQUIRED lets layers without an
indexer load as nullptr and the model runs as full MLA attention.

DeepSeek-V3.2 (uniform indexer on all layers) is unaffected.
2026-06-20 13:48:24 +03:00
Adrien Gallouët
37a77fb057 ggml : optimize AMX (#24806)
Flatten the partition over n_batch * M so every thread participates in
the quantization

    | CPU                             | Model                         | Test   |   t/s OLD |   t/s NEW |   Speedup |
    |:--------------------------------|:------------------------------|:-------|----------:|----------:|----------:|
    | Intel(R) Xeon(R) Platinum 8488C | qwen35 0.8B IQ4_NL - 4.5 bpw  | pp512  |    730.71 |    779.86 |      1.07 |
    | Intel(R) Xeon(R) Platinum 8488C | qwen35 0.8B IQ4_NL - 4.5 bpw  | tg128  |     87.88 |     86.79 |      0.99 |
    | Intel(R) Xeon(R) Platinum 8488C | qwen35 0.8B IQ4_XS - 4.25 bpw | pp512  |    725.09 |   1023.31 |      1.41 |
    | Intel(R) Xeon(R) Platinum 8488C | qwen35 0.8B IQ4_XS - 4.25 bpw | tg128  |     83.64 |     83.62 |      1.00 |
    | Intel(R) Xeon(R) Platinum 8488C | qwen35 0.8B Q4_0              | pp512  |    820.51 |    924.05 |      1.13 |
    | Intel(R) Xeon(R) Platinum 8488C | qwen35 0.8B Q4_0              | tg128  |     90.59 |     92.46 |      1.02 |
    | Intel(R) Xeon(R) Platinum 8488C | qwen35 0.8B Q4_1              | pp512  |    776.88 |    872.79 |      1.12 |
    | Intel(R) Xeon(R) Platinum 8488C | qwen35 0.8B Q4_1              | tg128  |     89.39 |     90.94 |      1.02 |
    | Intel(R) Xeon(R) Platinum 8488C | qwen35 0.8B Q4_K_M            | pp512  |    719.28 |   1009.27 |      1.40 |
    | Intel(R) Xeon(R) Platinum 8488C | qwen35 0.8B Q4_K_M            | tg128  |     80.62 |     80.86 |      1.00 |
    | Intel(R) Xeon(R) Platinum 8488C | qwen35 0.8B Q4_K_S            | pp512  |    732.29 |   1077.29 |      1.47 |
    | Intel(R) Xeon(R) Platinum 8488C | qwen35 0.8B Q4_K_S            | tg128  |     86.42 |     83.53 |      0.97 |

Signed-off-by: Adrien Gallouët <angt@huggingface.co>
2026-06-20 13:43:06 +03:00
Sigbjørn Skjæret
f4043fec01 convert : more consistent handling of rope_parameters (#24833) 2026-06-20 13:42:36 +03:00
Masashi Yoshimura
f449e05537 ggml-webgpu: add adapter toggles for F16 on Vulkan + NVIDIA 2026-06-20 08:12:32 +09:00
Xuan-Son Nguyen
2b686a9120 server: refactor child --> router communication (#24821)
* server: refactor child --> router communication

* fix wakeup case

* add docs

* improve update_status()

* nits
2026-06-20 01:02:26 +02:00
41 changed files with 459 additions and 213 deletions

View File

@@ -4,20 +4,6 @@ ARG BUILD_DATE=N/A
ARG APP_VERSION=N/A
ARG APP_REVISION=N/A
ARG NODE_VERSION=24
FROM docker.io/node:$NODE_VERSION AS web
ARG APP_VERSION
WORKDIR /app/tools/ui
COPY tools/ui/package.json tools/ui/package-lock.json ./
RUN npm ci
COPY tools/ui/ ./
RUN LLAMA_BUILD_NUMBER="$APP_VERSION" npm run build
### Build Llama.cpp stage
FROM docker.io/gcc:${GCC_VERSION} AS build
@@ -34,8 +20,6 @@ RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
WORKDIR /app
COPY . .
COPY --from=web /app/tools/ui/dist tools/ui/dist
RUN --mount=type=cache,target=/root/.ccache \
--mount=type=cache,target=/app/build \
cmake -S . -B build -G Ninja \

View File

@@ -11,7 +11,6 @@
build*/
tools/ui/node_modules/
tools/ui/dist/
models/*

View File

@@ -58,6 +58,13 @@ jobs:
git tag ${{ steps.srctag.outputs.name }} || exit 0
git push origin ${{ steps.srctag.outputs.name }} || exit 0
build_ui:
name: Build UI
needs: create_tag
uses: ./.github/workflows/ui-build.yml
with:
hf_ui_version: ${{ needs.create_tag.outputs.source_tag }}
prepare_matrices:
name: Prepare Docker matrices
runs-on: ubuntu-24.04
@@ -79,7 +86,7 @@ jobs:
[
{ "tag": "cpu", "dockerfile": ".devops/cpu.Dockerfile", "platforms": "linux/amd64", "full": true, "light": true, "server": true, "free_disk_space": false, "runs_on": "ubuntu-24.04" },
{ "tag": "cpu", "dockerfile": ".devops/cpu.Dockerfile", "platforms": "linux/arm64", "full": true, "light": true, "server": true, "free_disk_space": false, "runs_on": "ubuntu-24.04-arm" },
{ "tag": "cpu", "dockerfile": ".devops/s390x.Dockerfile", "platforms": "linux/s390x", "full": true, "light": true, "server": true, "free_disk_space": false, "runs_on": "ubuntu-24.04-s390x" },
{ "tag": "cpu", "dockerfile": ".devops/s390x.Dockerfile", "platforms": "linux/s390x", "full": true, "light": true, "server": true, "free_disk_space": false, "runs_on": "ubuntu-24.04-s390x", "prebuilt_ui": true },
{ "tag": "cuda cuda12", "dockerfile": ".devops/cuda.Dockerfile", "cuda_version": "12.8.1", "platforms": "linux/amd64", "full": true, "light": true, "server": true, "free_disk_space": true, "runs_on": "ubuntu-24.04" },
{ "tag": "cuda cuda12", "dockerfile": ".devops/cuda.Dockerfile", "cuda_version": "12.8.1", "platforms": "linux/arm64", "full": true, "light": true, "server": true, "free_disk_space": true, "runs_on": "ubuntu-24.04-arm" },
{ "tag": "cuda13", "dockerfile": ".devops/cuda.Dockerfile", "cuda_version": "13.3.0", "platforms": "linux/amd64", "full": true, "light": true, "server": true, "free_disk_space": true, "runs_on": "ubuntu-24.04" },
@@ -135,7 +142,7 @@ jobs:
push_to_registry:
name: Push Docker image to Docker Registry
needs: [prepare_matrices, create_tag]
needs: [prepare_matrices, create_tag, build_ui]
runs-on: ${{ matrix.config.runs_on }}
strategy:
@@ -150,6 +157,13 @@ jobs:
fetch-depth: 0
ref: ${{ needs.create_tag.outputs.source_tag }}
- name: Download prebuilt UI
if: ${{ matrix.config.prebuilt_ui == true }}
uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # v8
with:
name: ui-build
path: tools/ui/dist
- name: Set up QEMU
if: ${{ contains(matrix.config.platforms, 'linux/amd64') }}
uses: docker/setup-qemu-action@ce360397dd3f832beb865e1373c09c0e9f86d70a # v4

View File

@@ -1627,6 +1627,7 @@ jobs:
**Windows:**
- [Windows x64 (CPU)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-cpu-x64.zip)
- [Windows arm64 (CPU)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-cpu-arm64.zip)
- [Windows arm64 (OpenCL Adreno)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-opencl-adreno-arm64.zip)
- [Windows x64 (CUDA 12)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-cuda-12.4-x64.zip) - [CUDA 12.4 DLLs](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/cudart-llama-bin-win-cuda-12.4-x64.zip)
- [Windows x64 (CUDA 13)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-cuda-13.3-x64.zip) - [CUDA 13.3 DLLs](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/cudart-llama-bin-win-cuda-13.3-x64.zip)
- [Windows x64 (Vulkan)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-vulkan-x64.zip)

View File

@@ -303,7 +303,6 @@ static handle_model_result common_params_handle_model(struct common_params_model
if (!model.docker_repo.empty()) {
model.path = common_docker_resolve_model(model.docker_repo);
model.name = model.docker_repo;
} else if (!model.hf_repo.empty()) {
// If -m was used with -hf, treat the model "path" as the hf_file to download
if (model.hf_file.empty() && !model.path.empty()) {
@@ -323,7 +322,6 @@ static handle_model_result common_params_handle_model(struct common_params_model
throw std::runtime_error("failed to download model from Hugging Face");
}
model.name = model.hf_repo;
model.path = download_result.model_path;
if (!download_result.mmproj_path.empty()) {
@@ -926,8 +924,8 @@ static utf8_argv make_utf8_argv() {
bool common_params_parse(int argc, char ** argv, common_params & params, llama_example ex, void(*print_usage)(int, char **)) {
#ifdef _WIN32
auto utf8 = make_utf8_argv();
if (!utf8.ptrs.empty()) {
argc = static_cast<int>(utf8.buf.size());
// repair argv only when it matches the process command line
if (static_cast<int>(utf8.buf.size()) == argc) {
argv = utf8.ptrs.data();
}
#endif
@@ -2899,7 +2897,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.server_tools = parse_csv_row(value);
}
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_TOOLS"));
add_opt(common_arg(
add_opt(common_arg(
{"-ag", "--agent"},
{"-no-ag", "--no-agent"},
"whether to enable CORS proxy and all built-in tools - do not enable in untrusted environments (default: disabled)",

View File

@@ -295,7 +295,16 @@ struct common_params_model {
std::string hf_repo = ""; // HF repo // NOLINT
std::string hf_file = ""; // HF file // NOLINT
std::string docker_repo = ""; // Docker repo // NOLINT
std::string name = ""; // in format <user>/<model>[:<tag>] (tag is optional) // NOLINT
std::string get_name() {
if (!hf_repo.empty()) {
return hf_repo;
}
if (!docker_repo.empty()) {
return docker_repo;
}
return path;
}
};
// draft-model-based speculative decoding parameters

View File

@@ -126,7 +126,7 @@ class BailingMoeV2Model(TextModel):
if (rope_dim := hparams.get("head_dim")) is None:
rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]
self.gguf_writer.add_rope_dimension_count(int(rope_dim * self.hparams.get("partial_rotary_factor", 0.5)))
self.gguf_writer.add_rope_dimension_count(int(rope_dim * self.rope_parameters.get("partial_rotary_factor", 0.5)))
self.gguf_writer.add_leading_dense_block_count(hparams["first_k_dense_replace"])
self.gguf_writer.add_vocab_size(hparams["vocab_size"])
self.gguf_writer.add_expert_feed_forward_length(hparams["moe_intermediate_size"])

View File

@@ -1119,8 +1119,10 @@ class TextModel(ModelBase):
rope_theta = self.find_hparam(["global_rope_theta", "rope_global_theta", "rope_theta_global", "rope_theta", "rotary_emb_base"], optional=True)
local_rope_theta = self.find_hparam(["local_rope_theta", "rope_local_theta", "rope_theta_local", "swa_rope_theta", "rope_local_base_freq"], optional=True)
partial_rotary_factor = self.find_hparam(["partial_rotary_factor", "rope_pct", "rope_percent"], optional=True)
original_max_position_embeddings = self.find_hparam(["original_max_position_embeddings"], optional=True)
# Ensure "rope_theta" and "rope_type" is mirrored in rope_parameters
# Ensure global params are mirrored in rope_parameters
if "full_attention" not in self.rope_parameters and "sliding_attention" not in self.rope_parameters:
if local_rope_theta is not None:
self.rope_parameters["sliding_attention"] = {"rope_theta": local_rope_theta}
@@ -1128,6 +1130,10 @@ class TextModel(ModelBase):
self.rope_parameters["rope_theta"] = rope_theta
if "rope_type" not in self.rope_parameters and (rope_type := self.rope_parameters.get("type")) is not None:
self.rope_parameters["rope_type"] = rope_type
if "partial_rotary_factor" not in self.rope_parameters and partial_rotary_factor is not None:
self.rope_parameters["partial_rotary_factor"] = partial_rotary_factor
if "original_max_position_embeddings" not in self.rope_parameters and original_max_position_embeddings is not None:
self.rope_parameters["original_max_position_embeddings"] = original_max_position_embeddings
@classmethod
def __init_subclass__(cls):

View File

@@ -148,7 +148,7 @@ class ChatGLMModel(TextModel):
rope_dim = self.hparams["attention_dim"]
else:
rope_dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
self.gguf_writer.add_rope_dimension_count(int(rope_dim * self.hparams.get("partial_rotary_factor", 0.5)))
self.gguf_writer.add_rope_dimension_count(int(rope_dim * self.rope_parameters.get("partial_rotary_factor", 0.5)))
self.gguf_writer.add_add_bos_token(False)
rope_freq = 10000
if "rope_ratio" in self.hparams:

View File

@@ -161,7 +161,7 @@ class DeciModel(TextModel):
factor = rope_params.get("factor", 8.0)
low_freq_factor = rope_params.get("low_freq_factor", 1.0)
high_freq_factor = rope_params.get("high_freq_factor", 4.0)
old_context_len = self.hparams.get("original_max_position_embeddings", 8192)
old_context_len = rope_params.get("original_max_position_embeddings", 8192)
low_freq_wavelen = old_context_len / low_freq_factor
high_freq_wavelen = old_context_len / high_freq_factor

View File

@@ -24,7 +24,7 @@ class ExaoneModel(TextModel):
assert (hparams["activation_function"] == "silu")
rotary_factor = self.find_hparam(["partial_rotary_factor", "rope_pct"], optional=True)
rotary_factor = self.rope_parameters.get("partial_rotary_factor")
rotary_factor = rotary_factor if rotary_factor is not None else 1.0
self.gguf_writer.add_rope_dimension_count(int(rotary_factor * (hparams["hidden_size"] // hparams["num_attention_heads"])))
@@ -39,7 +39,7 @@ class ExaoneModel(TextModel):
factor = rope_params.get("factor", 8.0)
low_freq_factor = rope_params.get("low_freq_factor", 1.0)
high_freq_factor = rope_params.get("high_freq_factor", 4.0)
old_context_len = self.hparams.get("original_max_position_embeddings", 8192)
old_context_len = rope_params.get("original_max_position_embeddings", 8192)
low_freq_wavelen = old_context_len / low_freq_factor
high_freq_wavelen = old_context_len / high_freq_factor
@@ -104,7 +104,7 @@ class Exaone4Model(TextModel):
factor = rope_params.get("factor", 16.0)
low_freq_factor = rope_params.get("low_freq_factor", 1.0)
high_freq_factor = rope_params.get("high_freq_factor", 4.0)
old_context_len = self.hparams.get("original_max_position_embeddings", 8192)
old_context_len = rope_params.get("original_max_position_embeddings", 8192)
low_freq_wavelen = old_context_len / low_freq_factor
high_freq_wavelen = old_context_len / high_freq_factor

View File

@@ -693,7 +693,7 @@ class Gemma4Model(Gemma3Model):
self.gguf_writer.add_head_count_kv(value_arr)
# handle n_rot differently for global vs swa layers
partial_rotary_factor_swa = self.hparams.get("partial_rotary_factor", 1.0)
partial_rotary_factor_swa = self.rope_parameters.get("partial_rotary_factor", 1.0)
n_rot_full = int(head_dim_full) # "proportional" is used, see generate_extra_tensors
n_rot_swa = int(head_dim_swa * partial_rotary_factor_swa)
self.gguf_writer.add_rope_dimension_count(n_rot_full)

View File

@@ -124,7 +124,7 @@ class Glm4MoeModel(TextModel):
self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
)
self.gguf_writer.add_rope_dimension_count(
int(rope_dim * self.hparams.get("partial_rotary_factor", 0.5))
int(rope_dim * self.rope_parameters.get("partial_rotary_factor", 0.5))
)
# MoE parameters - Use only routed expert count (shared experts handled separately)
@@ -226,7 +226,7 @@ class GlmMoeDsaModel(DeepseekV2Model):
super().set_gguf_parameters()
rope_dim = self.hparams["qk_rope_head_dim"]
partial_rotary_factor = self.hparams.get("partial_rotary_factor", 1.0)
partial_rotary_factor = self.rope_parameters.get("partial_rotary_factor", 1.0)
self.gguf_writer.add_rope_dimension_count(int(rope_dim * partial_rotary_factor))
# NextN/MTP prediction layers

View File

@@ -289,7 +289,7 @@ class LlamaModel(TextModel):
factor = rope_params.get("factor", 8.0)
low_freq_factor = rope_params.get("low_freq_factor", 1.0)
high_freq_factor = rope_params.get("high_freq_factor", 4.0)
old_context_len = self.hparams.get("original_max_position_embeddings", 8192)
old_context_len = rope_params.get("original_max_position_embeddings", 8192)
low_freq_wavelen = old_context_len / low_freq_factor
high_freq_wavelen = old_context_len / high_freq_factor

View File

@@ -154,7 +154,7 @@ class MimoV2Model(TextModel):
self.gguf_writer.add_expert_count(self.hparams["n_routed_experts"])
self.gguf_writer.add_expert_feed_forward_length(self.hparams["moe_intermediate_size"])
rope_dim = int(self.hparams["head_dim"] * self.hparams["partial_rotary_factor"])
rope_dim = int(self.hparams["head_dim"] * self.rope_parameters["partial_rotary_factor"])
self.gguf_writer.add_rope_dimension_count(rope_dim)
self.gguf_writer.add_layer_norm_rms_eps(self.hparams.get("layernorm_epsilon", 1e-5))

View File

@@ -32,11 +32,9 @@ class MiniCPMModel(TextModel):
def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
rope_dims = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
rope_scaling = self.find_hparam(['rope_scaling'], True)
if rope_scaling is not None:
long_factors = rope_scaling.get('long_factor', None)
short_factors = rope_scaling.get('short_factor', None)
long_factors = self.rope_parameters.get('long_factor')
short_factors = self.rope_parameters.get('short_factor')
if long_factors or short_factors:
if long_factors is None or short_factors is None:
raise KeyError('Missing the required key rope_scaling.long_factor or rope_scaling_short_factor')
@@ -85,13 +83,11 @@ class MiniCPM3Model(TextModel):
self.gguf_writer.add_rope_dimension_count(hparams["qk_rope_head_dim"])
def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
rope_scaling = self.find_hparam(['rope_scaling'], True)
if rope_scaling is not None:
long_factors = self.rope_parameters.get('long_factor')
short_factors = self.rope_parameters.get('short_factor')
if long_factors or short_factors:
rope_dims = self.hparams["qk_rope_head_dim"]
long_factors = rope_scaling.get('long_factor', None)
short_factors = rope_scaling.get('short_factor', None)
if long_factors is None or short_factors is None:
raise KeyError('Missing the required key rope_scaling.long_factor or rope_scaling_short_factor')

View File

@@ -125,17 +125,18 @@ class NemotronModel(TextModel):
self.gguf_writer.add_layer_norm_eps(f_norm_eps)
# * Partial RoPE
rot_pct = self.find_hparam(["partial_rotary_factor", "rope_pct", "rope_percent"])
rot_pct = self.rope_parameters["partial_rotary_factor"]
n_embd = self.find_hparam(["hidden_size", "n_embd"])
n_head = self.find_hparam(["num_attention_heads", "n_head"])
self.gguf_writer.add_rope_dimension_count(int(rot_pct * n_embd) // n_head)
# * RopeScaling for Nemotron
if "rope_scaling" not in self.hparams or self.hparams["rope_scaling"] is None:
factor = self.hparams.get("factor") or self.rope_parameters.get("factor")
if factor is None:
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
else:
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
self.gguf_writer.add_rope_scaling_factor(self.hparams["factor"])
self.gguf_writer.add_rope_scaling_factor(factor)
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
# * Adding +1 to LayerNorm's weights here to implement layernorm1p w/o changing anything on the GGML engine side

View File

@@ -18,7 +18,7 @@ class Phi2Model(TextModel):
model_arch = gguf.MODEL_ARCH.PHI2
def set_gguf_parameters(self):
rot_pct = self.find_hparam(["partial_rotary_factor"])
rot_pct = self.rope_parameters["partial_rotary_factor"]
n_embd = self.find_hparam(["hidden_size", "n_embd"])
n_head = self.find_hparam(["num_attention_heads", "n_head"])
@@ -149,8 +149,8 @@ class Phi3MiniModel(TextModel):
n_head_kv = self.find_hparam(["num_key_value_heads", "n_head_kv"])
rms_eps = self.find_hparam(["rms_norm_eps"])
max_pos_embds = self.find_hparam(["n_positions", "max_position_embeddings"])
orig_max_pos_embds = self.find_hparam(["original_max_position_embeddings"])
rot_pct = self.hparams.get("partial_rotary_factor", 1.0)
orig_max_pos_embds = self.rope_parameters["original_max_position_embeddings"]
rot_pct = self.rope_parameters.get("partial_rotary_factor", 1.0)
rope_dims = int(rot_pct * n_embd) // n_head
self.gguf_writer.add_context_length(max_pos_embds)
@@ -174,18 +174,19 @@ class Phi3MiniModel(TextModel):
n_embd = self.find_hparam(["hidden_size", "n_embd"])
n_head = self.find_hparam(["num_attention_heads", "n_head"])
max_pos_embds = self.find_hparam(["n_positions", "max_position_embeddings"])
orig_max_pos_embds = self.find_hparam(["original_max_position_embeddings"])
rot_pct = self.hparams.get("partial_rotary_factor", 1.0)
orig_max_pos_embds = self.rope_parameters["original_max_position_embeddings"]
rot_pct = self.rope_parameters.get("partial_rotary_factor", 1.0)
rope_dims = int(rot_pct * n_embd) // n_head
# write rope scaling for long context (128k) model
rope_scaling = self.find_hparam(['rope_scaling'], True)
if rope_scaling is None:
long_factors = self.rope_parameters.get('long_factor')
short_factors = self.rope_parameters.get('short_factor')
if not long_factors:
return
scale = max_pos_embds / orig_max_pos_embds
rope_scaling_type = rope_scaling.get('rope_type', rope_scaling.get('type', '')).lower()
rope_scaling_type = self.rope_parameters.get('rope_type', '').lower()
if len(rope_scaling_type) == 0:
raise KeyError('Missing the required key rope_scaling.type')
@@ -198,9 +199,6 @@ class Phi3MiniModel(TextModel):
self.gguf_writer.add_rope_scaling_attn_factors(attn_factor)
long_factors = rope_scaling.get('long_factor', None)
short_factors = rope_scaling.get('short_factor', None)
if long_factors is None or short_factors is None:
raise KeyError('Missing the required key rope_scaling.long_factor or rope_scaling_short_factor')

View File

@@ -280,7 +280,7 @@ class Qwen3NextModel(Qwen2MoeModel):
self.gguf_writer.add_full_attention_interval(self.hparams.get("full_attention_interval", 4))
if (rope_dim := self.hparams.get("head_dim")) is None:
rope_dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
self.gguf_writer.add_rope_dimension_count(int(rope_dim * self.hparams.get("partial_rotary_factor", 0.25)))
self.gguf_writer.add_rope_dimension_count(int(rope_dim * self.rope_parameters.get("partial_rotary_factor", 0.25)))
@classmethod
def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None:

View File

@@ -28,7 +28,7 @@ class StableLMModel(TextModel):
self.gguf_writer.add_embedding_length(hparams["hidden_size"])
self.gguf_writer.add_block_count(self.block_count)
self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
rotary_factor = self.find_hparam(["partial_rotary_factor", "rope_pct"])
rotary_factor = self.rope_parameters["partial_rotary_factor"]
self.gguf_writer.add_rope_dimension_count(int(rotary_factor * (hparams["hidden_size"] // hparams["num_attention_heads"])))
self.gguf_writer.add_head_count(hparams["num_attention_heads"])
self.gguf_writer.add_head_count_kv(hparams["num_key_value_heads"])

View File

@@ -314,7 +314,7 @@ class Step35Model(TextModel):
factor = float(rope_params.get("factor", 8.0))
low_freq_factor = float(rope_params.get("low_freq_factor", 1.0))
high_freq_factor = float(rope_params.get("high_freq_factor", 4.0))
old_context_len = int(rope_params.get("original_max_position_embeddings", self.hparams.get("original_max_position_embeddings", 8192)))
old_context_len = int(rope_params.get("original_max_position_embeddings", 8192))
low_freq_wavelen = old_context_len / low_freq_factor
high_freq_wavelen = old_context_len / high_freq_factor

View File

@@ -2417,15 +2417,14 @@ void ggml_backend_amx_mul_mat(const ggml_compute_params * params, struct ggml_te
// Q4_K, Q5_K, Q6_K, IQ4_XS handles 8 TILE_K per blck_size
GGML_ASSERT(TILE_K == blck_size || TILE_K * 8 == blck_size);
parallel_for_ggml(params, n_batch, [&](int begin, int end) {
for (int batch_idx = begin; batch_idx < end; ++batch_idx) {
parallel_for_ggml(params, n_batch * M, [&](int begin, int end) {
for (int idx = begin; idx < end; ++idx) {
int batch_idx = idx / M;
int m = idx % M;
int64_t src1_offset = ggml_batch_offset(src1, batch_idx, ne2);
const float * A_data = (const float *)((const char *)src1->data + src1_offset);
char * wdata_batch = (char *)wdata + batch_idx * M * row_size_A;
for (int m = 0; m < M; ++m) {
from_float<vec_dot_type>(A_data + m * K, wdata_batch + m * row_size_A, K);
}
from_float<vec_dot_type>(A_data + m * K, wdata_batch + m * row_size_A, K);
}
});
});

View File

@@ -3788,7 +3788,7 @@ static void ggml_webgpu_init_memset_pipeline(webgpu_global_context & ctx) {
ctx->memset_pipeline = ggml_webgpu_create_pipeline(ctx->device, wgsl_memset, "memset", constants);
}
static void create_webgpu_device(ggml_backend_webgpu_reg_context * ctx) {
static void ggml_backend_webgpu_request_adapter(wgpu::Instance & instance, wgpu::Adapter & adapter) {
wgpu::RequestAdapterOptions options = {};
#ifndef __EMSCRIPTEN__
@@ -3800,17 +3800,20 @@ static void create_webgpu_device(ggml_backend_webgpu_reg_context * ctx) {
options.nextInChain = &adapterTogglesDesc;
#endif
ctx->webgpu_global_ctx->instance.WaitAny(
ctx->webgpu_global_ctx->instance.RequestAdapter(
&options, wgpu::CallbackMode::AllowSpontaneous,
[&ctx](wgpu::RequestAdapterStatus status, wgpu::Adapter adapter, const char * message) {
if (status != wgpu::RequestAdapterStatus::Success) {
GGML_LOG_ERROR("ggml_webgpu: Failed to get an adapter: %s\n", message);
return;
}
ctx->webgpu_global_ctx->adapter = std::move(adapter);
}),
UINT64_MAX);
instance.WaitAny(instance.RequestAdapter(
&options, wgpu::CallbackMode::AllowSpontaneous,
[&adapter](wgpu::RequestAdapterStatus status, wgpu::Adapter _adapter, const char * message) {
if (status != wgpu::RequestAdapterStatus::Success) {
GGML_LOG_ERROR("ggml_webgpu: Failed to get an adapter: %s\n", message);
return;
}
adapter = std::move(_adapter);
}),
UINT64_MAX);
}
static void create_webgpu_device(ggml_backend_webgpu_reg_context * ctx) {
ggml_backend_webgpu_request_adapter(ctx->webgpu_global_ctx->instance, ctx->webgpu_global_ctx->adapter);
GGML_ASSERT(ctx->webgpu_global_ctx->adapter != nullptr);
ctx->webgpu_global_ctx->adapter.GetLimits(&ctx->webgpu_global_ctx->capabilities.limits);
@@ -4543,20 +4546,7 @@ ggml_backend_reg_t ggml_backend_webgpu_reg() {
// Probe for adapter support
wgpu::Adapter adapter;
if (ctx->webgpu_global_ctx->instance != nullptr) {
wgpu::RequestAdapterOptions options = {};
// probe for adapter support
ctx->webgpu_global_ctx->instance.WaitAny(
ctx->webgpu_global_ctx->instance.RequestAdapter(
&options, wgpu::CallbackMode::AllowSpontaneous,
[&adapter](wgpu::RequestAdapterStatus status, wgpu::Adapter _adapter, const char * message) {
if (status != wgpu::RequestAdapterStatus::Success) {
GGML_LOG_ERROR("ggml_webgpu: Failed to get an adapter: %s\n", message);
return;
}
adapter = std::move(_adapter);
}),
UINT64_MAX);
ggml_backend_webgpu_request_adapter(ctx->webgpu_global_ctx->instance, adapter);
}
// WebGPU backend requires f16 support and, on native, implicit device synchronization.

View File

@@ -932,8 +932,8 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
// copy the KV pairs from the input file
gguf_set_kv (ctx_out.get(), ml.metadata);
gguf_set_val_u32(ctx_out.get(), "general.quantization_version", GGML_QNT_VERSION); // TODO: use LLM_KV
gguf_set_val_u32(ctx_out.get(), "general.file_type", ftype); // TODO: use LLM_KV
gguf_set_val_u32(ctx_out.get(), ml.llm_kv(LLM_KV_GENERAL_QUANTIZATION_VERSION).c_str(), GGML_QNT_VERSION);
gguf_set_val_u32(ctx_out.get(), ml.llm_kv(LLM_KV_GENERAL_FILE_TYPE).c_str(), ftype);
// Remove split metadata
gguf_remove_key(ctx_out.get(), ml.llm_kv(LLM_KV_SPLIT_NO).c_str());

View File

@@ -101,11 +101,11 @@ void llama_model_glm_dsa::load_arch_tensors(llama_model_loader &) {
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, flags);
// DSA indexer
layer.indexer_k_norm = create_tensor(tn(LLM_TENSOR_INDEXER_K_NORM, "weight", i), {hparams.indexer_head_size}, flags);
layer.indexer_k_norm_b = create_tensor(tn(LLM_TENSOR_INDEXER_K_NORM, "bias", i), {hparams.indexer_head_size}, flags);
layer.indexer_proj = create_tensor(tn(LLM_TENSOR_INDEXER_PROJ, "weight", i), {n_embd, hparams.indexer_n_head}, flags);
layer.indexer_attn_k = create_tensor(tn(LLM_TENSOR_INDEXER_ATTN_K, "weight", i), {n_embd, hparams.indexer_head_size}, flags);
layer.indexer_attn_q_b = create_tensor(tn(LLM_TENSOR_INDEXER_ATTN_Q_B, "weight", i), {q_lora_rank, hparams.indexer_n_head * hparams.indexer_head_size}, flags);
layer.indexer_k_norm = create_tensor(tn(LLM_TENSOR_INDEXER_K_NORM, "weight", i), {hparams.indexer_head_size}, flags | TENSOR_NOT_REQUIRED);
layer.indexer_k_norm_b = create_tensor(tn(LLM_TENSOR_INDEXER_K_NORM, "bias", i), {hparams.indexer_head_size}, flags | TENSOR_NOT_REQUIRED);
layer.indexer_proj = create_tensor(tn(LLM_TENSOR_INDEXER_PROJ, "weight", i), {n_embd, hparams.indexer_n_head}, flags | TENSOR_NOT_REQUIRED);
layer.indexer_attn_k = create_tensor(tn(LLM_TENSOR_INDEXER_ATTN_K, "weight", i), {n_embd, hparams.indexer_head_size}, flags | TENSOR_NOT_REQUIRED);
layer.indexer_attn_q_b = create_tensor(tn(LLM_TENSOR_INDEXER_ATTN_Q_B, "weight", i), {q_lora_rank, hparams.indexer_n_head * hparams.indexer_head_size}, flags | TENSOR_NOT_REQUIRED);
if (i < (int) hparams.n_layer_dense_lead) {
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, flags);
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, flags);

View File

@@ -10,7 +10,7 @@
#undef NDEBUG
#include <cassert>
int main(void) {
static void test(void) {
common_params params;
printf("test-arg-parser: make sure there is no duplicated arguments in any examples\n\n");
@@ -210,3 +210,13 @@ int main(void) {
printf("test-arg-parser: all tests OK\n\n");
}
int main(void) {
try {
test();
} catch (std::exception & e) {
fprintf(stderr, "test-arg-parser: exception: %s\n", e.what());
return 1;
}
return 0;
}

View File

@@ -180,6 +180,17 @@ That requires `JSON.stringify` when formatted to message content:
}
```
### Router mode: how child <--> router communicates
Upon spawning a new child process using `subprocess`, both child and router listen to the stdout/stderr (combined)
For the direction from child to router:
- Generic messages are logs, it will be forwarded to router's stdout
- Special state update messages are prefixed by `cmd_child_to_router:state:`, followed by a JSON. See `server_models::handle_child_state` for more
For the direction from router to child:
- When server sends `cmd_router_to_child:exit`, the child should exit gracefully --> if after `DEFAULT_STOP_TIMEOUT` and the child is still running, force-kill it
### Model management API (router mode)
Model management API was added via PR [#23976](https://github.com/ggml-org/llama.cpp/pull/23976)

View File

@@ -63,11 +63,6 @@ enum slot_state {
SLOT_STATE_GENERATING,
};
enum server_state {
SERVER_STATE_LOADING_MODEL, // Server is starting up, model not fully loaded yet
SERVER_STATE_READY, // Server is ready and model is loaded
};
struct server_slot {
int id;
@@ -773,6 +768,8 @@ public:
// note: chat_params must not be refreshed upon existing sleeping state
server_chat_params chat_params;
server_state_callback_t callback_state = [](server_state, json) -> void {};
server_context_impl() {
mtmd_helper_log_set(common_log_default_callback, nullptr);
}
@@ -1244,8 +1241,8 @@ private:
if (!params_base.model_alias.empty()) {
// backward compat: use first alias as model name
model_name = *params_base.model_alias.begin();
} else if (!params_base.model.name.empty()) {
model_name = params_base.model.name;
} else if (!params_base.model.get_name().empty()) {
model_name = params_base.model.get_name();
} else {
// fallback: derive model name from file name
auto model_path = std::filesystem::path(params_base.model.path);
@@ -3734,8 +3731,11 @@ struct server_res_generator : server_http_res {
}
};
void server_context::on_sleeping_changed(std::function<void(bool)> callback) {
impl->queue_tasks.on_sleeping_state(std::move(callback));
void server_context::set_state_callback(server_state_callback_t callback) {
impl->callback_state = std::move(callback);
impl->queue_tasks.on_sleeping_state([this](bool sleeping) {
impl->callback_state(sleeping ? SERVER_STATE_SLEEPING : SERVER_STATE_READY, {});
});
}
// compute the number of tokens before the last user message in the prompt

View File

@@ -52,6 +52,31 @@ struct server_context_meta {
uint64_t model_size;
};
enum server_state {
// SERVER_STATE_DOWNLOADING,
SERVER_STATE_LOADING,
SERVER_STATE_READY,
SERVER_STATE_SLEEPING,
};
static std::string server_state_to_str(server_state state) {
switch (state) {
case SERVER_STATE_LOADING: return "loading";
case SERVER_STATE_READY: return "ready";
case SERVER_STATE_SLEEPING: return "sleeping";
default: GGML_ASSERT(false && "invalid server_state");
}
}
static server_state server_state_from_str(const std::string & str) {
if (str == "loading") return SERVER_STATE_LOADING;
if (str == "ready") return SERVER_STATE_READY;
if (str == "sleeping") return SERVER_STATE_SLEEPING;
GGML_ASSERT(false && "invalid server_state string");
}
using server_state_callback_t = std::function<void(server_state, json /* payload */)>;
struct server_context {
std::unique_ptr<server_context_impl> impl;
@@ -79,9 +104,8 @@ struct server_context {
// not thread-safe, should only be used from the main thread
server_context_meta get_meta() const;
// register a callback to be called when sleeping state changes
// must be set before load_model() is called
void on_sleeping_changed(std::function<void(bool)> callback);
// note: must be set before load_model() is called
void set_state_callback(server_state_callback_t callback);
};

View File

@@ -7,9 +7,18 @@
#include <unordered_set>
#include <list>
#include <map>
#include <algorithm>
#include <cctype>
#include "server-http.h"
static std::string proxy_header_to_lower(std::string header) {
std::transform(header.begin(), header.end(), header.begin(), [](unsigned char c) {
return std::tolower(c);
});
return header;
}
static server_http_res_ptr proxy_request(const server_http_req & req, std::string method) {
std::string target_url = req.get_param("url");
common_http_url parsed_url = common_http_parse_url(target_url);
@@ -33,11 +42,18 @@ static server_http_res_ptr proxy_request(const server_http_req & req, std::strin
SRV_INF("proxying %s request to %s://%s:%i%s\n", method.c_str(), parsed_url.scheme.c_str(), parsed_url.host.c_str(), parsed_url.port, parsed_url.path.c_str());
std::map<std::string, std::string> headers;
const std::string proxy_header_prefix = "x-llama-server-proxy-header-";
for (auto [key, value] : req.headers) {
auto new_key = key;
if (string_starts_with(new_key, "x-proxy-header-")) {
string_replace_all(new_key, "x-proxy-header-", "");
const std::string lowered_key = proxy_header_to_lower(key);
if (!string_starts_with(lowered_key, proxy_header_prefix)) {
continue;
}
auto new_key = key.substr(proxy_header_prefix.size());
if (new_key.empty()) {
continue;
}
headers[new_key] = value;
}

View File

@@ -1,5 +1,6 @@
#include "server-common.h"
#include "server-models.h"
#include "server-context.h"
#include "build-info.h"
#include "preset.h"
@@ -44,9 +45,7 @@ extern char **environ;
#define DEFAULT_STOP_TIMEOUT 10 // seconds
#define CMD_ROUTER_TO_CHILD_EXIT "cmd_router_to_child:exit"
#define CMD_CHILD_TO_ROUTER_READY "cmd_child_to_router:ready" // also sent when waking up from sleep
#define CMD_CHILD_TO_ROUTER_SLEEP "cmd_child_to_router:sleep"
#define CMD_CHILD_TO_ROUTER_INFO "cmd_child_to_router:info:" // followed by json string
#define CMD_CHILD_TO_ROUTER_STATE "cmd_child_to_router:state:" // followed by json string
// address for child process, this is needed because router may run on 0.0.0.0
// ref: https://github.com/ggml-org/llama.cpp/issues/17862
@@ -904,12 +903,8 @@ void server_models::load(const std::string & name) {
while (fgets(buffer, vec_buf.size(), stdout_file) != nullptr) {
LOG("[%5d] %s", port, buffer);
std::string str(buffer);
if (string_starts_with(buffer, CMD_CHILD_TO_ROUTER_READY)) {
this->update_status(name, SERVER_MODEL_STATUS_LOADED, 0);
} else if (string_starts_with(buffer, CMD_CHILD_TO_ROUTER_INFO)) {
this->update_loaded_info(name, str);
} else if (string_starts_with(buffer, CMD_CHILD_TO_ROUTER_SLEEP)) {
this->update_status(name, SERVER_MODEL_STATUS_SLEEPING, 0);
if (string_starts_with(buffer, CMD_CHILD_TO_ROUTER_STATE)) {
this->handle_child_state(name, str);
}
}
} else {
@@ -976,7 +971,10 @@ void server_models::load(const std::string & name) {
subprocess_destroy(&child_proc->get());
// update status and exit code
this->update_status(name, SERVER_MODEL_STATUS_UNLOADED, exit_code);
this->update_status(name, {
SERVER_MODEL_STATUS_UNLOADED,
exit_code
});
SRV_INF("instance name=%s exited with status %d\n", name.c_str(), exit_code);
});
@@ -1016,7 +1014,8 @@ struct server_models_download_res : public common_download_callback {
common_download_model(model, opts);
is_ok = true;
} catch (const std::exception & e) {
SRV_ERR("download failed for model name=%s: %s\n", model.name.c_str(), e.what());
auto model_name = model.get_name();
SRV_ERR("download failed for model name=%s: %s\n", model_name.c_str(), e.what());
is_ok = false;
}
return is_ok;
@@ -1036,7 +1035,7 @@ struct server_models_download_res : public common_download_callback {
};
void server_models::download(common_params_model && model, common_download_opts && opts) {
std::string name = model.name;
std::string name = model.get_name();
GGML_ASSERT(name == model.hf_repo);
std::unique_lock<std::mutex> lk(mutex);
@@ -1064,9 +1063,10 @@ void server_models::download(common_params_model && model, common_download_opts
inst.th = std::thread([this, dl = std::move(dl)]() {
dl->opts.callback = dl.get();
bool ok = dl->run();
auto model_name = dl->model.get_name();
SRV_INF("download finished for model name=%s with status=%s\n",
dl->model.name.c_str(), ok ? "success" : "failure");
update_download_progress(dl->model.name, {}, true, ok);
model_name.c_str(), ok ? "success" : "failure");
update_download_progress(model_name, {}, true, ok);
// need_reload is set inside update_download_progress under the mutex;
// the next load_models() call will clean up this instance
});
@@ -1130,21 +1130,27 @@ void server_models::unload_all() {
}
}
void server_models::update_status(const std::string & name, server_model_status status, int exit_code) {
void server_models::update_status(const std::string & name, const update_status_args & args) {
std::unique_lock<std::mutex> lk(mutex);
auto it = mapping.find(name);
if (it != mapping.end()) {
auto & meta = it->second.meta;
meta.status = status;
meta.exit_code = exit_code;
meta.status = args.status;
meta.exit_code = args.exit_code;
if (!args.loaded_info.is_null()) {
meta.loaded_info = args.loaded_info;
}
}
// broadcast status change to SSE
{
json data = {
{"status", server_model_status_to_string(status)},
{"status", server_model_status_to_string(args.status)},
};
if (status == SERVER_MODEL_STATUS_UNLOADED) {
data["exit_code"] = exit_code;
if (args.status == SERVER_MODEL_STATUS_UNLOADED) {
data["exit_code"] = args.exit_code;
}
if (!args.loaded_info.is_null()) {
data["info"] = args.loaded_info;
}
// note: notify_sse doesn't acquire the lock, so no deadlock here
notify_sse("status_change", name, data);
@@ -1152,29 +1158,6 @@ void server_models::update_status(const std::string & name, server_model_status
cv.notify_all();
}
void server_models::update_loaded_info(const std::string & name, std::string & raw_info) {
if (!string_starts_with(raw_info, CMD_CHILD_TO_ROUTER_INFO)) {
SRV_WRN("invalid loaded info format from child for model name=%s: %s\n", name.c_str(), raw_info.c_str());
return;
}
json info;
try {
info = json::parse(raw_info.substr(strlen(CMD_CHILD_TO_ROUTER_INFO)));
} catch (const std::exception & e) {
SRV_WRN("failed to parse loaded info from child for model name=%s: %s\n", name.c_str(), e.what());
return;
}
std::unique_lock<std::mutex> lk(mutex);
auto it = mapping.find(name);
if (it != mapping.end()) {
auto & meta = it->second.meta;
meta.loaded_info = info;
}
cv.notify_all();
}
void server_models::update_download_progress(const std::string & name, const common_download_progress & progress, bool done, bool ok) {
json curr;
{
@@ -1323,21 +1306,54 @@ server_http_res_ptr server_models::proxy_request(const server_http_req & req, co
return proxy;
}
bool server_models::is_child_server() {
void server_models::handle_child_state(const std::string & name, const std::string & raw_input) {
server_state state;
json payload;
try {
json data = json::parse(raw_input.substr(strlen(CMD_CHILD_TO_ROUTER_STATE)));
state = server_state_from_str(json_value(data, "state", std::string()));
payload = json_value(data, "payload", json{});
} catch (const std::exception & e) {
SRV_ERR("failed to parse child state update for name=%s: %s\n", name.c_str(), e.what());
return;
}
switch (state) {
case SERVER_STATE_LOADING:
{
// do nothing for now
// TODO: report loading progress for first load and wakeup from sleep
} break;
case SERVER_STATE_READY:
{
update_status(name, {
SERVER_MODEL_STATUS_LOADED,
0,
// note: payload can be empty if this is a wakeup from sleep
payload.size() > 0 ? payload : nullptr
});
} break;
case SERVER_STATE_SLEEPING:
{
update_status(name, { SERVER_MODEL_STATUS_SLEEPING });
} break;
default:
// should never happen, but just in case
GGML_ASSERT(false && "unexpected state from child server");
}
}
//
// server_child
//
bool server_child::is_child() {
const char * router_port = std::getenv("LLAMA_SERVER_ROUTER_PORT");
return router_port != nullptr;
}
std::thread server_models::setup_child_server(const std::function<void(int)> & shutdown_handler, const json & model_info) {
// send a notification to the router server that a model instance is ready
common_log_pause(common_log_main());
fflush(stdout);
fprintf(stdout, "%s\n", CMD_CHILD_TO_ROUTER_READY);
fflush(stdout);
fprintf(stdout, "%s%s\n", CMD_CHILD_TO_ROUTER_INFO, safe_json_to_str(model_info).c_str());
fflush(stdout);
common_log_resume(common_log_main());
std::thread server_child::setup(const std::function<void(int)> & shutdown_handler) {
// setup thread for monitoring stdin
return std::thread([shutdown_handler]() {
// wait for EOF on stdin
@@ -1363,10 +1379,14 @@ std::thread server_models::setup_child_server(const std::function<void(int)> & s
});
}
void server_models::notify_router_sleeping_state(bool is_sleeping) {
void server_child::notify_to_router(const std::string & state, const json & payload) {
json data = {
{"state", state},
{"payload", payload},
};
common_log_pause(common_log_main());
fflush(stdout);
fprintf(stdout, "%s\n", is_sleeping ? CMD_CHILD_TO_ROUTER_SLEEP : CMD_CHILD_TO_ROUTER_READY);
fprintf(stdout, "%s%s\n", CMD_CHILD_TO_ROUTER_STATE, safe_json_to_str(data).c_str());
fflush(stdout);
common_log_resume(common_log_main());
}
@@ -1644,7 +1664,6 @@ void server_models_routes::init_routes() {
common_params_model model;
common_download_opts opts;
model.name = name;
model.hf_repo = name;
opts.bearer_token = params.hf_token;
opts.download_mmproj = true;

View File

@@ -171,8 +171,12 @@ public:
void download(common_params_model && model, common_download_opts && opts);
// update the status of a model instance (thread-safe)
void update_status(const std::string & name, server_model_status status, int exit_code);
void update_loaded_info(const std::string & name, std::string & raw_info);
struct update_status_args {
server_model_status status;
int exit_code = 0; // only valid if status == UNLOADED
json loaded_info = nullptr;
};
void update_status(const std::string & name, const update_status_args & args);
void update_download_progress(const std::string & name, const common_download_progress & progress, bool done, bool ok = true);
// remove a cache model from disk and update the list (thread-safe)
@@ -193,15 +197,27 @@ public:
// proxy an HTTP request to the model instance
server_http_res_ptr proxy_request(const server_http_req & req, const std::string & method, const std::string & name, bool update_last_used);
// handle message sent from server_child::notify_to_router()
// raw input must starts with CMD_CHILD_TO_ROUTER_STATE, followed by a JSON string
// this function is not thread-safe, must be called from instance's monitoring thread
// payload per state:
// state = loading -> payload = {} (TODO: add progress info)
// state = ready -> payload = model_info (json), or {} if wakeup from sleeping
// state = sleeping -> payload = {}
void handle_child_state(const std::string & name, const std::string & raw_input);
};
struct server_child {
// return true if the current process is a child server instance
static bool is_child_server();
bool is_child();
// notify the router server that a model instance is ready
// register the shutdown_handler to be called by the router
// return the monitoring thread (to be joined by the caller)
static std::thread setup_child_server(const std::function<void(int)> & shutdown_handler, const json & model_info);
std::thread setup(const std::function<void(int)> & shutdown_handler);
// notify the router server that the sleeping state has changed
static void notify_router_sleeping_state(bool sleeping);
// notify router server for status changes (e.g. loading, downloading, sleeping, etc.)
// message will be handled by server_models::handle_child_state() on the router side
void notify_to_router(const std::string & state_name, const json & payload);
};
struct server_models_routes {

View File

@@ -90,8 +90,10 @@ int llama_server(int argc, char ** argv) {
llama_numa_init(params.numa);
// router server never loads a model and must not touch the GPU
const bool is_router_server = params.model.path.empty()
&& params.model.hf_repo.empty();
// skip device enumeration so the CUDA primary context stays uncreated
const bool is_router_server = params.model.path.empty();
common_params_print_info(params, !is_router_server);
if (!is_router_server) {
@@ -113,8 +115,9 @@ int llama_server(int argc, char ** argv) {
}
// for consistency between server router mode and single-model mode, we set the same model name as alias
if (params.model_alias.empty() && !params.model.name.empty()) {
params.model_alias.insert(params.model.name);
auto model_name = params.model.get_name();
if (params.model_alias.empty() && !model_name.empty()) {
params.model_alias.insert(model_name);
}
// struct that contains llama context and inference
@@ -255,6 +258,7 @@ int llama_server(int argc, char ** argv) {
// Start the server
//
server_child child; // only used in non-router mode
std::function<void()> clean_up;
if (is_router_server) {
@@ -300,15 +304,16 @@ int llama_server(int argc, char ** argv) {
return 1;
}
// load the model
SRV_INF("%s", "loading model\n");
if (server_models::is_child_server()) {
ctx_server.on_sleeping_changed([&](bool sleeping) {
server_models::notify_router_sleeping_state(sleeping);
// setup communication child --> router if necessary
if (child.is_child()) {
ctx_server.set_state_callback([&](server_state state, json payload) {
child.notify_to_router(server_state_to_str(state), payload);
});
}
// load the model
SRV_INF("%s", "loading model\n");
if (!ctx_server.load_model(params)) {
clean_up();
if (ctx_http.thread.joinable()) {
@@ -365,9 +370,9 @@ int llama_server(int argc, char ** argv) {
// optionally, notify router server that this instance is ready
std::thread monitor_thread;
if (server_models::is_child_server()) {
json model_info = routes.get_model_info();
monitor_thread = server_models::setup_child_server(shutdown_handler, model_info);
if (child.is_child()) {
monitor_thread = child.setup(shutdown_handler);
child.notify_to_router(server_state_to_str(SERVER_STATE_READY), routes.get_model_info());
}
// this call blocks the main thread until queue_tasks.terminate() is called

View File

@@ -1,6 +1,8 @@
import pytest
from openai import OpenAI
from utils import *
import threading
from http.server import BaseHTTPRequestHandler, ThreadingHTTPServer
server = ServerPreset.tinyllama2()
@@ -105,6 +107,49 @@ def test_cors_options(origin: str, cors_header: str, cors_header_value: str):
assert res.headers[cors_header] == cors_header_value
def test_cors_proxy_only_forwards_explicit_proxy_headers():
class CaptureHeadersHandler(BaseHTTPRequestHandler):
def do_GET(self):
self.server.captured_headers = dict(self.headers)
self.send_response(200)
self.end_headers()
self.wfile.write(b"ok")
def log_message(self, format, *args):
pass
target = ThreadingHTTPServer(("127.0.0.1", 0), CaptureHeadersHandler)
target.captured_headers = {}
target_thread = threading.Thread(target=target.serve_forever, daemon=True)
target_thread.start()
try:
server = ServerPreset.tinyllama2()
server.api_key = TEST_API_KEY
server.ui_mcp_proxy = True
server.start()
res = server.make_request("GET", f"/cors-proxy?url=http://127.0.0.1:{target.server_port}/capture", headers={
"Authorization": f"Bearer {TEST_API_KEY}",
"Proxy-Authorization": "Basic secret",
"X-Api-Key": TEST_API_KEY,
"Cookie": "session=secret",
"x-llama-server-proxy-header-accept": "application/json",
"x-llama-server-proxy-header-authorization": "Bearer explicit",
})
assert res.status_code == 200
captured = {key.lower(): value for key, value in target.captured_headers.items()}
assert captured["accept"] == "application/json"
assert captured["authorization"] == "Bearer explicit"
assert "proxy-authorization" not in captured
assert "x-api-key" not in captured
assert "cookie" not in captured
finally:
target.shutdown()
target.server_close()
@pytest.mark.parametrize(
"media_path, image_url, success",
[

View File

@@ -51,6 +51,9 @@ export const EXPECTED_THEMED_ICON_PAIR_COUNT = 2;
/** CORS proxy URL query parameter name */
export const CORS_PROXY_URL_PARAM = 'url';
/** Header prefix for headers that should be forwarded by the CORS proxy */
export const CORS_PROXY_HEADER_PREFIX = 'x-llama-server-proxy-header-';
/** Number of trailing characters to keep visible when partially redacting mcp-session-id */
export const MCP_SESSION_ID_VISIBLE_CHARS = 5;

View File

@@ -16,6 +16,7 @@ import {
DEFAULT_MCP_CONFIG,
DEFAULT_CLIENT_VERSION,
DEFAULT_IMAGE_MIME_TYPE,
CORS_PROXY_HEADER_PREFIX,
MCP_PARTIAL_REDACT_HEADERS,
CORS_PROXY_ENDPOINT
} from '$lib/constants';
@@ -133,6 +134,20 @@ export class MCPService {
return details;
}
private static addRequestHeaders(
requestHeaders: Headers,
headers: HeadersInit,
useProxy: boolean
) {
for (const [key, value] of new Headers(headers).entries()) {
const proxiedKey =
useProxy && !key.toLowerCase().startsWith(CORS_PROXY_HEADER_PREFIX)
? `${CORS_PROXY_HEADER_PREFIX}${key}`
: key;
requestHeaders.set(proxiedKey, value);
}
}
private static summarizeError(error: unknown): Record<string, unknown> {
if (error instanceof Error) {
return {
@@ -271,15 +286,11 @@ export class MCPService {
const requestHeaders = new Headers(baseInit.headers);
if (typeof Request !== 'undefined' && input instanceof Request) {
for (const [key, value] of input.headers.entries()) {
requestHeaders.set(key, value);
}
this.addRequestHeaders(requestHeaders, input.headers, useProxy);
}
if (init?.headers) {
for (const [key, value] of new Headers(init.headers).entries()) {
requestHeaders.set(key, value);
}
this.addRequestHeaders(requestHeaders, init.headers, useProxy);
}
const request = this.createDiagnosticRequestDetails(

View File

@@ -1,5 +1,5 @@
import { config } from '$lib/stores/settings.svelte';
import { REDACTED_HEADERS } from '$lib/constants';
import { CORS_PROXY_HEADER_PREFIX, REDACTED_HEADERS } from '$lib/constants';
import { redactValue } from './redact';
/**
@@ -52,11 +52,20 @@ export function sanitizeHeaders(
for (const [key, value] of normalized.entries()) {
const normalizedKey = key.toLowerCase();
const partialChars = partialRedactHeaders?.get(normalizedKey);
const unproxiedKey = normalizedKey.startsWith(CORS_PROXY_HEADER_PREFIX)
? normalizedKey.slice(CORS_PROXY_HEADER_PREFIX.length)
: normalizedKey;
const partialChars =
partialRedactHeaders?.get(normalizedKey) ?? partialRedactHeaders?.get(unproxiedKey);
if (partialChars !== undefined) {
sanitized[key] = redactValue(value, partialChars);
} else if (REDACTED_HEADERS.has(normalizedKey) || redactedHeaders.has(normalizedKey)) {
} else if (
REDACTED_HEADERS.has(normalizedKey) ||
REDACTED_HEADERS.has(unproxiedKey) ||
redactedHeaders.has(normalizedKey) ||
redactedHeaders.has(unproxiedKey)
) {
sanitized[key] = redactValue(value);
} else {
sanitized[key] = value;

View File

@@ -3,7 +3,11 @@
*/
import { base } from '$app/paths';
import { CORS_PROXY_ENDPOINT, CORS_PROXY_URL_PARAM } from '$lib/constants';
import {
CORS_PROXY_ENDPOINT,
CORS_PROXY_HEADER_PREFIX,
CORS_PROXY_URL_PARAM
} from '$lib/constants';
/**
* Build a proxied URL that routes through llama-server's CORS proxy.
@@ -28,7 +32,7 @@ export function buildProxiedHeaders(headers: Record<string, string>): Record<str
const proxiedHeaders: Record<string, string> = {};
for (const [key, value] of Object.entries(headers)) {
proxiedHeaders[`x-proxy-header-${key}`] = value;
proxiedHeaders[`${CORS_PROXY_HEADER_PREFIX}${key}`] = value;
}
return proxiedHeaders;

View File

@@ -39,8 +39,8 @@ test.describe('PWA Service Worker', () => {
const swContent = await swResponse.text();
// Precache contains SvelteKit content-hashed bundle paths
expect(swContent).toMatch(/"_app\/immutable\/bundle\.[a-zA-Z0-9-]+\.js"/);
expect(swContent).toMatch(/"_app\/immutable\/assets\/bundle\.[a-zA-Z0-9-]+\.css"/);
expect(swContent).toMatch(/"_app\/immutable\/bundle\.[a-zA-Z0-9_-]+\.js"/);
expect(swContent).toMatch(/"_app\/immutable\/assets\/bundle\.[a-zA-Z0-9_-]+\.css"/);
expect(swContent).toMatch(/"manifest\.webmanifest"/);
expect(swContent).toMatch(/"_app\/version\.json"/);
expect(swContent).toMatch(/NavigationRoute/);
@@ -99,8 +99,8 @@ test.describe('PWA Service Worker', () => {
const html = await response.text();
// SvelteKit outputs content-hashed bundle names in _app/immutable/
expect(html).toMatch(/href="(\.\/|\/)_app\/immutable\/bundle\.[a-zA-Z0-9-]+\.js"/);
expect(html).toMatch(/href="(\.\/|\/)_app\/immutable\/assets\/bundle\.[a-zA-Z0-9-]+\.css"/);
expect(html).toMatch(/import\("(\.\/|\/)_app\/immutable\/bundle\.[a-zA-Z0-9-]+\.js"\)/);
expect(html).toMatch(/href="(\.\/|\/)_app\/immutable\/bundle\.[a-zA-Z0-9_-]+\.js"/);
expect(html).toMatch(/href="(\.\/|\/)_app\/immutable\/assets\/bundle\.[a-zA-Z0-9_-]+\.css"/);
expect(html).toMatch(/import\("(\.\/|\/)_app\/immutable\/bundle\.[a-zA-Z0-9_-]+\.js"\)/);
});
});

View File

@@ -3,6 +3,7 @@ import { Client } from '@modelcontextprotocol/sdk/client';
import { MCPService } from '$lib/services/mcp.service';
import { MCPConnectionPhase, MCPTransportType } from '$lib/enums';
import type { MCPConnectionLog, MCPServerConfig } from '$lib/types';
import { CORS_PROXY_HEADER_PREFIX } from '$lib/constants';
type DiagnosticFetchFactory = (
serverName: string,
@@ -16,11 +17,12 @@ type DiagnosticFetchFactory = (
const createDiagnosticFetch = (
config: MCPServerConfig,
onLog?: (log: MCPConnectionLog) => void,
baseInit: RequestInit = {}
baseInit: RequestInit = {},
useProxy = false
) =>
(
MCPService as unknown as { createDiagnosticFetch: DiagnosticFetchFactory }
).createDiagnosticFetch('test-server', config, baseInit, new URL(config.url), false, onLog);
).createDiagnosticFetch('test-server', config, baseInit, new URL(config.url), useProxy, onLog);
describe('MCPService', () => {
afterEach(() => {
@@ -94,6 +96,64 @@ describe('MCPService', () => {
});
});
it('wraps dynamic request headers when using the CORS proxy', async () => {
const logs: MCPConnectionLog[] = [];
const proxiedAuthToken = `${CORS_PROXY_HEADER_PREFIX}x-auth-token`;
const proxiedContentType = `${CORS_PROXY_HEADER_PREFIX}content-type`;
const proxiedSessionId = `${CORS_PROXY_HEADER_PREFIX}mcp-session-id`;
const response = new Response('{}', {
status: 200,
headers: { 'content-type': 'application/json' }
});
const fetchMock = vi.fn().mockResolvedValue(response);
vi.stubGlobal('fetch', fetchMock);
const config: MCPServerConfig = {
url: 'https://example.com/mcp',
transport: MCPTransportType.STREAMABLE_HTTP,
useProxy: true
};
const controller = createDiagnosticFetch(
config,
(log) => logs.push(log),
{
headers: {
authorization: 'Bearer llama-server-key',
[proxiedAuthToken]: 'target-token'
}
},
true
);
await controller.fetch('http://localhost:8080/cors-proxy?url=https%3A%2F%2Fexample.com%2Fmcp', {
method: 'POST',
headers: {
'content-type': 'application/json',
'mcp-session-id': 'session-request-12345'
},
body: '{}'
});
const sentHeaders = fetchMock.mock.calls[0]?.[1]?.headers as Headers;
expect(sentHeaders.get('authorization')).toBe('Bearer llama-server-key');
expect(sentHeaders.get(proxiedAuthToken)).toBe('target-token');
expect(sentHeaders.get(proxiedContentType)).toBe('application/json');
expect(sentHeaders.get(proxiedSessionId)).toBe('session-request-12345');
expect(sentHeaders.has('content-type')).toBe(false);
expect(sentHeaders.has('mcp-session-id')).toBe(false);
expect(logs[0].details).toMatchObject({
request: {
headers: {
authorization: '[redacted]',
[proxiedAuthToken]: '[redacted]',
[proxiedSessionId]: '....12345'
}
}
});
});
it('partially redacts mcp-session-id in diagnostic request and response logs', async () => {
const logs: MCPConnectionLog[] = [];
const response = new Response('{}', {

View File

@@ -1,5 +1,6 @@
import { describe, expect, it } from 'vitest';
import { sanitizeHeaders } from '$lib/utils/api-headers';
import { CORS_PROXY_HEADER_PREFIX } from '$lib/constants';
describe('sanitizeHeaders', () => {
it('returns empty object for undefined input', () => {
@@ -52,4 +53,21 @@ describe('sanitizeHeaders', () => {
const result = sanitizeHeaders(headers, ['X-CUSTOM-TOKEN']);
expect(result['x-custom-token']).toBe('[redacted]');
});
it('redacts proxied sensitive and custom target headers', () => {
const proxiedAuthorization = `${CORS_PROXY_HEADER_PREFIX}authorization`;
const proxiedSessionId = `${CORS_PROXY_HEADER_PREFIX}mcp-session-id`;
const proxiedVendorKey = `${CORS_PROXY_HEADER_PREFIX}x-vendor-key`;
const headers = new Headers({
[proxiedAuthorization]: 'Bearer secret',
[proxiedSessionId]: 'session-12345',
[proxiedVendorKey]: 'vendor-secret'
});
const partial = new Map([['mcp-session-id', 5]]);
const result = sanitizeHeaders(headers, ['x-vendor-key'], partial);
expect(result[proxiedAuthorization]).toBe('[redacted]');
expect(result[proxiedSessionId]).toBe('....12345');
expect(result[proxiedVendorKey]).toBe('[redacted]');
});
});