llama : use LLM_KV for quantization_version & file_type (#24802 )

Signed-off-by: Adrien Gallouët <angt@huggingface.co>
arg: try fixing test-args-parser randomly fails (#24826 )
2026-06-20 23:02:28 +02:00 · 2026-06-20 20:07:01 +02:00 · 2026-06-20 19:45:27 +02:00 · 2026-06-20 23:08:59 +08:00 · 2026-06-20 15:34:47 +02:00 · 2026-06-20 05:54:42 -05:00
41 changed files with 459 additions and 213 deletions
--- a/.devops/s390x.Dockerfile
+++ b/.devops/s390x.Dockerfile
@@ -4,20 +4,6 @@ ARG BUILD_DATE=N/A
 ARG APP_VERSION=N/A
 ARG APP_REVISION=N/A

-ARG NODE_VERSION=24
-
-FROM docker.io/node:$NODE_VERSION AS web
-
-ARG APP_VERSION
-
-WORKDIR /app/tools/ui
-
-COPY tools/ui/package.json tools/ui/package-lock.json ./
-RUN npm ci
-
-COPY tools/ui/ ./
-RUN LLAMA_BUILD_NUMBER="$APP_VERSION" npm run build
-
 ### Build Llama.cpp stage
 FROM docker.io/gcc:${GCC_VERSION} AS build

@@ -34,8 +20,6 @@ RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
 WORKDIR /app
 COPY . .

-COPY --from=web /app/tools/ui/dist tools/ui/dist
-
 RUN --mount=type=cache,target=/root/.ccache \
    --mount=type=cache,target=/app/build \
    cmake -S . -B build -G Ninja \
--- a/.dockerignore
+++ b/.dockerignore
@@ -11,7 +11,6 @@
 build*/

 tools/ui/node_modules/
-tools/ui/dist/

 models/*

--- a/.github/workflows/docker.yml
+++ b/.github/workflows/docker.yml
@@ -58,6 +58,13 @@ jobs:
          git tag ${{ steps.srctag.outputs.name }} || exit 0
          git push origin ${{ steps.srctag.outputs.name }} || exit 0

+  build_ui:
+    name: Build UI
+    needs: create_tag
+    uses: ./.github/workflows/ui-build.yml
+    with:
+      hf_ui_version: ${{ needs.create_tag.outputs.source_tag }}
+
  prepare_matrices:
    name: Prepare Docker matrices
    runs-on: ubuntu-24.04
@@ -79,7 +86,7 @@ jobs:
          [
            { "tag": "cpu", "dockerfile": ".devops/cpu.Dockerfile", "platforms": "linux/amd64", "full": true, "light": true, "server": true, "free_disk_space": false, "runs_on": "ubuntu-24.04" },
            { "tag": "cpu", "dockerfile": ".devops/cpu.Dockerfile", "platforms": "linux/arm64", "full": true, "light": true, "server": true, "free_disk_space": false, "runs_on": "ubuntu-24.04-arm" },
-            { "tag": "cpu", "dockerfile": ".devops/s390x.Dockerfile", "platforms": "linux/s390x", "full": true, "light": true, "server": true, "free_disk_space": false, "runs_on": "ubuntu-24.04-s390x" },
+            { "tag": "cpu", "dockerfile": ".devops/s390x.Dockerfile", "platforms": "linux/s390x", "full": true, "light": true, "server": true, "free_disk_space": false, "runs_on": "ubuntu-24.04-s390x", "prebuilt_ui": true },
            { "tag": "cuda cuda12", "dockerfile": ".devops/cuda.Dockerfile", "cuda_version": "12.8.1", "platforms": "linux/amd64", "full": true, "light": true, "server": true, "free_disk_space": true, "runs_on": "ubuntu-24.04" },
            { "tag": "cuda cuda12", "dockerfile": ".devops/cuda.Dockerfile", "cuda_version": "12.8.1", "platforms": "linux/arm64", "full": true, "light": true, "server": true, "free_disk_space": true, "runs_on": "ubuntu-24.04-arm" },
            { "tag": "cuda13", "dockerfile": ".devops/cuda.Dockerfile", "cuda_version": "13.3.0", "platforms": "linux/amd64", "full": true, "light": true, "server": true, "free_disk_space": true, "runs_on": "ubuntu-24.04" },
@@ -135,7 +142,7 @@ jobs:

  push_to_registry:
    name: Push Docker image to Docker Registry
-    needs: [prepare_matrices, create_tag]
+    needs: [prepare_matrices, create_tag, build_ui]

    runs-on: ${{ matrix.config.runs_on }}
    strategy:
@@ -150,6 +157,13 @@ jobs:
          fetch-depth: 0
          ref: ${{ needs.create_tag.outputs.source_tag }}

+      - name: Download prebuilt UI
+        if: ${{ matrix.config.prebuilt_ui == true }}
+        uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # v8
+        with:
+          name: ui-build
+          path: tools/ui/dist
+
      - name: Set up QEMU
        if: ${{ contains(matrix.config.platforms, 'linux/amd64') }}
        uses: docker/setup-qemu-action@ce360397dd3f832beb865e1373c09c0e9f86d70a # v4
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -1627,6 +1627,7 @@ jobs:
            **Windows:**
            - [Windows x64 (CPU)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-cpu-x64.zip)
            - [Windows arm64 (CPU)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-cpu-arm64.zip)
+            - [Windows arm64 (OpenCL Adreno)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-opencl-adreno-arm64.zip)
            - [Windows x64 (CUDA 12)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-cuda-12.4-x64.zip) - [CUDA 12.4 DLLs](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/cudart-llama-bin-win-cuda-12.4-x64.zip)
            - [Windows x64 (CUDA 13)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-cuda-13.3-x64.zip) - [CUDA 13.3 DLLs](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/cudart-llama-bin-win-cuda-13.3-x64.zip)
            - [Windows x64 (Vulkan)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-vulkan-x64.zip)
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -303,7 +303,6 @@ static handle_model_result common_params_handle_model(struct common_params_model

    if (!model.docker_repo.empty()) {
        model.path = common_docker_resolve_model(model.docker_repo);
-        model.name = model.docker_repo;
    } else if (!model.hf_repo.empty()) {
        // If -m was used with -hf, treat the model "path" as the hf_file to download
        if (model.hf_file.empty() && !model.path.empty()) {
@@ -323,7 +322,6 @@ static handle_model_result common_params_handle_model(struct common_params_model
            throw std::runtime_error("failed to download model from Hugging Face");
        }

-        model.name = model.hf_repo;
        model.path = download_result.model_path;

        if (!download_result.mmproj_path.empty()) {
@@ -926,8 +924,8 @@ static utf8_argv make_utf8_argv() {
 bool common_params_parse(int argc, char ** argv, common_params & params, llama_example ex, void(*print_usage)(int, char **)) {
 #ifdef _WIN32
    auto utf8 = make_utf8_argv();
-    if (!utf8.ptrs.empty()) {
-        argc = static_cast<int>(utf8.buf.size());
+    // repair argv only when it matches the process command line
+    if (static_cast<int>(utf8.buf.size()) == argc) {
        argv = utf8.ptrs.data();
    }
 #endif
@@ -2899,7 +2897,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
            params.server_tools = parse_csv_row(value);
        }
    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_TOOLS"));
-        add_opt(common_arg(
+    add_opt(common_arg(
        {"-ag", "--agent"},
        {"-no-ag", "--no-agent"},
        "whether to enable CORS proxy and all built-in tools - do not enable in untrusted environments (default: disabled)",
--- a/common/common.h
+++ b/common/common.h
@@ -295,7 +295,16 @@ struct common_params_model {
    std::string hf_repo     = ""; // HF repo                                                // NOLINT
    std::string hf_file     = ""; // HF file                                                // NOLINT
    std::string docker_repo = ""; // Docker repo                                            // NOLINT
-    std::string name        = ""; // in format <user>/<model>[:<tag>] (tag is optional)     // NOLINT
+
+    std::string get_name() const { // name to report: first non-empty of hf_repo, docker_repo; const so it works on const objects
+        if (!hf_repo.empty()) {
+            return hf_repo; // HF repo is checked first
+        }
+        if (!docker_repo.empty()) {
+            return docker_repo; // otherwise the Docker repo
+        }
+        return path; // neither repo is set: fall back to the model path
+    }
 };

 // draft-model-based speculative decoding parameters
--- a/conversion/bailingmoe.py
+++ b/conversion/bailingmoe.py
@@ -126,7 +126,7 @@ class BailingMoeV2Model(TextModel):
        if (rope_dim := hparams.get("head_dim")) is None:
            rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]

-        self.gguf_writer.add_rope_dimension_count(int(rope_dim * self.hparams.get("partial_rotary_factor", 0.5)))
+        self.gguf_writer.add_rope_dimension_count(int(rope_dim * self.rope_parameters.get("partial_rotary_factor", 0.5)))
        self.gguf_writer.add_leading_dense_block_count(hparams["first_k_dense_replace"])
        self.gguf_writer.add_vocab_size(hparams["vocab_size"])
        self.gguf_writer.add_expert_feed_forward_length(hparams["moe_intermediate_size"])
--- a/conversion/base.py
+++ b/conversion/base.py
@@ -1119,8 +1119,10 @@ class TextModel(ModelBase):

        rope_theta = self.find_hparam(["global_rope_theta", "rope_global_theta", "rope_theta_global", "rope_theta", "rotary_emb_base"], optional=True)
        local_rope_theta = self.find_hparam(["local_rope_theta", "rope_local_theta", "rope_theta_local", "swa_rope_theta", "rope_local_base_freq"], optional=True)
+        partial_rotary_factor = self.find_hparam(["partial_rotary_factor", "rope_pct", "rope_percent"], optional=True)
+        original_max_position_embeddings = self.find_hparam(["original_max_position_embeddings"], optional=True)

-        # Ensure "rope_theta" and "rope_type" is mirrored in rope_parameters
+        # Ensure global params are mirrored in rope_parameters
        if "full_attention" not in self.rope_parameters and "sliding_attention" not in self.rope_parameters:
            if local_rope_theta is not None:
                self.rope_parameters["sliding_attention"] = {"rope_theta": local_rope_theta}
@@ -1128,6 +1130,10 @@ class TextModel(ModelBase):
                self.rope_parameters["rope_theta"] = rope_theta
            if "rope_type" not in self.rope_parameters and (rope_type := self.rope_parameters.get("type")) is not None:
                self.rope_parameters["rope_type"] = rope_type
+            if "partial_rotary_factor" not in self.rope_parameters and partial_rotary_factor is not None:
+                self.rope_parameters["partial_rotary_factor"] = partial_rotary_factor
+            if "original_max_position_embeddings" not in self.rope_parameters and original_max_position_embeddings is not None:
+                self.rope_parameters["original_max_position_embeddings"] = original_max_position_embeddings

    @classmethod
    def __init_subclass__(cls):
--- a/conversion/chatglm.py
+++ b/conversion/chatglm.py
@@ -148,7 +148,7 @@ class ChatGLMModel(TextModel):
            rope_dim = self.hparams["attention_dim"]
        else:
            rope_dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
-        self.gguf_writer.add_rope_dimension_count(int(rope_dim * self.hparams.get("partial_rotary_factor", 0.5)))
+        self.gguf_writer.add_rope_dimension_count(int(rope_dim * self.rope_parameters.get("partial_rotary_factor", 0.5)))
        self.gguf_writer.add_add_bos_token(False)
        rope_freq = 10000
        if "rope_ratio" in self.hparams:
--- a/conversion/deci.py
+++ b/conversion/deci.py
@@ -161,7 +161,7 @@ class DeciModel(TextModel):
                factor = rope_params.get("factor", 8.0)
                low_freq_factor = rope_params.get("low_freq_factor", 1.0)
                high_freq_factor = rope_params.get("high_freq_factor", 4.0)
-                old_context_len = self.hparams.get("original_max_position_embeddings", 8192)
+                old_context_len = rope_params.get("original_max_position_embeddings", 8192)

                low_freq_wavelen = old_context_len / low_freq_factor
                high_freq_wavelen = old_context_len / high_freq_factor
--- a/conversion/exaone.py
+++ b/conversion/exaone.py
@@ -24,7 +24,7 @@ class ExaoneModel(TextModel):

        assert (hparams["activation_function"] == "silu")

-        rotary_factor = self.find_hparam(["partial_rotary_factor", "rope_pct"], optional=True)
+        rotary_factor = self.rope_parameters.get("partial_rotary_factor")
        rotary_factor = rotary_factor if rotary_factor is not None else 1.0
        self.gguf_writer.add_rope_dimension_count(int(rotary_factor * (hparams["hidden_size"] // hparams["num_attention_heads"])))

@@ -39,7 +39,7 @@ class ExaoneModel(TextModel):
                factor = rope_params.get("factor", 8.0)
                low_freq_factor = rope_params.get("low_freq_factor", 1.0)
                high_freq_factor = rope_params.get("high_freq_factor", 4.0)
-                old_context_len = self.hparams.get("original_max_position_embeddings", 8192)
+                old_context_len = rope_params.get("original_max_position_embeddings", 8192)

                low_freq_wavelen = old_context_len / low_freq_factor
                high_freq_wavelen = old_context_len / high_freq_factor
@@ -104,7 +104,7 @@ class Exaone4Model(TextModel):
                factor = rope_params.get("factor", 16.0)
                low_freq_factor = rope_params.get("low_freq_factor", 1.0)
                high_freq_factor = rope_params.get("high_freq_factor", 4.0)
-                old_context_len = self.hparams.get("original_max_position_embeddings", 8192)
+                old_context_len = rope_params.get("original_max_position_embeddings", 8192)

                low_freq_wavelen = old_context_len / low_freq_factor
                high_freq_wavelen = old_context_len / high_freq_factor
--- a/conversion/gemma.py
+++ b/conversion/gemma.py
@@ -693,7 +693,7 @@ class Gemma4Model(Gemma3Model):
            self.gguf_writer.add_head_count_kv(value_arr)

        # handle n_rot differently for global vs swa layers
-        partial_rotary_factor_swa = self.hparams.get("partial_rotary_factor", 1.0)
+        partial_rotary_factor_swa = self.rope_parameters.get("partial_rotary_factor", 1.0)
        n_rot_full = int(head_dim_full) # "proportional" is used, see generate_extra_tensors
        n_rot_swa = int(head_dim_swa * partial_rotary_factor_swa)
        self.gguf_writer.add_rope_dimension_count(n_rot_full)
--- a/conversion/glm.py
+++ b/conversion/glm.py
@@ -124,7 +124,7 @@ class Glm4MoeModel(TextModel):
                self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
            )
        self.gguf_writer.add_rope_dimension_count(
-            int(rope_dim * self.hparams.get("partial_rotary_factor", 0.5))
+            int(rope_dim * self.rope_parameters.get("partial_rotary_factor", 0.5))
        )

        # MoE parameters - Use only routed expert count (shared experts handled separately)
@@ -226,7 +226,7 @@ class GlmMoeDsaModel(DeepseekV2Model):
        super().set_gguf_parameters()

        rope_dim = self.hparams["qk_rope_head_dim"]
-        partial_rotary_factor = self.hparams.get("partial_rotary_factor", 1.0)
+        partial_rotary_factor = self.rope_parameters.get("partial_rotary_factor", 1.0)
        self.gguf_writer.add_rope_dimension_count(int(rope_dim * partial_rotary_factor))

        # NextN/MTP prediction layers
--- a/conversion/llama.py
+++ b/conversion/llama.py
@@ -289,7 +289,7 @@ class LlamaModel(TextModel):
                factor = rope_params.get("factor", 8.0)
                low_freq_factor = rope_params.get("low_freq_factor", 1.0)
                high_freq_factor = rope_params.get("high_freq_factor", 4.0)
-                old_context_len = self.hparams.get("original_max_position_embeddings", 8192)
+                old_context_len = rope_params.get("original_max_position_embeddings", 8192)

                low_freq_wavelen = old_context_len / low_freq_factor
                high_freq_wavelen = old_context_len / high_freq_factor
--- a/conversion/mimo.py
+++ b/conversion/mimo.py
@@ -154,7 +154,7 @@ class MimoV2Model(TextModel):
        self.gguf_writer.add_expert_count(self.hparams["n_routed_experts"])
        self.gguf_writer.add_expert_feed_forward_length(self.hparams["moe_intermediate_size"])

-        rope_dim = int(self.hparams["head_dim"] * self.hparams["partial_rotary_factor"])
+        rope_dim = int(self.hparams["head_dim"] * self.rope_parameters["partial_rotary_factor"])
        self.gguf_writer.add_rope_dimension_count(rope_dim)

        self.gguf_writer.add_layer_norm_rms_eps(self.hparams.get("layernorm_epsilon", 1e-5))
--- a/conversion/minicpm.py
+++ b/conversion/minicpm.py
@@ -32,11 +32,9 @@ class MiniCPMModel(TextModel):
    def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
        rope_dims = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]

-        rope_scaling = self.find_hparam(['rope_scaling'], True)
-        if rope_scaling is not None:
-            long_factors = rope_scaling.get('long_factor', None)
-            short_factors = rope_scaling.get('short_factor', None)
-
+        long_factors = self.rope_parameters.get('long_factor')
+        short_factors = self.rope_parameters.get('short_factor')
+        if long_factors or short_factors:
            if long_factors is None or short_factors is None:
                raise KeyError('Missing the required key rope_scaling.long_factor or rope_scaling_short_factor')

@@ -85,13 +83,11 @@ class MiniCPM3Model(TextModel):
        self.gguf_writer.add_rope_dimension_count(hparams["qk_rope_head_dim"])

    def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
-        rope_scaling = self.find_hparam(['rope_scaling'], True)
-        if rope_scaling is not None:
+        long_factors = self.rope_parameters.get('long_factor')
+        short_factors = self.rope_parameters.get('short_factor')
+        if long_factors or short_factors:
            rope_dims = self.hparams["qk_rope_head_dim"]

-            long_factors = rope_scaling.get('long_factor', None)
-            short_factors = rope_scaling.get('short_factor', None)
-
            if long_factors is None or short_factors is None:
                raise KeyError('Missing the required key rope_scaling.long_factor or rope_scaling_short_factor')

--- a/conversion/nemotron.py
+++ b/conversion/nemotron.py
@@ -125,17 +125,18 @@ class NemotronModel(TextModel):
        self.gguf_writer.add_layer_norm_eps(f_norm_eps)

        # * Partial RoPE
-        rot_pct = self.find_hparam(["partial_rotary_factor", "rope_pct", "rope_percent"])
+        rot_pct = self.rope_parameters["partial_rotary_factor"]
        n_embd = self.find_hparam(["hidden_size", "n_embd"])
        n_head = self.find_hparam(["num_attention_heads", "n_head"])
        self.gguf_writer.add_rope_dimension_count(int(rot_pct * n_embd) // n_head)

        # * RopeScaling for Nemotron
-        if "rope_scaling" not in self.hparams or self.hparams["rope_scaling"] is None:
+        factor = self.hparams.get("factor") or self.rope_parameters.get("factor")
+        if factor is None:
            self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
        else:
            self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
-            self.gguf_writer.add_rope_scaling_factor(self.hparams["factor"])
+            self.gguf_writer.add_rope_scaling_factor(factor)

    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
        # * Adding +1 to LayerNorm's weights here to implement layernorm1p w/o changing anything on the GGML engine side
--- a/conversion/phi.py
+++ b/conversion/phi.py
@@ -18,7 +18,7 @@ class Phi2Model(TextModel):
    model_arch = gguf.MODEL_ARCH.PHI2

    def set_gguf_parameters(self):
-        rot_pct = self.find_hparam(["partial_rotary_factor"])
+        rot_pct = self.rope_parameters["partial_rotary_factor"]
        n_embd = self.find_hparam(["hidden_size", "n_embd"])
        n_head = self.find_hparam(["num_attention_heads", "n_head"])

@@ -149,8 +149,8 @@ class Phi3MiniModel(TextModel):
        n_head_kv = self.find_hparam(["num_key_value_heads", "n_head_kv"])
        rms_eps = self.find_hparam(["rms_norm_eps"])
        max_pos_embds = self.find_hparam(["n_positions", "max_position_embeddings"])
-        orig_max_pos_embds = self.find_hparam(["original_max_position_embeddings"])
-        rot_pct = self.hparams.get("partial_rotary_factor", 1.0)
+        orig_max_pos_embds = self.rope_parameters["original_max_position_embeddings"]
+        rot_pct = self.rope_parameters.get("partial_rotary_factor", 1.0)
        rope_dims = int(rot_pct * n_embd) // n_head

        self.gguf_writer.add_context_length(max_pos_embds)
@@ -174,18 +174,19 @@ class Phi3MiniModel(TextModel):
        n_embd = self.find_hparam(["hidden_size", "n_embd"])
        n_head = self.find_hparam(["num_attention_heads", "n_head"])
        max_pos_embds = self.find_hparam(["n_positions", "max_position_embeddings"])
-        orig_max_pos_embds = self.find_hparam(["original_max_position_embeddings"])
-        rot_pct = self.hparams.get("partial_rotary_factor", 1.0)
+        orig_max_pos_embds = self.rope_parameters["original_max_position_embeddings"]
+        rot_pct = self.rope_parameters.get("partial_rotary_factor", 1.0)
        rope_dims = int(rot_pct * n_embd) // n_head

        # write rope scaling for long context (128k) model
-        rope_scaling = self.find_hparam(['rope_scaling'], True)
-        if rope_scaling is None:
+        long_factors = self.rope_parameters.get('long_factor')
+        short_factors = self.rope_parameters.get('short_factor')
+        if not long_factors:
            return

        scale = max_pos_embds / orig_max_pos_embds

-        rope_scaling_type = rope_scaling.get('rope_type', rope_scaling.get('type', '')).lower()
+        rope_scaling_type = self.rope_parameters.get('rope_type', '').lower()
        if len(rope_scaling_type) == 0:
            raise KeyError('Missing the required key rope_scaling.type')

@@ -198,9 +199,6 @@ class Phi3MiniModel(TextModel):

        self.gguf_writer.add_rope_scaling_attn_factors(attn_factor)

-        long_factors = rope_scaling.get('long_factor', None)
-        short_factors = rope_scaling.get('short_factor', None)
-
        if long_factors is None or short_factors is None:
            raise KeyError('Missing the required key rope_scaling.long_factor or rope_scaling_short_factor')

--- a/conversion/qwen.py
+++ b/conversion/qwen.py
@@ -280,7 +280,7 @@ class Qwen3NextModel(Qwen2MoeModel):
        self.gguf_writer.add_full_attention_interval(self.hparams.get("full_attention_interval", 4))
        if (rope_dim := self.hparams.get("head_dim")) is None:
            rope_dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
-        self.gguf_writer.add_rope_dimension_count(int(rope_dim * self.hparams.get("partial_rotary_factor", 0.25)))
+        self.gguf_writer.add_rope_dimension_count(int(rope_dim * self.rope_parameters.get("partial_rotary_factor", 0.25)))

    @classmethod
    def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None:
--- a/conversion/stablelm.py
+++ b/conversion/stablelm.py
@@ -28,7 +28,7 @@ class StableLMModel(TextModel):
        self.gguf_writer.add_embedding_length(hparams["hidden_size"])
        self.gguf_writer.add_block_count(self.block_count)
        self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
-        rotary_factor = self.find_hparam(["partial_rotary_factor", "rope_pct"])
+        rotary_factor = self.rope_parameters["partial_rotary_factor"]
        self.gguf_writer.add_rope_dimension_count(int(rotary_factor * (hparams["hidden_size"] // hparams["num_attention_heads"])))
        self.gguf_writer.add_head_count(hparams["num_attention_heads"])
        self.gguf_writer.add_head_count_kv(hparams["num_key_value_heads"])
--- a/conversion/step3.py
+++ b/conversion/step3.py
@@ -314,7 +314,7 @@ class Step35Model(TextModel):
        factor = float(rope_params.get("factor", 8.0))
        low_freq_factor = float(rope_params.get("low_freq_factor", 1.0))
        high_freq_factor = float(rope_params.get("high_freq_factor", 4.0))
-        old_context_len = int(rope_params.get("original_max_position_embeddings", self.hparams.get("original_max_position_embeddings", 8192)))
+        old_context_len = int(rope_params.get("original_max_position_embeddings", 8192))

        low_freq_wavelen = old_context_len / low_freq_factor
        high_freq_wavelen = old_context_len / high_freq_factor
--- a/ggml/src/ggml-cpu/amx/mmq.cpp
+++ b/ggml/src/ggml-cpu/amx/mmq.cpp
@@ -2417,15 +2417,14 @@ void ggml_backend_amx_mul_mat(const ggml_compute_params * params, struct ggml_te
            // Q4_K, Q5_K, Q6_K, IQ4_XS handles 8 TILE_K per blck_size
            GGML_ASSERT(TILE_K == blck_size || TILE_K * 8 == blck_size);

-            parallel_for_ggml(params, n_batch, [&](int begin, int end) {
-                for (int batch_idx = begin; batch_idx < end; ++batch_idx) {
+            parallel_for_ggml(params, n_batch * M, [&](int begin, int end) {
+                for (int idx = begin; idx < end; ++idx) {
+                    int batch_idx = idx / M;
+                    int m         = idx % M;
                    int64_t src1_offset = ggml_batch_offset(src1, batch_idx, ne2);
                    const float * A_data = (const float *)((const char *)src1->data + src1_offset);
                    char * wdata_batch = (char *)wdata + batch_idx * M * row_size_A;
-
-                    for (int m = 0; m < M; ++m) {
-                        from_float<vec_dot_type>(A_data + m * K, wdata_batch + m * row_size_A, K);
-                    }
+                    from_float<vec_dot_type>(A_data + m * K, wdata_batch + m * row_size_A, K);
                }
            });
        });
--- a/ggml/src/ggml-webgpu/ggml-webgpu.cpp
+++ b/ggml/src/ggml-webgpu/ggml-webgpu.cpp
@@ -3788,7 +3788,7 @@ static void ggml_webgpu_init_memset_pipeline(webgpu_global_context & ctx) {
    ctx->memset_pipeline = ggml_webgpu_create_pipeline(ctx->device, wgsl_memset, "memset", constants);
 }

-static void create_webgpu_device(ggml_backend_webgpu_reg_context * ctx) {
+static void ggml_backend_webgpu_request_adapter(wgpu::Instance & instance, wgpu::Adapter & adapter) {
    wgpu::RequestAdapterOptions options = {};

 #ifndef __EMSCRIPTEN__
@@ -3800,17 +3800,20 @@ static void create_webgpu_device(ggml_backend_webgpu_reg_context * ctx) {
    options.nextInChain                   = &adapterTogglesDesc;
 #endif

-    ctx->webgpu_global_ctx->instance.WaitAny(
-        ctx->webgpu_global_ctx->instance.RequestAdapter(
-            &options, wgpu::CallbackMode::AllowSpontaneous,
-            [&ctx](wgpu::RequestAdapterStatus status, wgpu::Adapter adapter, const char * message) {
-                if (status != wgpu::RequestAdapterStatus::Success) {
-                    GGML_LOG_ERROR("ggml_webgpu: Failed to get an adapter: %s\n", message);
-                    return;
-                }
-                ctx->webgpu_global_ctx->adapter = std::move(adapter);
-            }),
-        UINT64_MAX);
+    instance.WaitAny(instance.RequestAdapter(
+                         &options, wgpu::CallbackMode::AllowSpontaneous,
+                         [&adapter](wgpu::RequestAdapterStatus status, wgpu::Adapter _adapter, const char * message) {
+                             if (status != wgpu::RequestAdapterStatus::Success) {
+                                 GGML_LOG_ERROR("ggml_webgpu: Failed to get an adapter: %s\n", message);
+                                 return;
+                             }
+                             adapter = std::move(_adapter);
+                         }),
+                     UINT64_MAX);
+}
+
+static void create_webgpu_device(ggml_backend_webgpu_reg_context * ctx) {
+    ggml_backend_webgpu_request_adapter(ctx->webgpu_global_ctx->instance, ctx->webgpu_global_ctx->adapter);
    GGML_ASSERT(ctx->webgpu_global_ctx->adapter != nullptr);

    ctx->webgpu_global_ctx->adapter.GetLimits(&ctx->webgpu_global_ctx->capabilities.limits);
@@ -4543,20 +4546,7 @@ ggml_backend_reg_t ggml_backend_webgpu_reg() {
    // Probe for adapter support
    wgpu::Adapter adapter;
    if (ctx->webgpu_global_ctx->instance != nullptr) {
-        wgpu::RequestAdapterOptions options = {};
-
-        // probe for adapter support
-        ctx->webgpu_global_ctx->instance.WaitAny(
-            ctx->webgpu_global_ctx->instance.RequestAdapter(
-                &options, wgpu::CallbackMode::AllowSpontaneous,
-                [&adapter](wgpu::RequestAdapterStatus status, wgpu::Adapter _adapter, const char * message) {
-                    if (status != wgpu::RequestAdapterStatus::Success) {
-                        GGML_LOG_ERROR("ggml_webgpu: Failed to get an adapter: %s\n", message);
-                        return;
-                    }
-                    adapter = std::move(_adapter);
-                }),
-            UINT64_MAX);
+        ggml_backend_webgpu_request_adapter(ctx->webgpu_global_ctx->instance, adapter);
    }

    // WebGPU backend requires f16 support and, on native, implicit device synchronization.
--- a/src/llama-quant.cpp
+++ b/src/llama-quant.cpp
@@ -932,8 +932,8 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::

    // copy the KV pairs from the input file
    gguf_set_kv     (ctx_out.get(), ml.metadata);
-    gguf_set_val_u32(ctx_out.get(), "general.quantization_version", GGML_QNT_VERSION); // TODO: use LLM_KV
-    gguf_set_val_u32(ctx_out.get(), "general.file_type", ftype); // TODO: use LLM_KV
+    gguf_set_val_u32(ctx_out.get(), ml.llm_kv(LLM_KV_GENERAL_QUANTIZATION_VERSION).c_str(), GGML_QNT_VERSION);
+    gguf_set_val_u32(ctx_out.get(), ml.llm_kv(LLM_KV_GENERAL_FILE_TYPE).c_str(), ftype);

    // Remove split metadata
    gguf_remove_key(ctx_out.get(), ml.llm_kv(LLM_KV_SPLIT_NO).c_str());
--- a/src/models/glm-dsa.cpp
+++ b/src/models/glm-dsa.cpp
@@ -101,11 +101,11 @@ void llama_model_glm_dsa::load_arch_tensors(llama_model_loader &) {
        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, flags);

        // DSA indexer
-        layer.indexer_k_norm   = create_tensor(tn(LLM_TENSOR_INDEXER_K_NORM,   "weight", i), {hparams.indexer_head_size}, flags);
-        layer.indexer_k_norm_b = create_tensor(tn(LLM_TENSOR_INDEXER_K_NORM,   "bias",   i), {hparams.indexer_head_size}, flags);
-        layer.indexer_proj     = create_tensor(tn(LLM_TENSOR_INDEXER_PROJ,     "weight", i), {n_embd, hparams.indexer_n_head}, flags);
-        layer.indexer_attn_k   = create_tensor(tn(LLM_TENSOR_INDEXER_ATTN_K,   "weight", i), {n_embd, hparams.indexer_head_size}, flags);
-        layer.indexer_attn_q_b = create_tensor(tn(LLM_TENSOR_INDEXER_ATTN_Q_B, "weight", i), {q_lora_rank, hparams.indexer_n_head * hparams.indexer_head_size}, flags);
+        layer.indexer_k_norm   = create_tensor(tn(LLM_TENSOR_INDEXER_K_NORM,   "weight", i), {hparams.indexer_head_size}, flags | TENSOR_NOT_REQUIRED);
+        layer.indexer_k_norm_b = create_tensor(tn(LLM_TENSOR_INDEXER_K_NORM,   "bias",   i), {hparams.indexer_head_size}, flags | TENSOR_NOT_REQUIRED);
+        layer.indexer_proj     = create_tensor(tn(LLM_TENSOR_INDEXER_PROJ,     "weight", i), {n_embd, hparams.indexer_n_head}, flags | TENSOR_NOT_REQUIRED);
+        layer.indexer_attn_k   = create_tensor(tn(LLM_TENSOR_INDEXER_ATTN_K,   "weight", i), {n_embd, hparams.indexer_head_size}, flags | TENSOR_NOT_REQUIRED);
+        layer.indexer_attn_q_b = create_tensor(tn(LLM_TENSOR_INDEXER_ATTN_Q_B, "weight", i), {q_lora_rank, hparams.indexer_n_head * hparams.indexer_head_size}, flags | TENSOR_NOT_REQUIRED);
        if (i < (int) hparams.n_layer_dense_lead) {
            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, flags);
            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, flags);
--- a/tests/test-arg-parser.cpp
+++ b/tests/test-arg-parser.cpp
@@ -10,7 +10,7 @@
 #undef NDEBUG
 #include <cassert>

-int main(void) {
+static void test(void) {
    common_params params;

    printf("test-arg-parser: make sure there is no duplicated arguments in any examples\n\n");
@@ -210,3 +210,13 @@ int main(void) {

    printf("test-arg-parser: all tests OK\n\n");
 }
+
+int main(void) { // runs test(); an escaping std::exception is printed to stderr and turned into exit code 1
+    try {
+        test();
+    } catch (const std::exception & e) {
+        fprintf(stderr, "test-arg-parser: exception: %s\n", e.what());
+        return 1;
+    }
+    return 0;
+}
--- a/tools/server/README-dev.md
+++ b/tools/server/README-dev.md
@@ -180,6 +180,17 @@ That requires `JSON.stringify` when formatted to message content:
 }
 ```

+### Router mode: how the child and the router communicate
+
+After a new child process is spawned using `subprocess`, the child and the router communicate over the child's standard streams (stdout and stderr are combined)
+
+For the direction from child to router:
+- Generic messages are logs; they are forwarded to the router's stdout
+- Special state update messages are prefixed by `cmd_child_to_router:state:`, followed by a JSON string. See `server_models::handle_child_state` for details
+
+For the direction from router to child:
+- When the router sends `cmd_router_to_child:exit`, the child should exit gracefully. If the child is still running after `DEFAULT_STOP_TIMEOUT` seconds, it is force-killed
+
 ### Model management API (router mode)

 Model management API was added via PR [#23976](https://github.com/ggml-org/llama.cpp/pull/23976)
--- a/tools/server/server-context.cpp
+++ b/tools/server/server-context.cpp
@@ -63,11 +63,6 @@ enum slot_state {
    SLOT_STATE_GENERATING,
 };

-enum server_state {
-    SERVER_STATE_LOADING_MODEL,  // Server is starting up, model not fully loaded yet
-    SERVER_STATE_READY,          // Server is ready and model is loaded
-};
-
 struct server_slot {
    int id;

@@ -773,6 +768,8 @@ public:
    // note: chat_params must not be refreshed upon existing sleeping state
    server_chat_params chat_params;

+    server_state_callback_t callback_state = [](server_state, json) -> void {};
+
    server_context_impl() {
        mtmd_helper_log_set(common_log_default_callback, nullptr);
    }
@@ -1244,8 +1241,8 @@ private:
        if (!params_base.model_alias.empty()) {
            // backward compat: use first alias as model name
            model_name = *params_base.model_alias.begin();
-        } else if (!params_base.model.name.empty()) {
-            model_name = params_base.model.name;
+        } else if (!params_base.model.get_name().empty()) {
+            model_name = params_base.model.get_name();
        } else {
            // fallback: derive model name from file name
            auto model_path = std::filesystem::path(params_base.model.path);
@@ -3734,8 +3731,11 @@ struct server_res_generator : server_http_res {
    }
 };

-void server_context::on_sleeping_changed(std::function<void(bool)> callback) {
-    impl->queue_tasks.on_sleeping_state(std::move(callback));
+void server_context::set_state_callback(server_state_callback_t callback) {
+    impl->callback_state = std::move(callback);
+    impl->queue_tasks.on_sleeping_state([this](bool sleeping) {
+        impl->callback_state(sleeping ? SERVER_STATE_SLEEPING : SERVER_STATE_READY, {});
+    });
 }

 // compute the number of tokens before the last user message in the prompt
--- a/tools/server/server-context.h
+++ b/tools/server/server-context.h
@@ -52,6 +52,31 @@ struct server_context_meta {
    uint64_t model_size;
 };

+enum server_state {
+    // SERVER_STATE_DOWNLOADING,
+    SERVER_STATE_LOADING,
+    SERVER_STATE_READY,
+    SERVER_STATE_SLEEPING,
+};
+
+static std::string server_state_to_str(server_state state) { // name sent in child --> router state messages
+    switch (state) {
+        case SERVER_STATE_LOADING:     return "loading";
+        case SERVER_STATE_READY:       return "ready";
+        case SERVER_STATE_SLEEPING:    return "sleeping";
+        default: GGML_ASSERT(false && "invalid server_state");
+    }
+}
+
+static server_state server_state_from_str(const std::string & str) { // inverse of server_state_to_str(); throws std::invalid_argument on unknown names
+    if (str == "loading")     return SERVER_STATE_LOADING;
+    if (str == "ready")       return SERVER_STATE_READY;
+    if (str == "sleeping")    return SERVER_STATE_SLEEPING;
+    throw std::invalid_argument("invalid server_state string: " + str); // str comes from a child process message: let the caller reject it instead of aborting
+}
+
+using server_state_callback_t = std::function<void(server_state, json /* payload */)>;
+
 struct server_context {
    std::unique_ptr<server_context_impl> impl;

@@ -79,9 +104,8 @@ struct server_context {
    // not thread-safe, should only be used from the main thread
    server_context_meta get_meta() const;

-    // register a callback to be called when sleeping state changes
-    // must be set before load_model() is called
-    void on_sleeping_changed(std::function<void(bool)> callback);
+    // note: must be set before load_model() is called
+    void set_state_callback(server_state_callback_t callback);
 };


--- a/tools/server/server-cors-proxy.h
+++ b/tools/server/server-cors-proxy.h
@@ -7,9 +7,18 @@
 #include <unordered_set>
 #include <list>
 #include <map>
+#include <algorithm>
+#include <cctype>

 #include "server-http.h"

+static std::string proxy_header_to_lower(std::string header) { // returns a lowercased copy of a header name
+    for (char & ch : header) {
+        ch = static_cast<char>(std::tolower(static_cast<unsigned char>(ch))); // via unsigned char: tolower is undefined for negative values
+    }
+    return header;
+}
+
 static server_http_res_ptr proxy_request(const server_http_req & req, std::string method) {
    std::string target_url = req.get_param("url");
    common_http_url parsed_url = common_http_parse_url(target_url);
@@ -33,11 +42,18 @@ static server_http_res_ptr proxy_request(const server_http_req & req, std::strin
    SRV_INF("proxying %s request to %s://%s:%i%s\n", method.c_str(), parsed_url.scheme.c_str(), parsed_url.host.c_str(), parsed_url.port, parsed_url.path.c_str());

    std::map<std::string, std::string> headers;
+    const std::string proxy_header_prefix = "x-llama-server-proxy-header-";
    for (auto [key, value] : req.headers) {
-        auto new_key = key;
-        if (string_starts_with(new_key, "x-proxy-header-")) {
-            string_replace_all(new_key, "x-proxy-header-", "");
+        const std::string lowered_key = proxy_header_to_lower(key);
+        if (!string_starts_with(lowered_key, proxy_header_prefix)) {
+            continue;
        }
+
+        auto new_key = key.substr(proxy_header_prefix.size());
+        if (new_key.empty()) {
+            continue;
+        }
+
        headers[new_key] = value;
    }

--- a/tools/server/server-models.cpp
+++ b/tools/server/server-models.cpp
@@ -1,5 +1,6 @@
 #include "server-common.h"
 #include "server-models.h"
+#include "server-context.h"

 #include "build-info.h"
 #include "preset.h"
@@ -44,9 +45,7 @@ extern char **environ;
 #define DEFAULT_STOP_TIMEOUT 10 // seconds

 #define CMD_ROUTER_TO_CHILD_EXIT  "cmd_router_to_child:exit"
-#define CMD_CHILD_TO_ROUTER_READY "cmd_child_to_router:ready" // also sent when waking up from sleep
-#define CMD_CHILD_TO_ROUTER_SLEEP "cmd_child_to_router:sleep"
-#define CMD_CHILD_TO_ROUTER_INFO  "cmd_child_to_router:info:" // followed by json string
+#define CMD_CHILD_TO_ROUTER_STATE "cmd_child_to_router:state:" // followed by json string

 // address for child process, this is needed because router may run on 0.0.0.0
 // ref: https://github.com/ggml-org/llama.cpp/issues/17862
@@ -904,12 +903,8 @@ void server_models::load(const std::string & name) {
                while (fgets(buffer, vec_buf.size(), stdout_file) != nullptr) {
                    LOG("[%5d] %s", port, buffer);
                    std::string str(buffer);
-                    if (string_starts_with(buffer, CMD_CHILD_TO_ROUTER_READY)) {
-                        this->update_status(name, SERVER_MODEL_STATUS_LOADED, 0);
-                    } else if (string_starts_with(buffer, CMD_CHILD_TO_ROUTER_INFO)) {
-                        this->update_loaded_info(name, str);
-                    } else if (string_starts_with(buffer, CMD_CHILD_TO_ROUTER_SLEEP)) {
-                        this->update_status(name, SERVER_MODEL_STATUS_SLEEPING, 0);
+                    if (string_starts_with(buffer, CMD_CHILD_TO_ROUTER_STATE)) {
+                        this->handle_child_state(name, str);
                    }
                }
            } else {
@@ -976,7 +971,10 @@ void server_models::load(const std::string & name) {
        subprocess_destroy(&child_proc->get());

        // update status and exit code
-        this->update_status(name, SERVER_MODEL_STATUS_UNLOADED, exit_code);
+        this->update_status(name, {
+            SERVER_MODEL_STATUS_UNLOADED,
+            exit_code
+        });
        SRV_INF("instance name=%s exited with status %d\n", name.c_str(), exit_code);
    });

@@ -1016,7 +1014,8 @@ struct server_models_download_res : public common_download_callback {
            common_download_model(model, opts);
            is_ok = true;
        } catch (const std::exception & e) {
-            SRV_ERR("download failed for model name=%s: %s\n", model.name.c_str(), e.what());
+            auto model_name = model.get_name();
+            SRV_ERR("download failed for model name=%s: %s\n", model_name.c_str(), e.what());
            is_ok = false;
        }
        return is_ok;
@@ -1036,7 +1035,7 @@ struct server_models_download_res : public common_download_callback {
 };

 void server_models::download(common_params_model && model, common_download_opts && opts) {
-    std::string name = model.name;
+    std::string name = model.get_name();
    GGML_ASSERT(name == model.hf_repo);

    std::unique_lock<std::mutex> lk(mutex);
@@ -1064,9 +1063,10 @@ void server_models::download(common_params_model && model, common_download_opts
    inst.th = std::thread([this, dl = std::move(dl)]() {
        dl->opts.callback = dl.get();
        bool ok = dl->run();
+        auto model_name = dl->model.get_name();
        SRV_INF("download finished for model name=%s with status=%s\n",
-                    dl->model.name.c_str(), ok ? "success" : "failure");
-        update_download_progress(dl->model.name, {}, true, ok);
+                    model_name.c_str(), ok ? "success" : "failure");
+        update_download_progress(model_name, {}, true, ok);
        // need_reload is set inside update_download_progress under the mutex;
        // the next load_models() call will clean up this instance
    });
@@ -1130,21 +1130,27 @@ void server_models::unload_all() {
    }
 }

-void server_models::update_status(const std::string & name, server_model_status status, int exit_code) {
+void server_models::update_status(const std::string & name, const update_status_args & args) {
    std::unique_lock<std::mutex> lk(mutex);
    auto it = mapping.find(name);
    if (it != mapping.end()) {
        auto & meta = it->second.meta;
-        meta.status    = status;
-        meta.exit_code = exit_code;
+        meta.status      = args.status;
+        meta.exit_code   = args.exit_code;
+        if (!args.loaded_info.is_null()) {
+            meta.loaded_info = args.loaded_info;
+        }
    }
    // broadcast status change to SSE
    {
        json data = {
-            {"status", server_model_status_to_string(status)},
+            {"status", server_model_status_to_string(args.status)},
        };
-        if (status == SERVER_MODEL_STATUS_UNLOADED) {
-            data["exit_code"] = exit_code;
+        if (args.status == SERVER_MODEL_STATUS_UNLOADED) {
+            data["exit_code"] = args.exit_code;
+        }
+        if (!args.loaded_info.is_null()) {
+            data["info"] = args.loaded_info;
        }
        // note: notify_sse doesn't acquire the lock, so no deadlock here
        notify_sse("status_change", name, data);
@@ -1152,29 +1158,6 @@ void server_models::update_status(const std::string & name, server_model_status
    cv.notify_all();
 }

-void server_models::update_loaded_info(const std::string & name, std::string & raw_info) {
-    if (!string_starts_with(raw_info, CMD_CHILD_TO_ROUTER_INFO)) {
-        SRV_WRN("invalid loaded info format from child for model name=%s: %s\n", name.c_str(), raw_info.c_str());
-        return;
-    }
-
-    json info;
-    try {
-        info = json::parse(raw_info.substr(strlen(CMD_CHILD_TO_ROUTER_INFO)));
-    } catch (const std::exception & e) {
-        SRV_WRN("failed to parse loaded info from child for model name=%s: %s\n", name.c_str(), e.what());
-        return;
-    }
-
-    std::unique_lock<std::mutex> lk(mutex);
-    auto it = mapping.find(name);
-    if (it != mapping.end()) {
-        auto & meta = it->second.meta;
-        meta.loaded_info = info;
-    }
-    cv.notify_all();
-}
-
 void server_models::update_download_progress(const std::string & name, const common_download_progress & progress, bool done, bool ok) {
    json curr;
    {
@@ -1323,21 +1306,54 @@ server_http_res_ptr server_models::proxy_request(const server_http_req & req, co
    return proxy;
 }

-bool server_models::is_child_server() {
+void server_models::handle_child_state(const std::string & name, const std::string & raw_input) {
+    server_state state;
+    json payload;
+
+    try {
+        json data = json::parse(raw_input.substr(strlen(CMD_CHILD_TO_ROUTER_STATE)));
+        state = server_state_from_str(json_value(data, "state", std::string()));
+        payload = json_value(data, "payload", json{});
+    } catch (const std::exception & e) {
+        SRV_ERR("failed to parse child state update for name=%s: %s\n", name.c_str(), e.what());
+        return;
+    }
+
+    switch (state) {
+        case SERVER_STATE_LOADING:
+            {
+                // do nothing for now
+                // TODO: report loading progress for first load and wakeup from sleep
+            } break;
+        case SERVER_STATE_READY:
+            {
+                update_status(name, {
+                    SERVER_MODEL_STATUS_LOADED,
+                    0,
+                    // note: payload can be empty if this is a wakeup from sleep
+                    payload.size() > 0 ? payload : nullptr
+                });
+            } break;
+        case SERVER_STATE_SLEEPING:
+            {
+                update_status(name, { SERVER_MODEL_STATUS_SLEEPING });
+            } break;
+        default:
+            // should never happen, but just in case
+            GGML_ASSERT(false && "unexpected state from child server");
+    }
+}
+
+//
+// server_child
+//
+
+bool server_child::is_child() {
    const char * router_port = std::getenv("LLAMA_SERVER_ROUTER_PORT");
    return router_port != nullptr;
 }

-std::thread server_models::setup_child_server(const std::function<void(int)> & shutdown_handler, const json & model_info) {
-    // send a notification to the router server that a model instance is ready
-    common_log_pause(common_log_main());
-    fflush(stdout);
-    fprintf(stdout, "%s\n", CMD_CHILD_TO_ROUTER_READY);
-    fflush(stdout);
-    fprintf(stdout, "%s%s\n", CMD_CHILD_TO_ROUTER_INFO, safe_json_to_str(model_info).c_str());
-    fflush(stdout);
-    common_log_resume(common_log_main());
-
+std::thread server_child::setup(const std::function<void(int)> & shutdown_handler) {
    // setup thread for monitoring stdin
    return std::thread([shutdown_handler]() {
        // wait for EOF on stdin
@@ -1363,10 +1379,14 @@ std::thread server_models::setup_child_server(const std::function<void(int)> & s
    });
 }

-void server_models::notify_router_sleeping_state(bool is_sleeping) {
+void server_child::notify_to_router(const std::string & state, const json & payload) {
+    json data = {
+        {"state", state},
+        {"payload", payload},
+    };
    common_log_pause(common_log_main());
    fflush(stdout);
-    fprintf(stdout, "%s\n", is_sleeping ? CMD_CHILD_TO_ROUTER_SLEEP : CMD_CHILD_TO_ROUTER_READY);
+    fprintf(stdout, "%s%s\n", CMD_CHILD_TO_ROUTER_STATE, safe_json_to_str(data).c_str());
    fflush(stdout);
    common_log_resume(common_log_main());
 }
@@ -1644,7 +1664,6 @@ void server_models_routes::init_routes() {
        common_params_model model;
        common_download_opts opts;

-        model.name           = name;
        model.hf_repo        = name;
        opts.bearer_token    = params.hf_token;
        opts.download_mmproj = true;
--- a/tools/server/server-models.h
+++ b/tools/server/server-models.h
@@ -171,8 +171,12 @@ public:
    void download(common_params_model && model, common_download_opts && opts);

    // update the status of a model instance (thread-safe)
-    void update_status(const std::string & name, server_model_status status, int exit_code);
-    void update_loaded_info(const std::string & name, std::string & raw_info);
+    struct update_status_args {
+        server_model_status status;
+        int exit_code = 0; // only valid if status == UNLOADED
+        json loaded_info = nullptr;
+    };
+    void update_status(const std::string & name, const update_status_args & args);
    void update_download_progress(const std::string & name, const common_download_progress & progress, bool done, bool ok = true);

    // remove a cache model from disk and update the list (thread-safe)
@@ -193,15 +197,27 @@ public:
    // proxy an HTTP request to the model instance
    server_http_res_ptr proxy_request(const server_http_req & req, const std::string & method, const std::string & name, bool update_last_used);

+    // handle message sent from server_child::notify_to_router()
+    // raw input must start with CMD_CHILD_TO_ROUTER_STATE, followed by a JSON string
+    // this function is not thread-safe, must be called from instance's monitoring thread
+    // payload per state:
+    //     state = loading     -> payload = {} (TODO: add progress info)
+    //     state = ready       -> payload = model_info (json), or {} if wakeup from sleeping
+    //     state = sleeping    -> payload = {}
+    void handle_child_state(const std::string & name, const std::string & raw_input);
+};
+
+struct server_child {
    // return true if the current process is a child server instance
-    static bool is_child_server();
+    bool is_child();

-    // notify the router server that a model instance is ready
+    // register the shutdown_handler to be called by the router
    // return the monitoring thread (to be joined by the caller)
-    static std::thread setup_child_server(const std::function<void(int)> & shutdown_handler, const json & model_info);
+    std::thread setup(const std::function<void(int)> & shutdown_handler);

-    // notify the router server that the sleeping state has changed
-    static void notify_router_sleeping_state(bool sleeping);
+    // notify router server for status changes (e.g. loading, downloading, sleeping, etc.)
+    // message will be handled by server_models::handle_child_state() on the router side
+    void notify_to_router(const std::string & state_name, const json & payload);
 };

 struct server_models_routes {
--- a/tools/server/server.cpp
+++ b/tools/server/server.cpp
@@ -90,8 +90,10 @@ int llama_server(int argc, char ** argv) {
    llama_numa_init(params.numa);

    // router server never loads a model and must not touch the GPU
+    const bool is_router_server = params.model.path.empty()
+                               && params.model.hf_repo.empty();
+
    // skip device enumeration so the CUDA primary context stays uncreated
-    const bool is_router_server = params.model.path.empty();
    common_params_print_info(params, !is_router_server);

    if (!is_router_server) {
@@ -113,8 +115,9 @@ int llama_server(int argc, char ** argv) {
    }

    // for consistency between server router mode and single-model mode, we set the same model name as alias
-    if (params.model_alias.empty() && !params.model.name.empty()) {
-        params.model_alias.insert(params.model.name);
+    auto model_name = params.model.get_name();
+    if (params.model_alias.empty() && !model_name.empty()) {
+        params.model_alias.insert(model_name);
    }

    // struct that contains llama context and inference
@@ -255,6 +258,7 @@ int llama_server(int argc, char ** argv) {
    // Start the server
    //

+    server_child child; // only used in non-router mode
    std::function<void()> clean_up;

    if (is_router_server) {
@@ -300,15 +304,16 @@ int llama_server(int argc, char ** argv) {
            return 1;
        }

-        // load the model
-        SRV_INF("%s", "loading model\n");
-
-        if (server_models::is_child_server()) {
-            ctx_server.on_sleeping_changed([&](bool sleeping) {
-                server_models::notify_router_sleeping_state(sleeping);
+        // setup communication child --> router if necessary
+        if (child.is_child()) {
+            ctx_server.set_state_callback([&](server_state state, json payload) {
+                child.notify_to_router(server_state_to_str(state), payload);
            });
        }

+        // load the model
+        SRV_INF("%s", "loading model\n");
+
        if (!ctx_server.load_model(params)) {
            clean_up();
            if (ctx_http.thread.joinable()) {
@@ -365,9 +370,9 @@ int llama_server(int argc, char ** argv) {

        // optionally, notify router server that this instance is ready
        std::thread monitor_thread;
-        if (server_models::is_child_server()) {
-            json model_info = routes.get_model_info();
-            monitor_thread = server_models::setup_child_server(shutdown_handler, model_info);
+        if (child.is_child()) {
+            monitor_thread = child.setup(shutdown_handler);
+            child.notify_to_router(server_state_to_str(SERVER_STATE_READY), routes.get_model_info());
        }

        // this call blocks the main thread until queue_tasks.terminate() is called
--- a/tools/server/tests/unit/test_security.py
+++ b/tools/server/tests/unit/test_security.py
@@ -1,6 +1,8 @@
 import pytest
 from openai import OpenAI
 from utils import *
+import threading
+from http.server import BaseHTTPRequestHandler, ThreadingHTTPServer

 server = ServerPreset.tinyllama2()

@@ -105,6 +107,49 @@ def test_cors_options(origin: str, cors_header: str, cors_header_value: str):
    assert res.headers[cors_header] == cors_header_value


+def test_cors_proxy_only_forwards_explicit_proxy_headers():
+    class CaptureHeadersHandler(BaseHTTPRequestHandler):
+        def do_GET(self):
+            self.server.captured_headers = dict(self.headers)
+            self.send_response(200)
+            self.end_headers()
+            self.wfile.write(b"ok")
+
+        def log_message(self, format, *args):
+            pass
+
+    target = ThreadingHTTPServer(("127.0.0.1", 0), CaptureHeadersHandler)
+    target.captured_headers = {}
+    target_thread = threading.Thread(target=target.serve_forever, daemon=True)
+    target_thread.start()
+
+    try:
+        server = ServerPreset.tinyllama2()
+        server.api_key = TEST_API_KEY
+        server.ui_mcp_proxy = True
+        server.start()
+
+        res = server.make_request("GET", f"/cors-proxy?url=http://127.0.0.1:{target.server_port}/capture", headers={
+            "Authorization": f"Bearer {TEST_API_KEY}",
+            "Proxy-Authorization": "Basic secret",
+            "X-Api-Key": TEST_API_KEY,
+            "Cookie": "session=secret",
+            "x-llama-server-proxy-header-accept": "application/json",
+            "x-llama-server-proxy-header-authorization": "Bearer explicit",
+        })
+
+        assert res.status_code == 200
+        captured = {key.lower(): value for key, value in target.captured_headers.items()}
+        assert captured["accept"] == "application/json"
+        assert captured["authorization"] == "Bearer explicit"
+        assert "proxy-authorization" not in captured
+        assert "x-api-key" not in captured
+        assert "cookie" not in captured
+    finally:
+        target.shutdown()
+        target.server_close()
+
+
@pytest.mark.parametrize(
    "media_path, image_url, success",
    [
--- a/tools/ui/src/lib/constants/mcp.ts
+++ b/tools/ui/src/lib/constants/mcp.ts
@@ -51,6 +51,9 @@ export const EXPECTED_THEMED_ICON_PAIR_COUNT = 2;
 /** CORS proxy URL query parameter name */
 export const CORS_PROXY_URL_PARAM = 'url';

+/** Header prefix for headers that should be forwarded by the CORS proxy */
+export const CORS_PROXY_HEADER_PREFIX = 'x-llama-server-proxy-header-';
+
 /** Number of trailing characters to keep visible when partially redacting mcp-session-id */
 export const MCP_SESSION_ID_VISIBLE_CHARS = 5;

--- a/tools/ui/src/lib/services/mcp.service.ts
+++ b/tools/ui/src/lib/services/mcp.service.ts
@@ -16,6 +16,7 @@ import {
 	DEFAULT_MCP_CONFIG,
 	DEFAULT_CLIENT_VERSION,
 	DEFAULT_IMAGE_MIME_TYPE,
+	CORS_PROXY_HEADER_PREFIX,
 	MCP_PARTIAL_REDACT_HEADERS,
 	CORS_PROXY_ENDPOINT
 } from '$lib/constants';
@@ -133,6 +134,20 @@ export class MCPService {
 		return details;
 	}

+	private static addRequestHeaders(
+		requestHeaders: Headers,
+		headers: HeadersInit,
+		useProxy: boolean
+	) {
+		for (const [key, value] of new Headers(headers).entries()) {
+			const proxiedKey =
+				useProxy && !key.toLowerCase().startsWith(CORS_PROXY_HEADER_PREFIX)
+					? `${CORS_PROXY_HEADER_PREFIX}${key}`
+					: key;
+			requestHeaders.set(proxiedKey, value);
+		}
+	}
+
 	private static summarizeError(error: unknown): Record<string, unknown> {
 		if (error instanceof Error) {
 			return {
@@ -271,15 +286,11 @@ export class MCPService {
 				const requestHeaders = new Headers(baseInit.headers);

 				if (typeof Request !== 'undefined' && input instanceof Request) {
-					for (const [key, value] of input.headers.entries()) {
-						requestHeaders.set(key, value);
-					}
+					this.addRequestHeaders(requestHeaders, input.headers, useProxy);
 				}

 				if (init?.headers) {
-					for (const [key, value] of new Headers(init.headers).entries()) {
-						requestHeaders.set(key, value);
-					}
+					this.addRequestHeaders(requestHeaders, init.headers, useProxy);
 				}

 				const request = this.createDiagnosticRequestDetails(
--- a/tools/ui/src/lib/utils/api-headers.ts
+++ b/tools/ui/src/lib/utils/api-headers.ts
@@ -1,5 +1,5 @@
 import { config } from '$lib/stores/settings.svelte';
-import { REDACTED_HEADERS } from '$lib/constants';
+import { CORS_PROXY_HEADER_PREFIX, REDACTED_HEADERS } from '$lib/constants';
 import { redactValue } from './redact';

 /**
@@ -52,11 +52,20 @@ export function sanitizeHeaders(

 	for (const [key, value] of normalized.entries()) {
 		const normalizedKey = key.toLowerCase();
-		const partialChars = partialRedactHeaders?.get(normalizedKey);
+		const unproxiedKey = normalizedKey.startsWith(CORS_PROXY_HEADER_PREFIX)
+			? normalizedKey.slice(CORS_PROXY_HEADER_PREFIX.length)
+			: normalizedKey;
+		const partialChars =
+			partialRedactHeaders?.get(normalizedKey) ?? partialRedactHeaders?.get(unproxiedKey);

 		if (partialChars !== undefined) {
 			sanitized[key] = redactValue(value, partialChars);
-		} else if (REDACTED_HEADERS.has(normalizedKey) || redactedHeaders.has(normalizedKey)) {
+		} else if (
+			REDACTED_HEADERS.has(normalizedKey) ||
+			REDACTED_HEADERS.has(unproxiedKey) ||
+			redactedHeaders.has(normalizedKey) ||
+			redactedHeaders.has(unproxiedKey)
+		) {
 			sanitized[key] = redactValue(value);
 		} else {
 			sanitized[key] = value;
--- a/tools/ui/src/lib/utils/cors-proxy.ts
+++ b/tools/ui/src/lib/utils/cors-proxy.ts
@@ -3,7 +3,11 @@
 */

 import { base } from '$app/paths';
-import { CORS_PROXY_ENDPOINT, CORS_PROXY_URL_PARAM } from '$lib/constants';
+import {
+	CORS_PROXY_ENDPOINT,
+	CORS_PROXY_HEADER_PREFIX,
+	CORS_PROXY_URL_PARAM
+} from '$lib/constants';

 /**
 * Build a proxied URL that routes through llama-server's CORS proxy.
@@ -28,7 +32,7 @@ export function buildProxiedHeaders(headers: Record<string, string>): Record<str
 	const proxiedHeaders: Record<string, string> = {};

 	for (const [key, value] of Object.entries(headers)) {
-		proxiedHeaders[`x-proxy-header-${key}`] = value;
+		proxiedHeaders[`${CORS_PROXY_HEADER_PREFIX}${key}`] = value;
 	}

 	return proxiedHeaders;
--- a/tools/ui/tests/e2e/pwa.e2e.ts
+++ b/tools/ui/tests/e2e/pwa.e2e.ts
@@ -39,8 +39,8 @@ test.describe('PWA Service Worker', () => {
 		const swContent = await swResponse.text();

 		// Precache contains SvelteKit content-hashed bundle paths
-		expect(swContent).toMatch(/"_app\/immutable\/bundle\.[a-zA-Z0-9-]+\.js"/);
-		expect(swContent).toMatch(/"_app\/immutable\/assets\/bundle\.[a-zA-Z0-9-]+\.css"/);
+		expect(swContent).toMatch(/"_app\/immutable\/bundle\.[a-zA-Z0-9_-]+\.js"/);
+		expect(swContent).toMatch(/"_app\/immutable\/assets\/bundle\.[a-zA-Z0-9_-]+\.css"/);
 		expect(swContent).toMatch(/"manifest\.webmanifest"/);
 		expect(swContent).toMatch(/"_app\/version\.json"/);
 		expect(swContent).toMatch(/NavigationRoute/);
@@ -99,8 +99,8 @@ test.describe('PWA Service Worker', () => {
 		const html = await response.text();

 		// SvelteKit outputs content-hashed bundle names in _app/immutable/
-		expect(html).toMatch(/href="(\.\/|\/)_app\/immutable\/bundle\.[a-zA-Z0-9-]+\.js"/);
-		expect(html).toMatch(/href="(\.\/|\/)_app\/immutable\/assets\/bundle\.[a-zA-Z0-9-]+\.css"/);
-		expect(html).toMatch(/import\("(\.\/|\/)_app\/immutable\/bundle\.[a-zA-Z0-9-]+\.js"\)/);
+		expect(html).toMatch(/href="(\.\/|\/)_app\/immutable\/bundle\.[a-zA-Z0-9_-]+\.js"/);
+		expect(html).toMatch(/href="(\.\/|\/)_app\/immutable\/assets\/bundle\.[a-zA-Z0-9_-]+\.css"/);
+		expect(html).toMatch(/import\("(\.\/|\/)_app\/immutable\/bundle\.[a-zA-Z0-9_-]+\.js"\)/);
 	});
 });
--- a/tools/ui/tests/unit/mcp-service.test.ts
+++ b/tools/ui/tests/unit/mcp-service.test.ts
@@ -3,6 +3,7 @@ import { Client } from '@modelcontextprotocol/sdk/client';
 import { MCPService } from '$lib/services/mcp.service';
 import { MCPConnectionPhase, MCPTransportType } from '$lib/enums';
 import type { MCPConnectionLog, MCPServerConfig } from '$lib/types';
+import { CORS_PROXY_HEADER_PREFIX } from '$lib/constants';

 type DiagnosticFetchFactory = (
 	serverName: string,
@@ -16,11 +17,12 @@ type DiagnosticFetchFactory = (
 const createDiagnosticFetch = (
 	config: MCPServerConfig,
 	onLog?: (log: MCPConnectionLog) => void,
-	baseInit: RequestInit = {}
+	baseInit: RequestInit = {},
+	useProxy = false
 ) =>
 	(
 		MCPService as unknown as { createDiagnosticFetch: DiagnosticFetchFactory }
-	).createDiagnosticFetch('test-server', config, baseInit, new URL(config.url), false, onLog);
+	).createDiagnosticFetch('test-server', config, baseInit, new URL(config.url), useProxy, onLog);

 describe('MCPService', () => {
 	afterEach(() => {
@@ -94,6 +96,64 @@ describe('MCPService', () => {
 		});
 	});

+	it('wraps dynamic request headers when using the CORS proxy', async () => {
+		const logs: MCPConnectionLog[] = [];
+		const proxiedAuthToken = `${CORS_PROXY_HEADER_PREFIX}x-auth-token`;
+		const proxiedContentType = `${CORS_PROXY_HEADER_PREFIX}content-type`;
+		const proxiedSessionId = `${CORS_PROXY_HEADER_PREFIX}mcp-session-id`;
+		const response = new Response('{}', {
+			status: 200,
+			headers: { 'content-type': 'application/json' }
+		});
+		const fetchMock = vi.fn().mockResolvedValue(response);
+
+		vi.stubGlobal('fetch', fetchMock);
+
+		const config: MCPServerConfig = {
+			url: 'https://example.com/mcp',
+			transport: MCPTransportType.STREAMABLE_HTTP,
+			useProxy: true
+		};
+
+		const controller = createDiagnosticFetch(
+			config,
+			(log) => logs.push(log),
+			{
+				headers: {
+					authorization: 'Bearer llama-server-key',
+					[proxiedAuthToken]: 'target-token'
+				}
+			},
+			true
+		);
+
+		await controller.fetch('http://localhost:8080/cors-proxy?url=https%3A%2F%2Fexample.com%2Fmcp', {
+			method: 'POST',
+			headers: {
+				'content-type': 'application/json',
+				'mcp-session-id': 'session-request-12345'
+			},
+			body: '{}'
+		});
+
+		const sentHeaders = fetchMock.mock.calls[0]?.[1]?.headers as Headers;
+		expect(sentHeaders.get('authorization')).toBe('Bearer llama-server-key');
+		expect(sentHeaders.get(proxiedAuthToken)).toBe('target-token');
+		expect(sentHeaders.get(proxiedContentType)).toBe('application/json');
+		expect(sentHeaders.get(proxiedSessionId)).toBe('session-request-12345');
+		expect(sentHeaders.has('content-type')).toBe(false);
+		expect(sentHeaders.has('mcp-session-id')).toBe(false);
+		expect(logs[0].details).toMatchObject({
+			request: {
+				headers: {
+					authorization: '[redacted]',
+					[proxiedAuthToken]: '[redacted]',
+					[proxiedSessionId]: '....12345'
+				}
+			}
+		});
+	});
+
 	it('partially redacts mcp-session-id in diagnostic request and response logs', async () => {
 		const logs: MCPConnectionLog[] = [];
 		const response = new Response('{}', {
--- a/tools/ui/tests/unit/sanitize-headers.test.ts
+++ b/tools/ui/tests/unit/sanitize-headers.test.ts
@@ -1,5 +1,6 @@
 import { describe, expect, it } from 'vitest';
 import { sanitizeHeaders } from '$lib/utils/api-headers';
+import { CORS_PROXY_HEADER_PREFIX } from '$lib/constants';

 describe('sanitizeHeaders', () => {
 	it('returns empty object for undefined input', () => {
@@ -52,4 +53,21 @@ describe('sanitizeHeaders', () => {
 		const result = sanitizeHeaders(headers, ['X-CUSTOM-TOKEN']);
 		expect(result['x-custom-token']).toBe('[redacted]');
 	});
+
+	it('redacts proxied sensitive and custom target headers', () => {
+		const proxiedAuthorization = `${CORS_PROXY_HEADER_PREFIX}authorization`; // names carry the proxy prefix
+		const proxiedSessionId = `${CORS_PROXY_HEADER_PREFIX}mcp-session-id`;
+		const proxiedVendorKey = `${CORS_PROXY_HEADER_PREFIX}x-vendor-key`;
+		const headers = new Headers({
+			[proxiedAuthorization]: 'Bearer secret',
+			[proxiedSessionId]: 'session-12345',
+			[proxiedVendorKey]: 'vendor-secret'
+		});
+		const partial = new Map([['mcp-session-id', 5]]); // keyed by the unprefixed header name
+		const result = sanitizeHeaders(headers, ['x-vendor-key'], partial); // custom list is unprefixed too
+
+		expect(result[proxiedAuthorization]).toBe('[redacted]'); // not in the custom list, still redacted
+		expect(result[proxiedSessionId]).toBe('....12345'); // only the last 5 characters stay visible
+		expect(result[proxiedVendorKey]).toBe('[redacted]'); // matched through the custom list entry
+	});
 });
Author	SHA1	Message	Date
Adrien Gallouët	84de01a1f1	llama : use LLM_KV for quantization_version & file_type (#24802 ) Signed-off-by: Adrien Gallouët <angt@huggingface.co>	2026-06-20 20:07:01 +02:00
Xuan-Son Nguyen	75f460ac28	arg: try fixing test-args-parser randomly fails (#24826 ) * arg: try fixing test-args-parser randomly fails * return ref * try triggering the workflow * exception wrapper * wip * test * test 2 * arg: guard win32 utf8 argv override make_utf8_argv rebuilds argv from GetCommandLineW to fix utf8 handling of non ascii arguments on windows. the override runs unconditionally inside common_params_parse, so it also clobbers a programmatic argv passed by a caller. test-arg-parser builds a synthetic argv but then sees the real process command line instead, the model argument is never parsed, and the assert that expects success aborts via fastfail (0xC0000409). this shows up as a random failure in the openvino windows workflow. only override argv when its length matches the caller argc, so the utf8 repair still applies to real binaries while a programmatic argv stays intact. --------- Co-authored-by: Pascal <admin@serveurperso.com>	2026-06-20 19:45:27 +02:00
Muhammad Salem	8452824611	release: add missing link for win opencl adreno arm64 (#24809 )	2026-06-20 23:08:59 +08:00
Matti4	e27f308597	server: avoid forwarding auth headers in CORS proxy (#24373 ) * server: avoid forwarding auth headers in CORS proxy * format * fix test * fix e2e test --------- Co-authored-by: Xuan Son Nguyen <son@huggingface.co>	2026-06-20 15:34:47 +02:00
Aldehir Rojas	67e9fd3b74	docker : prebuild web UI for s390x build [no release] (#24829 )	2026-06-20 05:54:42 -05:00
davidrhodus	796f41bedc	model : glm-dsa load DSA indexer tensors as optional (#24770 ) GLM-5.2 ships the DSA "lightning indexer" on only a subset of layers (the "full" layers; others omit it), but the GLM_DSA loader created the five indexer tensors on every layer as required, so loading any GLM-5.2 GGUF failed with e.g. `missing tensor 'blk.3.indexer.k_norm.weight'`. GLM_DSA's graph is llama_model_deepseek2::graph (plain MLA) and does not use the indexer tensors (indexer runtime not yet implemented), so they are loaded-but-unused. Marking them TENSOR_NOT_REQUIRED lets layers without an indexer load as nullptr and the model runs as full MLA attention. DeepSeek-V3.2 (uniform indexer on all layers) is unaffected.	2026-06-20 13:48:24 +03:00
Adrien Gallouët	37a77fb057	ggml : optimize AMX (#24806 ) Flatten the partition over n_batch * M so every thread participates in the quantization \| CPU \| Model \| Test \| t/s OLD \| t/s NEW \| Speedup \| \|:--------------------------------\|:------------------------------\|:-------\|----------:\|----------:\|----------:\| \| Intel(R) Xeon(R) Platinum 8488C \| qwen35 0.8B IQ4_NL - 4.5 bpw \| pp512 \| 730.71 \| 779.86 \| 1.07 \| \| Intel(R) Xeon(R) Platinum 8488C \| qwen35 0.8B IQ4_NL - 4.5 bpw \| tg128 \| 87.88 \| 86.79 \| 0.99 \| \| Intel(R) Xeon(R) Platinum 8488C \| qwen35 0.8B IQ4_XS - 4.25 bpw \| pp512 \| 725.09 \| 1023.31 \| 1.41 \| \| Intel(R) Xeon(R) Platinum 8488C \| qwen35 0.8B IQ4_XS - 4.25 bpw \| tg128 \| 83.64 \| 83.62 \| 1.00 \| \| Intel(R) Xeon(R) Platinum 8488C \| qwen35 0.8B Q4_0 \| pp512 \| 820.51 \| 924.05 \| 1.13 \| \| Intel(R) Xeon(R) Platinum 8488C \| qwen35 0.8B Q4_0 \| tg128 \| 90.59 \| 92.46 \| 1.02 \| \| Intel(R) Xeon(R) Platinum 8488C \| qwen35 0.8B Q4_1 \| pp512 \| 776.88 \| 872.79 \| 1.12 \| \| Intel(R) Xeon(R) Platinum 8488C \| qwen35 0.8B Q4_1 \| tg128 \| 89.39 \| 90.94 \| 1.02 \| \| Intel(R) Xeon(R) Platinum 8488C \| qwen35 0.8B Q4_K_M \| pp512 \| 719.28 \| 1009.27 \| 1.40 \| \| Intel(R) Xeon(R) Platinum 8488C \| qwen35 0.8B Q4_K_M \| tg128 \| 80.62 \| 80.86 \| 1.00 \| \| Intel(R) Xeon(R) Platinum 8488C \| qwen35 0.8B Q4_K_S \| pp512 \| 732.29 \| 1077.29 \| 1.47 \| \| Intel(R) Xeon(R) Platinum 8488C \| qwen35 0.8B Q4_K_S \| tg128 \| 86.42 \| 83.53 \| 0.97 \| Signed-off-by: Adrien Gallouët <angt@huggingface.co>	2026-06-20 13:43:06 +03:00
Sigbjørn Skjæret	f4043fec01	convert : more consistent handling of rope_parameters (#24833 )	2026-06-20 13:42:36 +03:00
Masashi Yoshimura	f449e05537	ggml-webgpu: add adapter toggles for F16 on Vulkan + NVIDIA	2026-06-20 08:12:32 +09:00
Xuan-Son Nguyen	2b686a9120	server: refactor child --> router communication (#24821 ) * server: refactor child --> router communication * fix wakeup case * add docs * improve update_status() * nits	2026-06-20 01:02:26 +02:00