common : allow --offline in llama download (#25091)

Expose the existing --offline flag to `llama download` so a script can run it to check whether a model is already cached and ready to be served without touching the network.

Also fix a latent use-after-free in the URL-task on_done callback: first_path is block-scoped and was captured by reference, but the callback is invoked after the block ends.

Signed-off-by: Adrien Gallouët <angt@huggingface.co>
logs : reduce v2 (#25078)
2026-06-29 02:33:03 +02:00 · 2026-06-28 12:34:11 +02:00 · 2026-06-28 08:52:15 +03:00 · 2026-06-27 15:36:06 -07:00 · 2026-06-27 17:46:21 +05:30 · 2026-06-27 12:13:43 +03:00
112 changed files with 9117 additions and 1182 deletions
@@ -145,7 +145,7 @@ ENTRYPOINT ["/app/tools.sh"]
 # ==============================================================================
 FROM base AS light

-COPY --from=build /app/full/llama-cli /app/full/llama-completion /app
+COPY --from=build /app/full/llama /app/full/llama-cli /app/full/llama-completion /app

 ENTRYPOINT [ "/app/llama-cli" ]

@@ -156,7 +156,7 @@ FROM base AS server

 ENV LLAMA_ARG_HOST=0.0.0.0

-COPY --from=build /app/full/llama-server /app
+COPY --from=build /app/full/llama /app/full/llama-server /app

 HEALTHCHECK --interval=5m CMD [ "curl", "-f", "http://localhost:8080/health" ]

@@ -104,7 +104,7 @@ ENTRYPOINT ["/app/tools.sh"]
 ### Light, CLI only
 FROM base AS light

-COPY --from=build /app/full/llama-cli /app/full/llama-completion /app
+COPY --from=build /app/full/llama /app/full/llama-cli /app/full/llama-completion /app

 WORKDIR /app

@@ -115,7 +115,7 @@ FROM base AS server

 ENV LLAMA_ARG_HOST=0.0.0.0

-COPY --from=build /app/full/llama-server /app
+COPY --from=build /app/full/llama /app/full/llama-server /app

 WORKDIR /app

@@ -113,7 +113,7 @@ ENTRYPOINT ["/app/tools.sh"]
 ### Light, CLI only
 FROM base AS light

-COPY --from=build /app/full/llama-cli /app/full/llama-completion /app
+COPY --from=build /app/full/llama /app/full/llama-cli /app/full/llama-completion /app

 WORKDIR /app

@@ -124,7 +124,7 @@ FROM base AS server

 ENV LLAMA_ARG_HOST=0.0.0.0

-COPY --from=build /app/full/llama-server /app
+COPY --from=build /app/full/llama /app/full/llama-server /app

 WORKDIR /app

@@ -141,7 +141,7 @@ ENTRYPOINT ["/app/tools.sh"]
 FROM base AS light

 COPY --from=build /app/lib/ /app
-COPY --from=build /app/full/llama-cli /app/full/llama-completion /app
+COPY --from=build /app/full/llama /app/full/llama-cli /app/full/llama-completion /app

 WORKDIR /app

@@ -153,7 +153,7 @@ FROM base AS server
 ENV LLAMA_ARG_HOST=0.0.0.0

 COPY --from=build /app/lib/ /app
-COPY --from=build /app/full/llama-server /app
+COPY --from=build /app/full/llama /app/full/llama-server /app

 WORKDIR /app

@@ -115,7 +115,7 @@ ENTRYPOINT ["/app/tools.sh"]
 ### Light, CLI only
 FROM base AS light

-COPY --from=build /app/full/llama-cli /app/full/llama-completion /app
+COPY --from=build /app/full/llama /app/full/llama-cli /app/full/llama-completion /app

 WORKDIR /app

@@ -126,7 +126,7 @@ FROM base AS server

 ENV LLAMA_ARG_HOST=0.0.0.0

-COPY --from=build /app/full/llama-server /app
+COPY --from=build /app/full/llama /app/full/llama-server /app

 WORKDIR /app

@@ -1,12 +1,12 @@
-ARG OPENVINO_VERSION_MAJOR=2026.2
-ARG OPENVINO_VERSION_FULL=2026.2.0.21903.52ddc073857
+ARG OPENVINO_VERSION_MAJOR=2026.2.1
+ARG OPENVINO_VERSION_FULL=2026.2.1.21919.ede283a88e3
 ARG UBUNTU_VERSION=24.04

 # Intel GPU driver versions. https://github.com/intel/compute-runtime/releases
-ARG IGC_VERSION=v2.34.4
-ARG IGC_VERSION_FULL=2_2.34.4+21428
-ARG COMPUTE_RUNTIME_VERSION=26.18.38308.1
-ARG COMPUTE_RUNTIME_VERSION_FULL=26.18.38308.1-0
+ARG IGC_VERSION=v2.36.3
+ARG IGC_VERSION_FULL=2_2.36.3+21719
+ARG COMPUTE_RUNTIME_VERSION=26.22.38646.4
+ARG COMPUTE_RUNTIME_VERSION_FULL=26.22.38646.4-0
 ARG IGDGMM_VERSION=22.10.0

 # Intel NPU driver versions. https://github.com/intel/linux-npu-driver/releases
@@ -214,7 +214,7 @@ ENTRYPOINT ["/app/tools.sh"]
 ### Light, CLI only
 FROM base AS light

-COPY --from=build /app/full/llama-cli /app/full/llama-completion /app/
+COPY --from=build /app/full/llama /app/full/llama-cli /app/full/llama-completion /app/

 WORKDIR /app

@@ -225,7 +225,7 @@ FROM base AS server

 ENV LLAMA_ARG_HOST=0.0.0.0

-COPY --from=build /app/full/llama-server /app/
+COPY --from=build /app/full/llama /app/full/llama-server /app/

 WORKDIR /app

@@ -127,7 +127,7 @@ ENTRYPOINT ["/app/tools.sh"]
 ### Light, CLI only
 FROM base AS light

-COPY --from=build /app/full/llama-cli /app/full/llama-completion /app
+COPY --from=build /app/full/llama /app/full/llama-cli /app/full/llama-completion /app

 WORKDIR /app

@@ -138,7 +138,7 @@ FROM base AS server

 ENV LLAMA_ARG_HOST=0.0.0.0

-COPY --from=build /app/full/llama-server /app
+COPY --from=build /app/full/llama /app/full/llama-server /app

 WORKDIR /app

@@ -124,7 +124,7 @@ WORKDIR /llama.cpp/bin

 # Copy llama.cpp binaries and libraries
 COPY --from=collector /llama.cpp/bin/*.so /llama.cpp/bin
-COPY --from=collector /llama.cpp/bin/llama-cli /llama.cpp/bin/llama-completion /llama.cpp/bin
+COPY --from=collector /llama.cpp/bin/llama /llama.cpp/bin/llama-cli /llama.cpp/bin/llama-completion /llama.cpp/bin

 ENTRYPOINT [ "/llama.cpp/bin/llama-cli" ]

@@ -138,7 +138,7 @@ WORKDIR /llama.cpp/bin

 # Copy llama.cpp binaries and libraries
 COPY --from=collector /llama.cpp/bin/*.so /llama.cpp/bin
-COPY --from=collector /llama.cpp/bin/llama-server /llama.cpp/bin
+COPY --from=collector /llama.cpp/bin/llama /llama.cpp/bin/llama-server /llama.cpp/bin

 EXPOSE 8080

@@ -107,7 +107,7 @@ ENTRYPOINT ["/app/tools.sh"]
 ### Light, CLI only
 FROM base AS light

-COPY --from=build /app/full/llama-cli /app/full/llama-completion /app
+COPY --from=build /app/full/llama /app/full/llama-cli /app/full/llama-completion /app

 WORKDIR /app

@@ -118,7 +118,7 @@ FROM base AS server

 ENV LLAMA_ARG_HOST=0.0.0.0

-COPY --from=build /app/full/llama-server /app
+COPY --from=build /app/full/llama /app/full/llama-server /app

 WORKDIR /app

@@ -97,7 +97,7 @@ ENTRYPOINT ["/app/tools.sh"]
 ### Light, CLI only
 FROM base AS light

-COPY --from=build /app/full/llama-cli /app/full/llama-completion /app
+COPY --from=build /app/full/llama /app/full/llama-cli /app/full/llama-completion /app

 WORKDIR /app

@@ -108,7 +108,7 @@ FROM base AS server

 ENV LLAMA_ARG_HOST=0.0.0.0

-COPY --from=build /app/full/llama-server /app
+COPY --from=build /app/full/llama /app/full/llama-server /app

 WORKDIR /app

@@ -68,8 +68,8 @@ jobs:

    env:
      # Sync versions in build.yml, build-self-hosted.yml, release.yml, build-cache.yml, .devops/openvino.Dockerfile
-      OPENVINO_VERSION_MAJOR: "2026.2"
-      OPENVINO_VERSION_FULL: "2026.2.0.21903.52ddc073857"
+      OPENVINO_VERSION_MAJOR: "2026.2.1"
+      OPENVINO_VERSION_FULL: "2026.2.1.21919.ede283a88e3"

    steps:
      - name: Clone
@@ -96,8 +96,8 @@ jobs:

    env:
      # Sync versions in build.yml, build-self-hosted.yml, release.yml, build-cache.yml, .devops/openvino.Dockerfile
-      OPENVINO_VERSION_MAJOR: "2026.2"
-      OPENVINO_VERSION_FULL: "2026.2.0.21903.52ddc073857"
+      OPENVINO_VERSION_MAJOR: "2026.2.1"
+      OPENVINO_VERSION_FULL: "2026.2.1.21919.ede283a88e3"

    steps:
      - name: Clone
@@ -39,8 +39,8 @@ jobs:

    env:
      # Sync versions in build-openvino.yml, build-self-hosted.yml, release.yml, build-cache.yml, .devops/openvino.Dockerfile
-      OPENVINO_VERSION_MAJOR: "2026.2"
-      OPENVINO_VERSION_FULL: "2026.2.0.21903.52ddc073857"
+      OPENVINO_VERSION_MAJOR: "2026.2.1"
+      OPENVINO_VERSION_FULL: "2026.2.1.21919.ede283a88e3"

    steps:
      - name: Clone
@@ -96,8 +96,8 @@ jobs:

    env:
      # Sync versions in build-openvino.yml, build-self-hosted.yml, release.yml, build-cache.yml, .devops/openvino.Dockerfile
-      OPENVINO_VERSION_MAJOR: "2026.2"
-      OPENVINO_VERSION_FULL: "2026.2.0.21903.52ddc073857"
+      OPENVINO_VERSION_MAJOR: "2026.2.1"
+      OPENVINO_VERSION_FULL: "2026.2.1.21919.ede283a88e3"

    steps:
      - name: Clone
@@ -266,8 +266,8 @@ jobs:

    env:
      # Sync versions in build.yml, build-self-hosted.yml, release.yml, build-cache.yml, .devops/openvino.Dockerfile
-      OPENVINO_VERSION_MAJOR: "2026.2"
-      OPENVINO_VERSION_FULL: "2026.2.0.21903.52ddc073857"
+      OPENVINO_VERSION_MAJOR: "2026.2.1"
+      OPENVINO_VERSION_FULL: "2026.2.1.21919.ede283a88e3"

    steps:
      - name: Clone
@@ -446,8 +446,8 @@ jobs:

    env:
      # Sync versions in build-openvino.yml, build-self-hosted.yml, release.yml, build-cache.yml, .devops/openvino.Dockerfile
-      OPENVINO_VERSION_MAJOR: "2026.2"
-      OPENVINO_VERSION_FULL: "2026.2.0.21903.52ddc073857"
+      OPENVINO_VERSION_MAJOR: "2026.2.1"
+      OPENVINO_VERSION_FULL: "2026.2.1.21919.ede283a88e3"

    steps:
      - name: Set OpenVINO version output
@@ -506,8 +506,11 @@ jobs:
          cmake -B build/ReleaseOV -G Ninja \
            -DCMAKE_BUILD_TYPE=Release \
            -DGGML_OPENVINO=ON \
-            -DHF_UI_VERSION=${{ needs.get-version.outputs.ui_version }}
-          cmake --build build/ReleaseOV --config Release -j $(nproc)
+            -DCMAKE_INSTALL_RPATH='$ORIGIN' \
+            -DCMAKE_BUILD_WITH_INSTALL_RPATH=ON \
+            -DHF_UI_VERSION=${{ needs.get-version.outputs.ui_version }} \
+            ${{ env.CMAKE_ARGS }}
+          cmake --build build/ReleaseOV --config Release --parallel

      - name: ccache-clear
        uses: ./.github/actions/ccache-clear
@@ -521,8 +524,26 @@ jobs:
      - name: Pack artifacts
        id: pack_artifacts
        run: |
-          cp LICENSE ./build/ReleaseOV/bin/
-          tar -czvf llama-${{ steps.tag.outputs.name }}-bin-ubuntu-openvino-${{ env.OPENVINO_VERSION_MAJOR }}-x64.tar.gz --transform "s,^\.,llama-${{ steps.tag.outputs.name }}," -C ./build/ReleaseOV/bin .
+          dest=./build/ReleaseOV/bin
+          OPENVINO_ROOT=./openvino_toolkit
+          ov_lib="$OPENVINO_ROOT/runtime/lib/intel64"
+
+          # Bundle OpenVINO runtime libs + TBB. Binaries built with RPATH=$ORIGIN
+          # load these siblings without setupvars.sh / LD_LIBRARY_PATH.
+          cp -P "$ov_lib"/libopenvino.so* \
+                "$ov_lib"/libopenvino_c.so* \
+                "$ov_lib"/libopenvino_*_plugin.so \
+                "$ov_lib"/libopenvino_intel_npu_compiler*.so \
+                "$OPENVINO_ROOT"/runtime/3rdparty/tbb/lib/*.so* \
+                "$dest"
+          cp -P /usr/lib/x86_64-linux-gnu/libOpenCL.so.1* "$dest" 2>/dev/null || true
+          cp "$ov_lib"/cache.json "$dest" 2>/dev/null || true
+
+          # OpenVINO licensing
+          cp -r "$OPENVINO_ROOT"/docs/licensing "$dest"/openvino-licensing
+
+          cp LICENSE "$dest"
+          tar -czvf llama-${{ steps.tag.outputs.name }}-bin-ubuntu-openvino-${{ env.OPENVINO_VERSION_MAJOR }}-x64.tar.gz --transform "s,^\.,llama-${{ steps.tag.outputs.name }}," -C "$dest" .

      - name: Upload artifacts
        uses: actions/upload-artifact@v6
@@ -531,6 +552,9 @@ jobs:
          name: llama-bin-ubuntu-openvino-${{ env.OPENVINO_VERSION_MAJOR }}-x64.tar.gz

  windows-openvino:
+    needs: [check-release]
+    if: ${{ needs.check-release.outputs.should_release == 'true' }}
+
    runs-on: windows-2022

    outputs:
@@ -538,8 +562,8 @@ jobs:

    env:
      # Sync versions in build-openvino.yml, build-self-hosted.yml, release.yml, build-cache.yml, .devops/openvino.Dockerfile
-      OPENVINO_VERSION_MAJOR: "2026.2"
-      OPENVINO_VERSION_FULL: "2026.2.0.21903.52ddc073857"
+      OPENVINO_VERSION_MAJOR: "2026.2.1"
+      OPENVINO_VERSION_FULL: "2026.2.1.21919.ede283a88e3"

    steps:
      - name: Set OpenVINO version output
@@ -607,7 +631,9 @@ jobs:
            -A x64 ^
            -DCMAKE_BUILD_TYPE=Release ^
            -DGGML_OPENVINO=ON ^
-            -DCMAKE_TOOLCHAIN_FILE=C:\vcpkg\scripts\buildsystems\vcpkg.cmake
+            -DLLAMA_BUILD_BORINGSSL=ON ^
+            -DCMAKE_TOOLCHAIN_FILE=C:\vcpkg\scripts\buildsystems\vcpkg.cmake ^
+            ${{ env.CMAKE_ARGS }}

          cmake --build build\ReleaseOV --config Release -- /m

@@ -624,8 +650,29 @@ jobs:
        id: pack_artifacts
        shell: powershell
        run: |
-          Copy-Item LICENSE .\build\ReleaseOV\bin\
-          7z a -snl llama-${{ steps.tag.outputs.name }}-bin-win-openvino-${{ env.OPENVINO_VERSION_MAJOR }}-x64.zip .\build\ReleaseOV\bin\*
+          # Locate the extracted OpenVINO toolkit root (same pattern as the Build step).
+          $OPENVINO_ROOT = (Get-ChildItem -Directory openvino_toolkit | Select-Object -First 1).FullName
+          if (-not $OPENVINO_ROOT) {
+            Write-Error "OpenVINO toolkit folder not found under .\openvino_toolkit"
+            exit 1
+          }
+
+          $dest = ".\build\ReleaseOV\bin\Release"
+
+          $ovBin = Join-Path $OPENVINO_ROOT 'runtime\bin\intel64\Release'
+          Copy-Item -Path (Join-Path $ovBin '*.dll')       -Destination $dest -Force
+          Copy-Item -Path (Join-Path $ovBin 'cache.json')  -Destination $dest -Force
+
+          $tbbBin = Join-Path $OPENVINO_ROOT 'runtime\3rdparty\tbb\bin'
+          Copy-Item -Path (Join-Path $tbbBin 'tbb*.dll') -Destination $dest -Force
+
+          # OpenVINO licensing
+          $licensingDest = Join-Path $dest 'openvino-licensing'
+          New-Item -ItemType Directory -Force -Path $licensingDest | Out-Null
+          Copy-Item -Path (Join-Path $OPENVINO_ROOT 'docs\licensing\*') -Destination $licensingDest -Recurse -Force
+
+          Copy-Item LICENSE $dest
+          7z a -snl llama-${{ steps.tag.outputs.name }}-bin-win-openvino-${{ env.OPENVINO_VERSION_MAJOR }}-x64.zip $dest\*

      - name: Upload artifacts
        uses: actions/upload-artifact@v6
@@ -80,7 +80,7 @@ To protect sensitive data from potential leaks or unauthorized access, it is cru
 ### Untrusted environments or networks

 If you can't run your models in a secure and isolated environment or if it must be exposed to an untrusted network, make sure to take the following security precautions:
-* Do not use the RPC backend, [rpc-server](https://github.com/ggml-org/llama.cpp/tree/master/tools/rpc) and [llama-server](https://github.com/ggml-org/llama.cpp/tree/master/tools/server) functionality (see https://github.com/ggml-org/llama.cpp/pull/13061).
+* Do not use the RPC backend, [ggml-rpc-server](https://github.com/ggml-org/llama.cpp/tree/master/tools/rpc) and [llama-server](https://github.com/ggml-org/llama.cpp/tree/master/tools/server) functionality (see https://github.com/ggml-org/llama.cpp/pull/13061).
 * Confirm the hash of any downloaded artifact (e.g. pre-trained model weights) matches a known-good value.
 * Encrypt your data if sending it over the network.

@@ -50,6 +50,7 @@ struct command {
    std::vector<std::string> aliases;
    bool hidden;
    int (*func)(int, char **);
+    bool flags = false; // allow --name
 };

 #ifdef LLAMA_INSTALL_BUILD
@@ -69,9 +70,9 @@ static const command cmds[] = {
    {"fit-params",    "Compute parameters to fit a model in device memory", {},           true,          llama_fit_params   },
    {"quantize",      "Quantize a model",                                   {},           true,          llama_quantize     },
    {"perplexity",    "Compute model perplexity and KL divergence",         {},           true,          llama_perplexity   },
-    {"version",       "Show version",                                       {},           false,         version            },
-    {"licenses",      "Show third-party licenses",                          {"credits"},  false,         licenses           },
-    {"help",          "Show available commands",                            {},           false,         help               },
+    {"version",       "Show version",                                       {},           false,         version,           true },
+    {"licenses",      "Show third-party licenses",                          {"credits"},  false,         licenses,          true },
+    {"help",          "Show available commands",                            {},           false,         help,              true },
 };

 #undef UPDATE_HIDDEN
@@ -108,7 +109,10 @@ static int help(int argc, char ** argv) {
    return 0;
 }

-static bool matches(const std::string & arg, const command & cmd) {
+static bool matches(std::string arg, const command & cmd) {
+    if (cmd.flags && arg.size() > 2 && arg[0] == '-' && arg[1] == '-') {
+        arg.erase(0, 2);
+    }
    if (arg == cmd.name) {
        return true;
    }
@@ -352,6 +352,8 @@ static std::string get_default_local_path(const std::string & url) {

 common_models_handler common_models_handler_init(const common_params & params, llama_example curr_ex) {
    common_download_hf_plan plan;
+    common_download_hf_plan plan_spec;
+    common_download_hf_plan plan_voc;
    common_download_opts opts;

    const bool spec_type_draft_mtp = std::find(params.speculative.types.begin(),
@@ -377,7 +379,15 @@ common_models_handler common_models_handler_init(const common_params & params, l
        plan = common_download_get_hf_plan(params.model, opts);
    }

-    return common_models_handler{plan, opts};
+    if (!params.speculative.draft.mparams.hf_repo.empty()) {
+        plan_spec = common_download_get_hf_plan(params.speculative.draft.mparams, opts);
+    }
+
+    if (!params.vocoder.model.hf_repo.empty()) {
+        plan_voc = common_download_get_hf_plan(params.vocoder.model, opts);
+    }
+
+    return common_models_handler{plan, plan_spec, plan_voc, opts};
 }

 bool common_models_handler_is_preset_repo(const common_models_handler & handler) {
@@ -425,7 +435,9 @@ static std::vector<common_download_task> build_url_tasks(const common_params_mod
 void common_models_handler_apply(common_models_handler & handler, common_params & params, common_download_callback * callback) {
    std::vector<common_download_task> tasks;

-    auto & plan = handler.plan;
+    auto & plan      = handler.plan;
+    auto & plan_spec = handler.plan_spec;
+    auto & plan_voc  = handler.plan_voc;

    auto opts = handler.opts; // copy
    opts.callback = callback;
@@ -455,7 +467,7 @@ void common_models_handler_apply(common_models_handler & handler, common_params
        // the first part is what gets loaded, so point params.model.path at it
        if (!url_tasks.empty()) {
            std::string first_path = url_tasks.front().local_path;
-            url_tasks.front().on_done = [&]() { params.model.path = first_path; };
+            url_tasks.front().on_done = [&, first_path]() { params.model.path = first_path; };
        }
        for (auto & task : url_tasks) {
            tasks.push_back(std::move(task));
@@ -484,19 +496,22 @@ void common_models_handler_apply(common_models_handler & handler, common_params
    }

    // handle hf_plan tasks
-    if (!plan.model_files.empty()) {
-        for (size_t i = 0; i < plan.model_files.size(); ++i) {
-            auto & model_file = plan.model_files[i];
+    auto add_tasks = [&opts, &tasks](const hf_cache::hf_files & model_files, common_params_model & model) {
+        for (size_t i = 0; i < model_files.size(); ++i) {
+            auto & model_file = model_files[i];
            bool is_first = (i == 0);
            tasks.emplace_back(model_file, opts, [&, is_first]() {
                if (is_first) {
                    // only use first part as model path
-                    params.model.path = hf_cache::finalize_file(model_file);
+                    model.path = hf_cache::finalize_file(model_file);
                } else {
                    hf_cache::finalize_file(model_file);
                }
            });
        }
+    };
+    if (!plan.model_files.empty()) {
+        add_tasks(plan.model_files, params.model);
    }
    if (!plan.mmproj.local_path.empty()) {
        tasks.emplace_back(plan.mmproj, opts, [&]() {
@@ -522,9 +537,31 @@ void common_models_handler_apply(common_models_handler & handler, common_params
        });
    }

+    // handle plan_spec (e.g. --spec-draft-hf)
+    if (!plan_spec.model_files.empty()) {
+        add_tasks(plan_spec.model_files, params.speculative.draft.mparams);
+    }
+
+    // handle vocoder plan (e.g. --hf-repo-v)
+    if (!plan_voc.model_files.empty()) {
+        add_tasks(plan_voc.model_files, params.vocoder.model);
+    }
+
    // run all tasks in parallel
    if (!params.offline) {
-        common_download_run_tasks(tasks);
+        // if duplicated files are found, only download once (but still call on_done for each task)
+        std::unordered_map<std::string, common_download_task *> unique_tasks;
+        for (auto & task : tasks) {
+            auto it = unique_tasks.find(task.local_path);
+            if (it == unique_tasks.end()) {
+                unique_tasks[task.local_path] = &task;
+            }
+        }
+        std::vector<common_download_task> unique_tasks_vec;
+        for (auto & pair : unique_tasks) {
+            unique_tasks_vec.push_back(*pair.second);
+        }
+        common_download_run_tasks(unique_tasks_vec);
    }

    // download successful, update params with the downloaded paths
@@ -3434,7 +3471,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        [](common_params & params) {
            params.offline = true;
        }
-    ).set_env("LLAMA_ARG_OFFLINE"));
+    ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_DOWNLOAD}).set_env("LLAMA_ARG_OFFLINE"));
    add_opt(common_arg(
        {"-lv", "--verbosity", "--log-verbosity"}, "N",
        string_format("Set the verbosity threshold. Messages with a higher verbosity will be ignored. Values:\n"
@@ -3711,6 +3748,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        "draft model for speculative decoding (default: unused)",
        [](common_params & params, const std::string & value) {
            params.speculative.draft.mparams.path = value;
+            params.speculative.draft.mparams.hf_file = value; // will be used if --spec-draft-hf is set
        }
    ).set_spec().set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_SPEC_DRAFT_MODEL"));
    add_opt(common_arg(
@@ -133,6 +133,8 @@ void common_params_add_preset_options(std::vector<common_arg> & args);

 struct common_models_handler {
    common_download_hf_plan plan;
+    common_download_hf_plan plan_spec;
+    common_download_hf_plan plan_voc;
    common_download_opts opts;
 };

@@ -225,7 +225,7 @@ bool set_process_priority(enum ggml_sched_priority prio) {
    }

    if (!SetPriorityClass(GetCurrentProcess(), p)) {
-        LOG_WRN("failed to set process priority class %d : (%d)\n", prio, (int) GetLastError());
+        COM_WRN("failed to set process priority class %d : (%d)\n", prio, (int) GetLastError());
        return false;
    }

@@ -251,7 +251,7 @@ bool set_process_priority(enum ggml_sched_priority prio) {
    }

    if (setpriority(PRIO_PROCESS, 0, p) != 0) {
-        LOG_WRN("failed to set process priority %d : %s (%d)\n", prio, strerror(errno), errno);
+        COM_WRN("failed to set process priority %d : %s (%d)\n", prio, strerror(errno), errno);
        return false;
    }
    return true;
@@ -284,14 +284,14 @@ void postprocess_cpu_params(common_cpu_params & cpuparams, const common_cpu_para

    if (n_set && n_set < cpuparams.n_threads) {
        // Not enough set bits, may experience performance issues.
-        LOG_WRN("Not enough set bits in CPU mask (%d) to satisfy requested thread count: %d\n", n_set, cpuparams.n_threads);
+        COM_WRN("Not enough set bits in CPU mask (%d) to satisfy requested thread count: %d\n", n_set, cpuparams.n_threads);
    }
 }

 bool parse_cpu_range(const std::string & range, bool (&boolmask)[GGML_MAX_N_THREADS]) {
    size_t dash_loc = range.find('-');
    if (dash_loc == std::string::npos) {
-        LOG_ERR("Format of CPU range is invalid! Expected [<start>]-[<end>].\n");
+        COM_ERR("%s", "Format of CPU range is invalid! Expected [<start>]-[<end>].\n");
        return false;
    }

@@ -303,7 +303,7 @@ bool parse_cpu_range(const std::string & range, bool (&boolmask)[GGML_MAX_N_THRE
    } else {
        start_i = std::stoull(range.substr(0, dash_loc));
        if (start_i >= GGML_MAX_N_THREADS) {
-            LOG_ERR("Start index out of bounds!\n");
+            COM_ERR("%s", "Start index out of bounds!\n");
            return false;
        }
    }
@@ -313,7 +313,7 @@ bool parse_cpu_range(const std::string & range, bool (&boolmask)[GGML_MAX_N_THRE
    } else {
        end_i = std::stoull(range.substr(dash_loc + 1));
        if (end_i >= GGML_MAX_N_THREADS) {
-            LOG_ERR("End index out of bounds!\n");
+            COM_ERR("%s", "End index out of bounds!\n");
            return false;
        }
    }
@@ -333,7 +333,7 @@ bool parse_cpu_mask(const std::string & mask, bool (&boolmask)[GGML_MAX_N_THREAD
    }

    size_t num_digits = mask.length() - start_i;
-    if (num_digits > 128) num_digits = 128;
+    num_digits = std::min<size_t>(num_digits, 128);

    size_t end_i = num_digits + start_i;

@@ -348,7 +348,7 @@ bool parse_cpu_mask(const std::string & mask, bool (&boolmask)[GGML_MAX_N_THREAD
        } else if (c >= 'A' && c <= 'F') {
            id -= 'A' - 10;
        } else {
-            LOG_ERR("Invalid hex character '%c' at position %d\n", c, int32_t(i));
+            COM_ERR("Invalid hex character '%c' at position %d\n", c, int32_t(i));
            return false;
        }

@@ -379,21 +379,21 @@ void common_params_print_info(const common_params & params, bool print_devices)
 #else
    const char * build_type = " (debug)";
 #endif
-    LOG_TRC("%s: build %d (%s) with %s for %s%s\n", __func__, llama_build_number(), llama_commit(), llama_compiler(), llama_build_target(), build_type);
+    COM_TRC("%s: build %d (%s) with %s for %s%s\n", __func__, llama_build_number(), llama_commit(), llama_compiler(), llama_build_target(), build_type);

-    LOG_INF("log_info: verbosity = %d (adjust with the `-lv N` CLI arg)\n", common_log_get_verbosity_thold());
+    COM_INF("%s: verbosity = %d (adjust with the `-lv N` CLI arg)\n", __func__, common_log_get_verbosity_thold());

    // device enumeration creates a primary context on CUDA backends, skip it when the caller does not own any device
    if (print_devices) {
-        LOG_INF("device_info:\n");
+        COM_TRC("%s", "device_info:\n");
        for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
            auto * dev = ggml_backend_dev_get(i);
            size_t free, total;
            ggml_backend_dev_memory(dev, &free, &total);
-            LOG_INF("  - %-8s: %s (%zu MiB, %zu MiB free)\n", ggml_backend_dev_name(dev), ggml_backend_dev_description(dev), total / 1024 / 1024, free / 1024 / 1024);
+            COM_TRC("  - %-8s: %s (%zu MiB, %zu MiB free)\n", ggml_backend_dev_name(dev), ggml_backend_dev_description(dev), total / 1024 / 1024, free / 1024 / 1024);
        }
    }
-    LOG_INF("%s\n", common_params_get_system_info(params).c_str());
+    COM_TRC("%s\n", common_params_get_system_info(params).c_str());
 }

 std::string common_params_get_system_info(const common_params & params) {
@@ -660,7 +660,7 @@ void string_process_escapes(std::string & input) {
 bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides) {
    const char * sep = strchr(data, '=');
    if (sep == nullptr || sep - data >= 128) {
-        LOG_ERR("%s: malformed KV override '%s'\n", __func__, data);
+        COM_ERR("%s: malformed KV override '%s'\n", __func__, data);
        return false;
    }
    llama_model_kv_override kvo;
@@ -683,20 +683,20 @@ bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_over
        } else if (std::strcmp(sep, "false") == 0) {
            kvo.val_bool = false;
        } else {
-            LOG_ERR("%s: invalid boolean value for KV override '%s'\n", __func__, data);
+            COM_ERR("%s: invalid boolean value for KV override '%s'\n", __func__, data);
            return false;
        }
    } else if (strncmp(sep, "str:", 4) == 0) {
        sep += 4;
        kvo.tag = LLAMA_KV_OVERRIDE_TYPE_STR;
        if (strlen(sep) > 127) {
-            LOG_ERR("%s: malformed KV override '%s', value cannot exceed 127 chars\n", __func__, data);
+            COM_ERR("%s: malformed KV override '%s', value cannot exceed 127 chars\n", __func__, data);
            return false;
        }
        strncpy(kvo.val_str, sep, 127);
        kvo.val_str[127] = '\0';
    } else {
-        LOG_ERR("%s: invalid type for KV override '%s'\n", __func__, data);
+        COM_ERR("%s: invalid type for KV override '%s'\n", __func__, data);
        return false;
    }
    overrides.emplace_back(std::move(kvo));
@@ -1199,8 +1199,8 @@ common_init_result::common_init_result(common_params & params, bool model_only)
    auto cparams = common_context_params_to_llama(params);

    if (params.fit_params) {
-        LOG_INF("%s: fitting params to device memory ...\n", __func__);
-        LOG_INF("%s: (for bugs during this step try to reproduce them with -fit off, or provide --verbose logs if the bug only occurs with -fit on)\n", __func__);
+        COM_TRC("%s", "fitting params to device memory ...\n");
+        COM_TRC("%s", "(for bugs during this step try to reproduce them with -fit off, or provide --verbose logs if the bug only occurs with -fit on)\n");
        common_fit_params(params.model.path.c_str(), &mparams, &cparams,
            params.tensor_split,
            params.tensor_buft_overrides.data(),
@@ -1227,7 +1227,7 @@ common_init_result::common_init_result(common_params & params, bool model_only)
        llama_adapter_lora_ptr lora;
        lora.reset(llama_adapter_lora_init(model, la.path.c_str()));
        if (lora == nullptr) {
-            LOG_ERR("%s: failed to load lora adapter '%s'\n", __func__, la.path.c_str());
+            COM_ERR("failed to load lora adapter '%s'\n", la.path.c_str());
            pimpl->model.reset(model);
            return;
        }
@@ -1246,14 +1246,14 @@ common_init_result::common_init_result(common_params & params, bool model_only)
    common_init_sampler_from_model(model, params.sampling);

    if (params.sampling.ignore_eos && llama_vocab_eos(vocab) == LLAMA_TOKEN_NULL) {
-        LOG_WRN("%s: warning: vocab does not have an EOS token, ignoring --ignore-eos\n", __func__);
+        COM_WRN("%s", "vocab does not have an EOS token, ignoring --ignore-eos\n");
        params.sampling.ignore_eos = false;
    }

    // initialize once
    for (llama_token i = 0; i < llama_vocab_n_tokens(vocab); i++) {
        if (llama_vocab_is_eog(vocab, i)) {
-            LOG_TRC("%s: added %s logit bias = %f\n", __func__, common_token_to_piece(vocab, i).c_str(), -INFINITY);
+            COM_TRC("added %s logit bias = %f\n", common_token_to_piece(vocab, i).c_str(), -INFINITY);
            params.sampling.logit_bias_eog.push_back({i, -INFINITY});
        }
    }
@@ -1291,7 +1291,7 @@ common_init_result::common_init_result(common_params & params, bool model_only)

    llama_context * lctx = llama_init_from_model(model, cparams);
    if (lctx == NULL) {
-        LOG_ERR("%s: failed to create context with model '%s'\n", __func__, params.model.path.c_str());
+        COM_ERR("failed to create context with model '%s'\n", params.model.path.c_str());
        return;
    }

@@ -1328,7 +1328,7 @@ common_init_result_ptr common_init_from_params(common_params & params, bool mode

    llama_model * model = res->model();
    if (model == NULL) {
-        LOG_ERR("%s: failed to load model '%s'\n", __func__, params.model.path.c_str());
+        COM_ERR("failed to load model '%s'\n", params.model.path.c_str());
        return res;
    }

@@ -1338,14 +1338,14 @@ common_init_result_ptr common_init_from_params(common_params & params, bool mode

    llama_context * lctx = res->context();
    if (lctx == NULL) {
-        LOG_ERR("%s: failed to create context with model '%s'\n", __func__, params.model.path.c_str());
+        COM_ERR("failed to create context with model '%s'\n", params.model.path.c_str());
        return res;
    }

    const llama_vocab * vocab = llama_model_get_vocab(model);

    if (params.ctx_shift && !llama_memory_can_shift(llama_get_memory(lctx))) {
-        LOG_WRN("%s: KV cache shifting is not supported for this context, disabling KV cache shifting\n", __func__);
+        COM_WRN("%s", "KV cache shifting is not supported for this context, disabling KV cache shifting\n");
        params.ctx_shift = false;
    }

@@ -1374,7 +1374,7 @@ common_init_result_ptr common_init_from_params(common_params & params, bool mode
        bool ok = true;

        if (llama_vocab_bos(vocab) == LLAMA_TOKEN_NULL) {
-            LOG_WRN("%s: warning: vocab does not have a  BOS token, reranking will not work\n", __func__);
+            COM_WRN("%s", "vocab does not have a  BOS token, reranking will not work\n");
            ok = false;
        }

@@ -1383,10 +1383,10 @@ common_init_result_ptr common_init_from_params(common_params & params, bool mode
        bool has_rerank_prompt = llama_model_chat_template(model, "rerank") != NULL;

        if (!has_eos && !has_sep && !has_rerank_prompt) {
-            LOG_WRN("%s: warning: vocab does not have an EOS token, SEP token, or rerank prompt. Reranking will not work\n", __func__);
+            COM_WRN("%s", "vocab does not have an EOS token, SEP token, or rerank prompt. Reranking will not work\n");
            ok = false;
        } else if (!has_eos) {
-            LOG_WRN("%s: warning: vocab does not have an EOS token, using SEP token as fallback\n", __func__);
+            COM_WRN("%s", "vocab does not have an EOS token, using SEP token as fallback\n");
        }

        if (!ok) {
@@ -1399,7 +1399,7 @@ common_init_result_ptr common_init_from_params(common_params & params, bool mode
    }

    if (params.warmup) {
-        LOG_INF("%s: warming up the model with an empty run - please wait ... (--no-warmup to disable)\n", __func__);
+        COM_TRC("%s", "warming up the model with an empty run - please wait ... (--no-warmup to disable)\n");

        std::vector<llama_token> tmp;
        llama_token bos = llama_vocab_bos(vocab);
@@ -1473,20 +1473,20 @@ common_context_seq_rm_type common_context_can_seq_rm(llama_context * ctx) {

    int ret = llama_decode(ctx, llama_batch_get_one(tmp.data(), tmp.size()));
    if (ret != 0) {
-        LOG_ERR("%s: llama_decode() failed: %d\n", __func__, ret);
+        COM_ERR("llama_decode() failed: %d\n", ret);
        res = COMMON_CONTEXT_SEQ_RM_TYPE_NO;
        goto done;
    }

    if (llama_n_rs_seq(ctx) > 0) {
-        LOG_INF("%s: the context supports bounded partial sequence removal\n", __func__);
+        COM_TRC("%s", "the context supports bounded partial sequence removal\n");
        res = COMMON_CONTEXT_SEQ_RM_TYPE_RS;
        goto done;
    }

    // try to remove the last tokens
    if (!llama_memory_seq_rm(mem, 0, 1, -1)) {
-        LOG_TRC("%s: the context does not support partial sequence removal\n", __func__);
+        COM_TRC("%s", "the context does not support partial sequence removal\n");
        res = COMMON_CONTEXT_SEQ_RM_TYPE_FULL;
        goto done;
    }
@@ -1803,13 +1803,13 @@ static common_control_vector_data common_control_vector_load_one(const common_co
    };
    struct gguf_context * ctx_gguf = gguf_init_from_file(load_info.fname.c_str(), meta_gguf_params);
    if (!ctx_gguf) {
-        LOG_ERR("%s: failed to load control vector file from %s\n", __func__, load_info.fname.c_str());
+        COM_ERR("failed to load control vector file from %s\n", load_info.fname.c_str());
        return result;
    }

    int32_t n_tensors = gguf_get_n_tensors(ctx_gguf);
    if (n_tensors == 0) {
-        LOG_WRN("%s: no direction tensors found in %s\n", __func__, load_info.fname.c_str());
+        COM_WRN("no direction tensors found in %s\n", load_info.fname.c_str());
    }

    for (int i = 0; i < n_tensors; i++) {
@@ -1827,23 +1827,23 @@ static common_control_vector_data common_control_vector_load_one(const common_co
            }
        }
        if (layer_idx < 0) {
-            LOG_ERR("%s: invalid/unparsable direction tensor layer index in %s\n", __func__, load_info.fname.c_str());
+            COM_ERR("invalid/unparsable direction tensor layer index in %s\n", load_info.fname.c_str());
            result.n_embd = -1;
            break;
        } else if (layer_idx == 0) {
-            LOG_ERR("%s: invalid (zero) direction tensor layer index in %s\n", __func__, load_info.fname.c_str());
+            COM_ERR("invalid (zero) direction tensor layer index in %s\n", load_info.fname.c_str());
            result.n_embd = -1;
            break;
        }

        struct ggml_tensor * tensor = ggml_get_tensor(ctx, name.c_str());
        if (tensor->type != GGML_TYPE_F32) {
-            LOG_ERR("%s: invalid (non-F32) direction tensor type in %s\n", __func__, load_info.fname.c_str());
+            COM_ERR("invalid (non-F32) direction tensor type in %s\n", load_info.fname.c_str());
            result.n_embd = -1;
            break;
        }
        if (ggml_n_dims(tensor) != 1) {
-            LOG_ERR("%s: invalid (non-1D) direction tensor shape in %s\n", __func__, load_info.fname.c_str());
+            COM_ERR("invalid (non-1D) direction tensor shape in %s\n", load_info.fname.c_str());
            result.n_embd = -1;
            break;
        }
@@ -1851,7 +1851,7 @@ static common_control_vector_data common_control_vector_load_one(const common_co
        if (result.n_embd == -1) {
            result.n_embd = ggml_nelements(tensor);
        } else if (ggml_nelements(tensor) != result.n_embd) {
-            LOG_ERR("%s: direction tensor in %s does not match previous dimensions\n", __func__, load_info.fname.c_str());
+            COM_ERR("direction tensor in %s does not match previous dimensions\n", load_info.fname.c_str());
            result.n_embd = -1;
            break;
        }
@@ -1868,7 +1868,7 @@ static common_control_vector_data common_control_vector_load_one(const common_co
    }

    if (result.n_embd == -1) {
-        LOG_WRN("%s: skipping %s due to invalid direction tensors\n", __func__, load_info.fname.c_str());
+        COM_WRN("skipping %s due to invalid direction tensors\n", load_info.fname.c_str());
        result.data.clear();
    }

@@ -1889,7 +1889,7 @@ common_control_vector_data common_control_vector_load(const std::vector<common_c
            break;
        }
        if (result.n_embd != -1 && result.n_embd != cur.n_embd) {
-            LOG_ERR("%s: control vectors in %s does not match previous dimensions\n", __func__, info.fname.c_str());
+            COM_ERR("control vectors in %s does not match previous dimensions\n", info.fname.c_str());
            result.n_embd = -1;
            break;
        }
@@ -1905,7 +1905,7 @@ common_control_vector_data common_control_vector_load(const std::vector<common_c
    }

    if (result.n_embd == -1) {
-        LOG_ERR("%s: no valid control vector files passed\n", __func__);
+        COM_ERR("%s", "no valid control vector files passed\n");
        result.data.clear();
    }

@@ -2016,13 +2016,13 @@ bool common_prompt_batch_decode(
        // memory, so we can't just remove the last token from the memory and replay the last token which
        // is the reason for this logic.
        if (llama_decode(ctx, llama_batch_get_one(const_cast<llama_token*>(all_tokens.data() + offset), n_tokens_before_last))) {
-            LOG_ERR("%s : failed to eval\n", __func__);
+            COM_ERR("%s", "failed to eval\n");
            return false;
        }
        n_past += n_tokens_before_last;

        llama_state_save_file(ctx, state_path.data(), all_tokens.data(), all_tokens.size());
-        LOG_INF("saved session before last token to %s, n_new = %zu\n", state_path.data(), all_tokens.size());
+        COM_INF("saved session before last token to %s, n_new = %zu\n", state_path.data(), all_tokens.size());

        llama_token last_token = all_tokens.back();
        llama_batch batch = llama_batch_get_one(&last_token, 1);
@@ -2030,13 +2030,13 @@ bool common_prompt_batch_decode(
        batch.pos = &pos;

        if (llama_decode(ctx, batch)) {
-            LOG_ERR("%s : failed to eval last token\n", __func__);
+            COM_ERR("%s", "failed to eval last token\n");
            return false;
        }
        n_past++;
    } else {
        if (llama_decode(ctx, llama_batch_get_one(const_cast<llama_token*>(all_tokens.data() + offset), n_new))) {
-            LOG_ERR("%s : failed to eval\n", __func__);
+            COM_ERR("%s", "failed to eval\n");
            return false;
        }
        n_past += n_new;
@@ -25,6 +25,13 @@
 #define DIRECTORY_SEPARATOR '/'
 #endif // _WIN32

+#define COM_DBG(fmt, ...) LOG_DBG("cmn  %12.*s: " fmt, 12, __func__, __VA_ARGS__) // prefix: "cmn" tag + caller's __func__ as exactly 12 chars (width 12, precision 12: padded or truncated)
+#define COM_TRC(fmt, ...) LOG_TRC("cmn  %12.*s: " fmt, 12, __func__, __VA_ARGS__) // same prefix, forwarded to LOG_TRC
+#define COM_INF(fmt, ...) LOG_INF("cmn  %12.*s: " fmt, 12, __func__, __VA_ARGS__) // same prefix, forwarded to LOG_INF
+#define COM_WRN(fmt, ...) LOG_WRN("cmn  %12.*s: " fmt, 12, __func__, __VA_ARGS__) // same prefix, forwarded to LOG_WRN
+#define COM_ERR(fmt, ...) LOG_ERR("cmn  %12.*s: " fmt, 12, __func__, __VA_ARGS__) // same prefix, forwarded to LOG_ERR; __VA_ARGS__ follows a comma, so constant messages are passed as ("%s", "msg\n")
+#define COM_CNT(fmt, ...) LOG_CNT(""              fmt,               __VA_ARGS__) // no tag/function prefix: fmt is forwarded unchanged to LOG_CNT
+
 #define die(msg)          do { fputs("error: " msg "\n", stderr);                exit(1); } while (0)
 #define die_fmt(fmt, ...) do { fprintf(stderr, "error: " fmt "\n", __VA_ARGS__); exit(1); } while (0)

@@ -233,7 +233,7 @@ static void common_params_fit_impl(
        sum_projected_used = dmds_full.back().mb.total();
        sum_free           = dmds_full.back().total;
        sum_projected_free = sum_free - sum_projected_used;
-        LOG_INF("%s: projected to use %" PRId64 " MiB of host memory vs. %" PRId64 " MiB of total host memory\n",
+        LOG_TRC("%s: projected to use %" PRId64 " MiB of host memory vs. %" PRId64 " MiB of total host memory\n",
            __func__, sum_projected_used/MiB, sum_free/MiB);
        if (sum_projected_free >= margins[0]) {
            LOG_TRC("%s: will leave %" PRId64 " >= %" PRId64 " MiB of system memory, no changes needed\n",
@@ -65,12 +65,12 @@ static void common_reasoning_budget_accept(struct llama_sampler * smpl, llama_to
            if (ctx->start_matcher.advance(token)) {
                ctx->state = REASONING_BUDGET_COUNTING;
                ctx->remaining = ctx->budget;
-                LOG_INF("reasoning-budget: activated, budget=%d tokens\n", ctx->budget);
+                COM_TRC("activated, budget=%d tokens\n", ctx->budget);

                if (ctx->remaining <= 0) {
                    ctx->state = REASONING_BUDGET_FORCING;
                    ctx->force_pos = 0;
-                    LOG_INF("reasoning-budget: budget=0, forcing immediately\n");
+                    COM_TRC("%s", "budget=0, forcing immediately\n");
                }
            }
            break;
@@ -80,7 +80,7 @@ static void common_reasoning_budget_accept(struct llama_sampler * smpl, llama_to
        {
            if (ctx->end_matcher.advance(token)) {
                ctx->state = REASONING_BUDGET_DONE;
-                LOG_INF("reasoning-budget: deactivated (natural end)\n");
+                COM_TRC("%s", "deactivated (natural end)\n");
                break;
            }

@@ -95,7 +95,7 @@ static void common_reasoning_budget_accept(struct llama_sampler * smpl, llama_to
                    ctx->state = REASONING_BUDGET_FORCING;
                    ctx->force_pos = 0;
                    ctx->end_matcher.reset();
-                    LOG_INF("reasoning-budget: UTF-8 complete, now forcing end sequence\n");
+                    COM_TRC("%s", "UTF-8 complete, now forcing end sequence\n");
                }
            } else if (ctx->state == REASONING_BUDGET_COUNTING) {
                ctx->remaining--;
@@ -104,11 +104,11 @@ static void common_reasoning_budget_accept(struct llama_sampler * smpl, llama_to
                        ctx->state = REASONING_BUDGET_FORCING;
                        ctx->force_pos = 0;
                        ctx->end_matcher.reset();
-                        LOG_INF("reasoning-budget: budget exhausted, forcing end sequence\n");
+                        COM_TRC("%s", "budget exhausted, forcing end sequence\n");
                    } else {
                        ctx->state = REASONING_BUDGET_WAITING_UTF8;
                        ctx->end_matcher.reset();
-                        LOG_INF("reasoning-budget: budget exhausted, waiting for UTF-8 completion\n");
+                        COM_TRC("%s", "budget exhausted, waiting for UTF-8 completion\n");
                    }
                }
            }
@@ -118,7 +118,7 @@ static void common_reasoning_budget_accept(struct llama_sampler * smpl, llama_to
            ctx->force_pos++;
            if (ctx->force_pos >= ctx->forced_tokens.size()) {
                ctx->state = REASONING_BUDGET_DONE;
-                LOG_INF("reasoning-budget: forced sequence complete, done\n");
+                COM_TRC("%s", "forced sequence complete, done\n");
            }
            break;
        case REASONING_BUDGET_DONE:
@@ -128,12 +128,12 @@ static void common_reasoning_budget_accept(struct llama_sampler * smpl, llama_to
                ctx->state = REASONING_BUDGET_COUNTING;
                ctx->remaining = ctx->budget;
                ctx->end_matcher.reset();
-                LOG_INF("reasoning-budget: re-activated on new start tag, budget=%d tokens\n", ctx->budget);
+                COM_TRC("re-activated on new start tag, budget=%d tokens\n", ctx->budget);

                if (ctx->remaining <= 0) {
                    ctx->state = REASONING_BUDGET_FORCING;
                    ctx->force_pos = 0;
-                    LOG_INF("reasoning-budget: budget=0, forcing immediately\n");
+                    COM_TRC("%s", "budget=0, forcing immediately\n");
                }
            }
            break;
@@ -264,7 +264,7 @@ bool common_reasoning_budget_force(struct llama_sampler * smpl) {
    ctx->state = REASONING_BUDGET_FORCING;
    ctx->force_pos = 0;
    ctx->end_matcher.reset();
-    LOG_INF("reasoning-budget: forced into forcing state (manual transition)\n");
+    COM_TRC("%s", "forced into forcing state (manual transition)\n");

    return true;
 }
@@ -18,6 +18,13 @@
 #include <map>
 #include <cinttypes>

+#define SPC_DBG(fmt, ...) LOG_DBG("spec %12.*s: " fmt, 12, __func__, __VA_ARGS__) // prefix: "spec" tag + caller's __func__ as exactly 12 chars (width 12, precision 12: padded or truncated)
+#define SPC_TRC(fmt, ...) LOG_TRC("spec %12.*s: " fmt, 12, __func__, __VA_ARGS__) // same prefix, forwarded to LOG_TRC
+#define SPC_INF(fmt, ...) LOG_INF("spec %12.*s: " fmt, 12, __func__, __VA_ARGS__) // same prefix, forwarded to LOG_INF
+#define SPC_WRN(fmt, ...) LOG_WRN("spec %12.*s: " fmt, 12, __func__, __VA_ARGS__) // same prefix, forwarded to LOG_WRN
+#define SPC_ERR(fmt, ...) LOG_ERR("spec %12.*s: " fmt, 12, __func__, __VA_ARGS__) // same prefix, forwarded to LOG_ERR; __VA_ARGS__ follows a comma, so constant messages are passed as ("%s", "msg\n")
+#define SPC_CNT(fmt, ...) LOG_CNT(""              fmt,               __VA_ARGS__) // no tag/function prefix: fmt is forwarded unchanged to LOG_CNT
+
 #define SPEC_VOCAB_MAX_SIZE_DIFFERENCE  128
 #define SPEC_VOCAB_CHECK_START_TOKEN_ID 5

@@ -60,21 +67,20 @@ static bool common_speculative_are_compatible(
    const llama_vocab * vocab_dft = llama_model_get_vocab(model_dft);

    const auto vocab_type_tgt = llama_vocab_type(vocab_tgt);
-    LOG_DBG("%s: vocab_type tgt: %d\n", __func__, vocab_type_tgt);
+    SPC_DBG("vocab_type tgt: %d\n", vocab_type_tgt);

    const auto vocab_type_dft = llama_vocab_type(vocab_dft);
-    LOG_DBG("%s: vocab_type dft: %d\n", __func__, vocab_type_dft);
+    SPC_DBG("vocab_type dft: %d\n", vocab_type_dft);

    if (vocab_type_tgt != vocab_type_dft) {
-        LOG_WRN("%s: draft model vocab type must match target model to use speculation but "
-                "vocab_type_dft = %d while vocab_type_tgt = %d\n", __func__, vocab_type_dft, vocab_type_tgt);
+        SPC_WRN("draft model vocab type must match target model to use speculation but "
+                "vocab_type_dft = %d while vocab_type_tgt = %d\n", vocab_type_dft, vocab_type_tgt);
        return false;
    }

    if (llama_vocab_get_add_bos(vocab_tgt) != llama_vocab_get_add_bos(vocab_dft) ||
        (llama_vocab_get_add_bos(vocab_tgt) && llama_vocab_bos(vocab_tgt) != llama_vocab_bos(vocab_dft))) {
-        LOG_WRN("%s: draft model bos tokens must match target model to use speculation. add: %d - %d, id: %d - %d)\n",
-                __func__,
+        SPC_WRN("draft model bos tokens must match target model to use speculation. add: %d - %d, id: %d - %d)\n",
                llama_vocab_get_add_bos(vocab_tgt), llama_vocab_get_add_bos(vocab_dft),
                llama_vocab_bos(vocab_tgt), llama_vocab_bos(vocab_dft));
        return false;
@@ -82,8 +88,7 @@ static bool common_speculative_are_compatible(

    if (llama_vocab_get_add_eos(vocab_tgt) != llama_vocab_get_add_eos(vocab_dft) ||
        (llama_vocab_get_add_eos(vocab_tgt) && llama_vocab_eos(vocab_tgt) != llama_vocab_eos(vocab_dft))) {
-        LOG_WRN("%s: draft model eos tokens must match target model to use speculation. add: %d - %d, id: %d - %d)\n",
-                __func__,
+        SPC_WRN("draft model eos tokens must match target model to use speculation. add: %d - %d, id: %d - %d)\n",
                llama_vocab_get_add_eos(vocab_tgt), llama_vocab_get_add_eos(vocab_dft),
                llama_vocab_eos(vocab_tgt), llama_vocab_eos(vocab_dft));
        return false;
@@ -97,8 +102,8 @@ static bool common_speculative_are_compatible(
            : n_vocab_dft - n_vocab_tgt;

        if (vocab_diff > SPEC_VOCAB_MAX_SIZE_DIFFERENCE) {
-            LOG_DBG("%s: draft model vocab must closely match target model to use speculation but ", __func__);
-            LOG_DBG("target vocab size %d does not match draft vocab size %d - difference %d, max allowed %d\n",
+            SPC_DBG("draft model vocab must closely match target model to use speculation but "
+                    "target vocab size %d does not match draft vocab size %d - difference %d, max allowed %d\n",
                    n_vocab_tgt, llama_vocab_n_tokens(vocab_dft), vocab_diff, SPEC_VOCAB_MAX_SIZE_DIFFERENCE);
            return false;
        }
@@ -108,8 +113,8 @@ static bool common_speculative_are_compatible(
            const char * token_text_dft = llama_vocab_get_text(vocab_dft, i);

            if (std::strcmp(token_text_tgt, token_text_dft) != 0) {
-                LOG_DBG("%s: draft model vocab must match target model to use speculation but ", __func__);
-                LOG_DBG("token %d content differs - target '%s', draft '%s'\n", i,
+                SPC_DBG("draft model vocab must match target model to use speculation but "
+                        "token %d content differs - target '%s', draft '%s'\n", i,
                        common_token_to_piece(vocab_tgt, i).c_str(),
                        common_token_to_piece(vocab_dft, i).c_str());
                return false;
@@ -186,9 +191,9 @@ struct common_speculative_impl_draft_simple : public common_speculative_impl {
        auto * ctx_dft = this->params.ctx_dft;
        auto * ctx_tgt = this->params.ctx_tgt;

-        LOG_INF("%s: adding speculative implementation 'draft-simple'\n", __func__);
-        LOG_INF("%s: - n_max=%d, n_min=%d, p_min=%f\n", __func__, this->params.n_max, this->params.n_min, this->params.p_min);
-        LOG_INF("%s: - gpu_layers=%d, cache_k=%s, cache_v=%s, ctx_tgt=%s, ctx_dft=%s, devices=[%s]\n", __func__,
+        SPC_TRC("%s", "adding speculative implementation 'draft-simple'\n");
+        SPC_TRC("- n_max=%d, n_min=%d, p_min=%f\n", this->params.n_max, this->params.n_min, this->params.p_min);
+        SPC_TRC("- gpu_layers=%d, cache_k=%s, cache_v=%s, ctx_tgt=%s, ctx_dft=%s, devices=[%s]\n",
                this->params.n_gpu_layers,
                ggml_type_name(this->params.cache_type_k),
                ggml_type_name(this->params.cache_type_v),
@@ -228,16 +233,16 @@ struct common_speculative_impl_draft_simple : public common_speculative_impl {
        }

        const bool vocab_cmpt = common_speculative_are_compatible(llama_get_model(ctx_tgt), llama_get_model(ctx_dft));
-        LOG_DBG("%s: vocab_cmpt = %d\n", __func__, vocab_cmpt);
+        SPC_DBG("vocab_cmpt = %d\n", vocab_cmpt);

        if (!vocab_cmpt) {
-            LOG_ERR("%s: the target and draft vocabs are not compatible\n", __func__);
+            SPC_ERR("%s", "the target and draft vocabs are not compatible\n");

            throw std::runtime_error("draft model vocab type must match target model to use speculation");
        }

        if (n_seq != llama_n_seq_max(ctx_dft)) {
-            LOG_ERR("%s: n_seq mismatch: %d != %d\n", __func__, n_seq, llama_n_seq_max(ctx_dft));
+            SPC_ERR("n_seq mismatch: %d != %d\n", n_seq, llama_n_seq_max(ctx_dft));

            throw std::runtime_error("the draft model number of sequences is incompatible with the speculative n_seq");
        }
@@ -257,7 +262,7 @@ struct common_speculative_impl_draft_simple : public common_speculative_impl {
        const int ret = llama_decode(ctx_dft, batch);

        if (ret != 0) {
-            LOG_ERR("%s: failed to decode draft batch, ret = %d\n", __func__, ret);
+            SPC_ERR("failed to decode draft batch, ret = %d\n", ret);

            return false;
        }
@@ -290,7 +295,7 @@ struct common_speculative_impl_draft_simple : public common_speculative_impl {

        int ret = llama_decode(ctx_dft, batch);
        if (ret != 0) {
-            LOG_WRN("%s: llama_decode returned %d\n", __func__, ret);
+            SPC_ERR("llama_decode returned %d\n", ret);
            return;
        }

@@ -314,7 +319,7 @@ struct common_speculative_impl_draft_simple : public common_speculative_impl {
                const auto * cur_p = common_sampler_get_candidates(smpl, true);

                for (int k = 0; k < std::min(3, (int) cur_p->size); ++k) {
-                    LOG_DBG(" - seq_id %d, draft candidate %3d, pos %3d: %6d (%8.3f) '%s'\n",
+                    SPC_DBG(" - seq_id %d, draft candidate %3d, pos %3d: %6d (%8.3f) '%s'\n",
                            seq_id, k, i, cur_p->data[k].id, cur_p->data[k].p,
                            common_token_to_piece(ctx_dft, cur_p->data[k].id).c_str());
                }
@@ -354,7 +359,7 @@ struct common_speculative_impl_draft_simple : public common_speculative_impl {
            // evaluate the drafted tokens on the draft model
            ret = llama_decode(ctx_dft, batch);
            if (ret != 0) {
-                LOG_WRN("%s: llama_decode[%d] returned %d\n", __func__, i, ret);
+                SPC_ERR("llama_decode[%d] returned %d\n", i, ret);
                break;
            }

@@ -449,8 +454,8 @@ struct common_speculative_impl_draft_eagle3 : public common_speculative_impl {
        : common_speculative_impl(COMMON_SPECULATIVE_TYPE_DRAFT_EAGLE3, n_seq)
        , params(params.draft)
    {
-        LOG_INF("%s: adding speculative implementation 'draft-eagle3'\n", __func__);
-        LOG_INF("%s: - n_max=%d, n_min=%d, p_min=%f, backend_sampling=%d\n", __func__, params.draft.n_max, params.draft.n_min, params.draft.p_min, (int) params.draft.backend_sampling);
+        SPC_TRC("%s", "adding speculative implementation 'draft-eagle3'\n");
+        SPC_TRC("- n_max=%d, n_min=%d, p_min=%f, backend_sampling=%d\n", params.draft.n_max, params.draft.n_min, params.draft.p_min, (int) params.draft.backend_sampling);

        auto * ctx_tgt = this->params.ctx_tgt;
        auto * ctx_dft = this->params.ctx_dft;
@@ -493,7 +498,7 @@ struct common_speculative_impl_draft_eagle3 : public common_speculative_impl {
                llama_sampler_chain_add(chain, llama_sampler_init_top_k(10));

                if (!llama_set_sampler(ctx_dft, seq_id, chain)) {
-                    LOG_WRN("%s: backend offload failed for seq_id=%d; using CPU sampler\n", __func__, (int) seq_id);
+                    SPC_WRN("backend offload failed for seq_id=%d; using CPU sampler\n", (int) seq_id);
                    llama_sampler_free(chain);
                    chain = nullptr;
                }
@@ -548,9 +553,9 @@ struct common_speculative_impl_draft_eagle3 : public common_speculative_impl {
        auto * ctx_dft = this->params.ctx_dft;
        const llama_pos pos_max = llama_memory_seq_pos_max(llama_get_memory(ctx_dft), seq_id);
        if (pos_max < N - 2) {
-            LOG_WRN("%s: ctx_dft pos_max=%d < N-2=%d — process() did not run on every prefill ubatch. "
+            SPC_WRN("ctx_dft pos_max=%d < N-2=%d — process() did not run on every prefill ubatch. "
                    "Drafts may degrade.\n",
-                    __func__, (int) pos_max, N - 2);
+                    (int) pos_max, N - 2);
        }
    }

@@ -621,8 +626,8 @@ struct common_speculative_impl_draft_eagle3 : public common_speculative_impl {
            };
            const int32_t rc = llama_encode(ctx_dft, enc_batch);
            if (rc != 0) {
-                LOG_ERR("%s: llama_encode(ctx_dft) failed rc=%d (n_tokens=%d, offset=%d)\n",
-                        __func__, rc, (int) n_chunk, (int) i);
+                SPC_ERR("llama_encode(ctx_dft) failed rc=%d (n_tokens=%d, offset=%d)\n",
+                        rc, (int) n_chunk, (int) i);
                return false;
            }

@@ -692,8 +697,8 @@ struct common_speculative_impl_draft_eagle3 : public common_speculative_impl {
        if (batch.n_tokens > 0) {
            const int32_t rc = llama_decode(ctx_dft, batch);
            if (rc != 0) {
-                LOG_ERR("%s: llama_decode(ctx_dft) failed rc=%d (n_tokens=%d, ubatch_pos[0]=%d)\n",
-                        __func__, rc, (int) batch.n_tokens, (int) batch_in.pos[0]);
+                SPC_ERR("llama_decode(ctx_dft) failed rc=%d (n_tokens=%d, ubatch_pos[0]=%d)\n",
+                        rc, (int) batch.n_tokens, (int) batch_in.pos[0]);
                return false;
            }
        }
@@ -744,7 +749,7 @@ struct common_speculative_impl_draft_eagle3 : public common_speculative_impl {

        int ret = llama_decode(ctx_dft, batch);
        if (ret != 0) {
-            LOG_WRN("%s: llama_decode returned %d\n", __func__, ret);
+            SPC_ERR("llama_decode returned %d\n", ret);
            return;
        }

@@ -770,7 +775,7 @@ struct common_speculative_impl_draft_eagle3 : public common_speculative_impl {
                const auto * cur_p = common_sampler_get_candidates(smpl, true);

                for (int k = 0; k < std::min(3, (int) cur_p->size); ++k) {
-                    LOG_DBG(" - seq_id %d, draft candidate %3d, pos %3d: %6d (%8.3f) '%s'\n",
+                    SPC_DBG(" - seq_id %d, draft candidate %3d, pos %3d: %6d (%8.3f) '%s'\n",
                            seq_id, k, i, cur_p->data[k].id, cur_p->data[k].p,
                            common_token_to_piece(ctx_dft, cur_p->data[k].id).c_str());
                }
@@ -809,7 +814,7 @@ struct common_speculative_impl_draft_eagle3 : public common_speculative_impl {

            ret = llama_decode(ctx_dft, batch);
            if (ret != 0) {
-                LOG_WRN("%s: llama_decode[%d] returned %d\n", __func__, i, ret);
+                SPC_ERR("llama_decode[%d] returned %d\n", i, ret);
                break;
            }

@@ -942,9 +947,9 @@ struct common_speculative_impl_draft_mtp : public common_speculative_impl {
                "MTP input row width must match the target h_nextn width");
        n_mtp_layers = std::max(1, (int) llama_model_n_layer_nextn(llama_get_model(ctx_dft)));

-        LOG_INF("%s: adding speculative implementation 'draft-mtp'\n", __func__);
-        LOG_INF("%s: - n_max=%d, n_min=%d, p_min=%.2f, n_embd=%d, backend_sampling=%d\n", __func__, this->params.n_max, this->params.n_min, this->params.p_min, n_embd, (int) this->params.backend_sampling);
-        LOG_INF("%s: - gpu_layers=%d, cache_k=%s, cache_v=%s, ctx_tgt=%s, ctx_dft=%s, devices=[%s]\n", __func__,
+        SPC_TRC("%s", "adding speculative implementation 'draft-mtp'\n");
+        SPC_TRC("- n_max=%d, n_min=%d, p_min=%.2f, n_embd=%d, backend_sampling=%d\n", this->params.n_max, this->params.n_min, this->params.p_min, n_embd, (int) this->params.backend_sampling);
+        SPC_TRC("- gpu_layers=%d, cache_k=%s, cache_v=%s, ctx_tgt=%s, ctx_dft=%s, devices=[%s]\n",
                this->params.n_gpu_layers,
                ggml_type_name(this->params.cache_type_k),
                ggml_type_name(this->params.cache_type_v),
@@ -975,7 +980,7 @@ struct common_speculative_impl_draft_mtp : public common_speculative_impl {
                llama_sampler_chain_add(chain, llama_sampler_init_top_k(10));

                if (!llama_set_sampler(ctx_dft, seq_id, chain)) {
-                    LOG_WRN("%s: backend offload failed for seq_id=%d; using CPU sampler\n", __func__, (int) seq_id);
+                    SPC_WRN("backend offload failed for seq_id=%d; using CPU sampler\n", (int) seq_id);
                    llama_sampler_free(chain);
                    chain = nullptr;
                }
@@ -1038,11 +1043,11 @@ struct common_speculative_impl_draft_mtp : public common_speculative_impl {
        const llama_pos pos_max = llama_memory_seq_pos_max(llama_get_memory(ctx_dft), seq_id);

        if (pos_max < N - 1 && !is_mem_shared) {
-            LOG_WRN("%s: ctx_dft pos_max=%d < N-1=%d - "
+            SPC_WRN("ctx_dft pos_max=%d < N-1=%d - "
                    "process() hook may not have run on every prefill ubatch "
                    "(need_embd / logits=1 on every prompt position?). "
                    "Drafts may degrade.\n",
-                    __func__, (int) pos_max, N - 1);
+                    (int) pos_max, N - 1);
        }
    }

@@ -1128,8 +1133,8 @@ struct common_speculative_impl_draft_mtp : public common_speculative_impl {

                const int32_t rc = llama_decode(ctx_dft, batch);
                if (rc != 0) {
-                    LOG_ERR("%s: llama_decode(ctx_dft) head=%d failed rc=%d (pos=%d)\n",
-                            __func__, head, (int) rc, (int) batch_in.pos[0]);
+                    SPC_ERR("llama_decode(ctx_dft) head=%d failed rc=%d (pos=%d)\n",
+                            head, (int) rc, (int) batch_in.pos[0]);
                    ok = false;
                    break;
                }
@@ -1217,7 +1222,7 @@ struct common_speculative_impl_draft_mtp : public common_speculative_impl {

            int ret = llama_decode(ctx_dft, batch);
            if (ret != 0) {
-                LOG_WRN("%s: llama_decode[%d] returned %d\n", __func__, i, ret);
+                SPC_ERR("llama_decode[%d] returned %d\n", i, ret);
                break;
            }

@@ -1239,7 +1244,7 @@ struct common_speculative_impl_draft_mtp : public common_speculative_impl {
                const auto * cur_p = common_sampler_get_candidates(smpl, true);

                for (int k = 0; k < std::min(3, (int) cur_p->size); ++k) {
-                    LOG_DBG(" - seq_id %d, draft candidate %3d, pos %3d: %6d (%8.3f) '%s'\n",
+                    SPC_DBG(" - seq_id %d, draft candidate %3d, pos %3d: %6d (%8.3f) '%s'\n",
                            seq_id, k, i, cur_p->data[k].id, cur_p->data[k].p,
                            common_token_to_piece(ctx_dft, cur_p->data[k].id).c_str());
                }
@@ -1353,8 +1358,8 @@ struct common_speculative_impl_ngram_simple : public common_speculative_impl {
        , params(params.ngram_simple)
        , config(config)
    {
-        LOG_INF("%s: adding speculative implementation 'ngram-simple'\n", __func__);
-        LOG_INF("%s: - size_n=%d, size_m=%d, min_hits=%d\n", __func__,
+        SPC_TRC("%s", "adding speculative implementation 'ngram-simple'\n");
+        SPC_TRC("- size_n=%d, size_m=%d, min_hits=%d\n",
                this->params.size_n, this->params.size_m, this->params.min_hits);
    }

@@ -1403,8 +1408,8 @@ struct common_speculative_impl_ngram_map_k : public common_speculative_impl {
            this->config.push_back(config);
        }

-        LOG_INF("%s: adding speculative implementation '%s'\n", __func__, common_speculative_type_to_str(this->type).c_str());
-        LOG_INF("%s: - size_key=%d, size_value=%d, key_only=%d, min_hits=%d\n", __func__,
+        SPC_TRC("adding speculative implementation '%s'\n", common_speculative_type_to_str(this->type).c_str());
+        SPC_TRC("- size_key=%d, size_value=%d, key_only=%d, min_hits=%d\n",
                config.size_key, config.size_value, config.key_only, config.min_hits);
    }

@@ -1478,15 +1483,15 @@ struct common_speculative_impl_ngram_mod : public common_speculative_impl {
        , verbose(std::getenv("LLAMA_TRACE") != nullptr) {
        static_assert(sizeof(llama_token) == sizeof(common_ngram_mod::entry_t));

-        LOG_INF("%s: adding speculative implementation 'ngram-mod'\n", __func__);
-        LOG_INF("%s: - n_match=%d, n_max=%d, n_min=%d\n", __func__,
+        SPC_TRC("%s", "adding speculative implementation 'ngram-mod'\n");
+        SPC_TRC("- n_match=%d, n_max=%d, n_min=%d\n",
                this->params.n_match, this->params.n_max, this->params.n_min);
-        LOG_INF("%s: - mod size=%zu (%.3f MB)\n", __func__,
+        SPC_TRC("- mod size=%zu (%.3f MB)\n",
                mod.size(), (float)(mod.size_bytes())/1024/1024);

        if (this->params.n_match < 16) {
-            LOG_WRN("%s: ngram_mod n_match=%d is too small - poor quality is possible, "
-                    "see: https://github.com/ggml-org/llama.cpp/pull/19164\n", __func__, this->params.n_match);
+            SPC_WRN("ngram_mod n_match=%d is too small - poor quality is possible, "
+                    "see: https://github.com/ggml-org/llama.cpp/pull/19164\n", this->params.n_match);
        }

        sinfos.resize(n_seq);
@@ -1510,11 +1515,11 @@ struct common_speculative_impl_ngram_mod : public common_speculative_impl {
        sinfo.i_last = prompt.size() - n;

        const double f = (double)mod.get_used() / (double)mod.size();
-        LOG_INF("%s: ngram_mod occupancy = %zu/%zu (%.2f)\n", __func__, mod.get_used(), mod.size(), f);
+        SPC_TRC("ngram_mod occupancy = %zu/%zu (%.2f)\n", mod.get_used(), mod.size(), f);

        constexpr double f_thold = 0.25;
        if (f > f_thold) {
-            LOG_WRN("%s: ngram_mod occupancy %.2f exceeds threshold (%.2f) - resetting\n", __func__, f, f_thold);
+            SPC_WRN("ngram_mod occupancy %.2f exceeds threshold (%.2f) - resetting\n", f, f_thold);

            mod.reset();
        }
@@ -1608,7 +1613,7 @@ struct common_speculative_impl_ngram_mod : public common_speculative_impl {
                sinfo.n_low++;
                if (sinfo.n_low >= 5) {
                    if (verbose) {
-                        LOG_WRN("%s: low acceptance streak (%d) - resetting ngram_mod\n", __func__, sinfo.n_low);
+                        SPC_TRC("low acceptance streak (%d) - resetting ngram_mod\n", sinfo.n_low);
                    }

                    mod.reset();
@@ -1658,8 +1663,8 @@ struct common_speculative_impl_ngram_cache : public common_speculative_impl {
        , save_dynamic(save_dynamic)
        , save_static(save_static)
    {
-        LOG_INF("%s: adding speculative implementation 'ngram-cache'\n", __func__);
-        LOG_INF("%s: - n_draft=%d, cache_static=%s, cache_dynamic=%s\n", __func__,
+        SPC_TRC("%s", "adding speculative implementation 'ngram-cache'\n");
+        SPC_TRC("- n_draft=%d, cache_static=%s, cache_dynamic=%s\n",
                n_draft,
                path_static.empty() ? "none" : path_static.c_str(),
                path_dynamic.empty() ? "none" : path_dynamic.c_str());
@@ -1674,7 +1679,7 @@ struct common_speculative_impl_ngram_cache : public common_speculative_impl {
                    sinfo.ngram_cache_static = ngram_cache_static;
                }
            } catch (...) {
-                LOG_ERR("failed to open static lookup cache: %s", path_static.c_str());
+                SPC_ERR("failed to open static lookup cache: %s", path_static.c_str());
                GGML_ABORT("Couldn't read static lookup cache");
            }
        }
@@ -1687,7 +1692,7 @@ struct common_speculative_impl_ngram_cache : public common_speculative_impl {
                    sinfo.ngram_cache_dynamic = ngram_cache_dynamic;
                }
            } catch (...) {
-                LOG_ERR("failed to open dynamic lookup cache: %s", path_dynamic.c_str());
+                SPC_ERR("failed to open dynamic lookup cache: %s", path_dynamic.c_str());
                GGML_ABORT("Couldn't read dynamic lookup cache");
            }
        }
@@ -2034,7 +2039,7 @@ common_speculative * common_speculative_init(common_params_speculative & params,
    }

    if (impls.empty()) {
-        LOG_WRN("%s: no implementations specified for speculative decoding\n", __func__);
+        SPC_TRC("%s", "no implementations specified for speculative decoding\n");
        return nullptr;
    }

@@ -2161,13 +2166,13 @@ void common_speculative_draft(common_speculative * spec) {

                if (dp.n_max > 0) {
                    if (!result.empty() && (int) result.size() > dp.n_max) {
-                        LOG_DBG("%s: truncating draft to %d tokens\n", __func__, dp.n_max);
+                        SPC_DBG("truncating draft to %d tokens\n", dp.n_max);
                        result.resize(dp.n_max);
                    }
                }

                if (!result.empty()) {
-                    LOG_DBG("%s: called impl %s, hist size = %zu, call_count = %zu, gen = %zu\n", __func__,
+                    SPC_DBG("called impl %s, hist size = %zu, call_count = %zu, gen = %zu\n",
                            common_speculative_type_to_str(impl.get()->type).c_str(), dp.prompt->size(),
                            impl.get()->n_call_draft, result.size());

@@ -2291,7 +2296,7 @@ void common_speculative_print_stats(const common_speculative * spec) {
            str_stats = ", #mean acc len = " + oss.str() + ", #acc rate/pos = (" + tmp.str() + ")";
        }

-        LOG_INF("statistics %16s: #calls(b,g,a) = %4zu %6zu %6zu, #gen drafts = %6zu, #acc drafts = %5zu, #gen tokens = %6zu, #acc tokens = %5zu%s%s\n",
+        SPC_TRC("statistics %16s: #calls(b,g,a) = %4zu %6zu %6zu, #gen drafts = %6zu, #acc drafts = %5zu, #gen tokens = %6zu, #acc tokens = %5zu%s%s\n",
                common_speculative_type_to_str(impl->type).c_str(),
                impl->n_call_begin, impl->n_call_draft, impl->n_call_accept,
                impl->n_gen_drafts,
@@ -114,7 +114,8 @@ class Mamba2Model(TextModel):
            hparams["text_config"] = hparams["llm_config"]
        super().__init__(dir_model, *args, hparams=hparams, **kwargs)
        self.d_model = self.find_hparam(["hidden_size", "d_model", "dim"])
-        self.d_inner = self.find_hparam(["mamba_d_ssm", "intermediate_size", "d_inner"], optional=True) or 2 * self.d_model
+        self.expand = self.find_hparam(["mamba_expand", "expand"], optional=True) or 2
+        self.d_inner = self.find_hparam(["mamba_d_ssm", "intermediate_size", "d_inner"], optional=True) or self.expand * self.d_model
        self.n_group = self.find_hparam(["n_groups"], optional=True) or 1

    def set_vocab(self):
@@ -144,11 +145,9 @@ class Mamba2Model(TextModel):

        rms_norm_eps = self.find_hparam(["layer_norm_epsilon", "rms_norm_eps"], optional=True) or 1e-5

-        # Fail early for models which don't have a block expansion factor of 2
-        # TODO: does this really matter?
        # skip the assertion for FalconH1 Model
        if self.model_arch != gguf.MODEL_ARCH.FALCON_H1:
-            assert self.d_inner == 2 * self.d_model
+            assert self.d_inner == self.expand * self.d_model
            assert self.d_inner % head_dim == 0

        self.gguf_writer.add_context_length(2**20)  # arbitrary value; for those who use the default
@@ -237,8 +237,8 @@ chmod +x ubuntu-llamacpp-ov-install.sh
 # ============================================
 set -euo pipefail

-OPENVINO_VERSION_MAJOR="2026.2"
-OPENVINO_VERSION_FULL="2026.2.0.21903.52ddc073857"
+OPENVINO_VERSION_MAJOR="2026.2.1"
+OPENVINO_VERSION_FULL="2026.2.1.21919.ede283a88e3"

 SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
 OPENVINO_INSTALL_DIR="/opt/intel/openvino_${OPENVINO_VERSION_MAJOR}"
@@ -334,7 +334,7 @@ echo "  ./build/ReleaseOV/bin/llama-cli -m model.gguf"
 ```

 > [!NOTE]
-> The script pins OpenVINO `2026.2` via the `OPENVINO_VERSION_MAJOR` / `OPENVINO_VERSION_FULL` variables at the top — edit them to track a different release.
+> The script pins OpenVINO `2026.2.1` via the `OPENVINO_VERSION_MAJOR` / `OPENVINO_VERSION_FULL` variables at the top — edit them to track a different release.

 </details>

@@ -364,8 +364,8 @@ REM ============================================
 REM llama.cpp OpenVINO Build Script (Ninja)
 REM ============================================

-set "OPENVINO_VERSION_MAJOR=2026.2"
-set "OPENVINO_VERSION_FULL=2026.2.0.21903.52ddc073857"
+set "OPENVINO_VERSION_MAJOR=2026.2.1"
+set "OPENVINO_VERSION_FULL=2026.2.1.21919.ede283a88e3"

 set "SCRIPT_DIR=%~dp0"
 set "VCPKG_DIR=C:\vcpkg"
@@ -547,7 +547,7 @@ endlocal
 ```

 > [!NOTE]
-> The script pins OpenVINO `2026.2` via the `OPENVINO_VERSION_MAJOR` / `OPENVINO_VERSION_FULL` variables at the top — edit them to track a different release. From any new shell, source the matching `setupvars` script via the junction — `call "C:\Intel\openvino\setupvars.bat"` from `cmd`, or `& "C:\Intel\openvino\setupvars.ps1"` from PowerShell. If `winget` cannot register Visual Studio Build Tools on first run, install them once manually and re-run the script from an elevated **Developer Command Prompt for VS 2022**.
+> The script pins OpenVINO `2026.2.1` via the `OPENVINO_VERSION_MAJOR` / `OPENVINO_VERSION_FULL` variables at the top — edit them to track a different release. From any new shell, source the matching `setupvars` script via the junction — `call "C:\Intel\openvino\setupvars.bat"` from `cmd`, or `& "C:\Intel\openvino\setupvars.ps1"` from PowerShell. If `winget` cannot register Visual Studio Build Tools on first run, install them once manually and re-run the script from an elevated **Developer Command Prompt for VS 2022**.

 </details>

@@ -5,7 +5,7 @@ project("ggml" C CXX ASM)
 ### GGML Version
 set(GGML_VERSION_MAJOR 0)
 set(GGML_VERSION_MINOR 15)
-set(GGML_VERSION_PATCH 2)
+set(GGML_VERSION_PATCH 3)
 set(GGML_VERSION_BASE "${GGML_VERSION_MAJOR}.${GGML_VERSION_MINOR}.${GGML_VERSION_PATCH}")

 list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake/")
@@ -1551,6 +1551,8 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
        int split_backend_id = split->backend_id;
        ggml_backend_t split_backend = sched->backends[split_backend_id];

+        ggml_backend_synchronize(split_backend);
+
        // copy the input tensors to the split backend
        for (int input_id = 0; input_id < split->n_inputs; input_id++) {
            ggml_backend_t input_backend = ggml_backend_sched_get_tensor_backend(sched, split->inputs[input_id]);
@@ -1561,15 +1563,15 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
                // inputs from the user must be copied immediately to prevent the user overwriting the data before the copy is done
                if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
                    ggml_backend_event_synchronize(sched->events[split_backend_id][sched->cur_copy]);
-                } else {
+                } else if (!split_backend->iface.cpy_tensor_async) {
                    ggml_backend_synchronize(split_backend);
                }
-                ggml_backend_tensor_copy(input, input_cpy);
+                ggml_backend_tensor_copy_async(input_backend, split_backend, input, input_cpy);
            } else {
                // wait for the split backend to finish using the input before overwriting it
                if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
                    ggml_backend_event_wait(split_backend, sched->events[split_backend_id][sched->cur_copy]);
-                } else {
+                } else if (!split_backend->iface.cpy_tensor_async) {
                    ggml_backend_synchronize(split_backend);
                }

@@ -1674,6 +1676,8 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
            }
        }

+        ggml_backend_synchronize(split_backend);
+
        if (!sched->callback_eval) {
            enum ggml_status ec = ggml_backend_graph_compute_async(split_backend, &split->graph);
            if (ec != GGML_STATUS_SUCCESS) {
@@ -75,12 +75,12 @@ void ggml_vec_dot_f32(int n, float * GGML_RESTRICT s, size_t bs, const float * G
            ay1 = GGML_F32_VEC_LOAD(y + i);
            sum1 = GGML_F32_VEC_FMA(sum1, ax1, ay1);
        }
-        // maximum number of leftover elements will be less that ggml_f32_epr. Apply predicated svmad on available elements only
+        // maximum number of leftover elements will be less than ggml_f32_epr. Apply predicated svmla on available elements only
        if (np2 < n) {
            svbool_t pg = svwhilelt_b32(np2, n);
            ax1 = svld1_f32(pg, x + np2);
            ay1 = svld1_f32(pg, y + np2);
-            sum1 = svmad_f32_m(pg, ax1, ay1, sum1);
+            sum1 = svmla_f32_m(pg, sum1, ax1, ay1);
        }
        // reduce sum1,sum2 to sum1
        GGML_F32_VEC_REDUCE(sumf, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8);
@@ -386,6 +386,46 @@ static void ggml_cpy_f32_iq4_nl_cuda(
        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
 }

+// Check whether a same-type, same-shape copy reduces to a 2D strided copy (height rows
+// of width contiguous bytes), so it can use cudaMemcpy2DAsync instead of the scalar kernel.
+//
+// Outputs (only meaningful when true is returned):
+//   width  - bytes per row, i.e. the size of the contiguous prefix block
+//   height - number of rows, i.e. ne[d] of the first dim d that breaks contiguity
+//   spitch - byte stride between consecutive rows of src0
+//   dpitch - byte stride between consecutive rows of src1
+static bool ggml_cuda_cpy_as_memcpy_2d(const ggml_tensor * src0, const ggml_tensor * src1,
+        size_t & width, size_t & height, size_t & spitch, size_t & dpitch) {
+    // require matching shape: a reshaped copy maps elements by flat order, which the
+    // prefix walk below does not handle
+    if (src0->type != src1->type || !ggml_are_same_shape(src0, src1)) {
+        return false;
+    }
+
+    // grow the contiguous prefix block shared by both tensors
+    // nb[0] is the byte size of one type block, which packs ggml_blck_size() elements, so
+    // dim 0 spans ne[0]/blck_size blocks; multiplying by ne[0] directly would overstate the
+    // row width for block-quantized types and let the 2D copy run past the end of each row
+    const int64_t blck_size = ggml_blck_size(src0->type);
+    size_t block_nb = ggml_element_size(src0);
+    int d = 0;
+    for (; d < GGML_MAX_DIMS; ++d) {
+        if (src0->nb[d] != block_nb || src1->nb[d] != block_nb) {
+            break;
+        }
+        block_nb *= d == 0 ? src0->ne[0] / blck_size : src0->ne[d];
+    }
+
+    // d == 0: nothing contiguous; d == GGML_MAX_DIMS: fully contiguous (handled by memcpy)
+    if (d == 0 || d == GGML_MAX_DIMS) {
+        return false;
+    }
+
+    // dim d carries the rows; everything above it must be a single element
+    for (int i = d + 1; i < GGML_MAX_DIMS; ++i) {
+        if (src0->ne[i] != 1) {
+            return false;
+        }
+    }
+
+    width  = block_nb;
+    height = src0->ne[d];
+    spitch = src0->nb[d];
+    dpitch = src1->nb[d];
+
+    // a pitch smaller than the row width would make consecutive rows overlap
+    return spitch >= width && dpitch >= width;
+}
+
 void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, ggml_tensor * src1) {
    const int64_t ne = ggml_nelements(src0);
    GGML_ASSERT(ne == ggml_nelements(src1));
@@ -421,6 +461,8 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
    const bool can_be_transposed = nb01 == (int64_t)ggml_element_size(src0) &&
        src0->ne[3] == 1 && nb02 == ne00 * ne01 * (int64_t)ggml_element_size(src0);

+    size_t mc_width = 0, mc_height = 0, mc_spitch = 0, mc_dpitch = 0;
+
    if (src0->type == src1->type && contiguous_srcs) {
        GGML_ASSERT(ggml_nbytes(src0) == ggml_nbytes(src1));
 #if defined(GGML_USE_MUSA) && defined(GGML_MUSA_MUDNN_COPY)
@@ -431,6 +473,9 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
        {
            CUDA_CHECK(cudaMemcpyAsync(src1_ddc, src0_ddc, ggml_nbytes(src0), cudaMemcpyDeviceToDevice, main_stream));
        }
+    } else if (ggml_cuda_cpy_as_memcpy_2d(src0, src1, mc_width, mc_height, mc_spitch, mc_dpitch)) {
+        CUDA_CHECK(cudaMemcpy2DAsync(src1_ddc, mc_dpitch, src0_ddc, mc_spitch,
+                                     mc_width, mc_height, cudaMemcpyDeviceToDevice, main_stream));
    } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) {
        if (can_be_transposed) {
            ggml_cpy_scalar_cuda<float, float, true>
@@ -3192,11 +3192,24 @@ static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_src, ggml_
    ggml_backend_buffer_t buf_src = src->view_src ? src->view_src->buffer : src->buffer;
    ggml_backend_buffer_t buf_dst = dst->view_src ? dst->view_src->buffer : dst->buffer;

-    if (!ggml_backend_is_cuda(backend_src) || !ggml_backend_is_cuda(backend_dst)) {
+    // Enables async copies from CPU to CUDA, instead of only CUDA-to-CUDA
+    // Excluding this path for HIP and MUSA as a precaution.
+    // According to the summary in https://github.com/ggml-org/llama.cpp/pull/20793#issuecomment-4275794315, this change is not beneficial for HIP anyway.
+    // Additionally, there is a lot of anecdotal evidence that HIP/MUSA stream behavior might not always match CUDA behavior 1:1.
+    // e.g. https://github.com/ROCm/rocm-systems/issues/5109
+    // It thus makes sense to exclude this path for HIP and MUSA. This PR was not aimed at these backends; the majority of testing happened on CUDA.
+    // This can be revisited in the future if enabling copy_from_host benefits hip/MUSA, and if the PR author can extensively test on these backends.
+#if defined(GGML_USE_HIP) || defined(GGML_USE_MUSA)
+    const bool copy_from_host = false;
+#else
+    const bool copy_from_host = ggml_backend_buffer_is_host(buf_src) && ggml_backend_dev_type(backend_src->device) == GGML_BACKEND_DEVICE_TYPE_CPU;
+#endif
+
+    if (!(copy_from_host || ggml_backend_is_cuda(backend_src)) || !ggml_backend_is_cuda(backend_dst)) {
        return false;
    }

-    if (!ggml_backend_buffer_is_cuda(buf_src) || !ggml_backend_buffer_is_cuda(buf_dst)) {
+    if (!(copy_from_host || ggml_backend_buffer_is_cuda(buf_src)) || !ggml_backend_buffer_is_cuda(buf_dst)) {
        return false;
    }

@@ -3207,14 +3220,17 @@ static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_src, ggml_
    ggml_backend_cuda_buffer_context * buf_ctx_src = (ggml_backend_cuda_buffer_context *) buf_src->context;
    ggml_backend_cuda_buffer_context * buf_ctx_dst = (ggml_backend_cuda_buffer_context *) buf_dst->context;

-    if (cuda_ctx_src->device != buf_ctx_src->device || cuda_ctx_dst->device != buf_ctx_dst->device) {
+    if ((copy_from_host && cuda_ctx_dst->device != buf_ctx_dst->device) ||
+        !copy_from_host && (cuda_ctx_src->device != buf_ctx_src->device || cuda_ctx_dst->device != buf_ctx_dst->device)) {
 #ifndef NDEBUG
        GGML_LOG_DEBUG("%s: backend and buffer devices do not match\n", __func__);
 #endif // NDEBUG
        return false;
    }

-    if (backend_src != backend_dst) {
+    if (copy_from_host) {
+        CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyHostToDevice, cuda_ctx_dst->stream()));
+    } else if (backend_src != backend_dst) {
        // copy on src stream
        if (cuda_ctx_src->device == cuda_ctx_dst->device) {
            CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyDeviceToDevice, cuda_ctx_src->stream()));
@@ -2,6 +2,28 @@

 #include <cstdint>

+// Fills three pointer arrays with one entry per (i2, i3) pair, stored at index i3*ne2 + i2.
+// Each entry is the address of the src0 / src1 / dst slice for that pair:
+//   - src0 is indexed with i2/dps2 and i3/dps3, so several pairs can share one src0 slice
+//   - src1 and dst are indexed with i2 and i3 directly
+// All strides (s02, s03, s12, s13, s2, s3) are applied to float pointers, i.e. they count floats.
+// One thread handles one pair; x covers i2, y covers i3.
+static __global__ void k_compute_out_prod_ptrs(
+        const float * src0_d, const float * src1_d, float * dst_d,
+        const float ** ptrs_a, const float ** ptrs_b, float ** ptrs_c,
+        const int64_t ne2, const int64_t ne3,
+        const int64_t dps2, const int64_t dps3,
+        const size_t s02, const size_t s03,
+        const size_t s12, const size_t s13,
+        const size_t s2,  const size_t s3) {
+    const int64_t i2 = blockIdx.x*blockDim.x + threadIdx.x;
+    const int64_t i3 = blockIdx.y*blockDim.y + threadIdx.y;
+
+    // threads that land outside the (ne2, ne3) range have nothing to write
+    const bool in_range = i2 < ne2 && i3 < ne3;
+    if (in_range) {
+        const int64_t slot = i3*ne2 + i2;
+
+        const float * a = src0_d + (i3/dps3)*s03 + (i2/dps2)*s02;
+        const float * b = src1_d +  i3      *s13 +  i2      *s12;
+        float       * c = dst_d  +  i3      *s3  +  i2      *s2;
+
+        ptrs_a[slot] = a;
+        ptrs_b[slot] = b;
+        ptrs_c[slot] = c;
+    }
+}
+
 void ggml_cuda_out_prod(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    const ggml_tensor * src0 = dst->src[0];
    const ggml_tensor * src1 = dst->src[1];
@@ -67,18 +89,39 @@ void ggml_cuda_out_prod(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
                        &beta,  dst_d  +  i3     *s3,  ldc, s2,
                        batch_count));
        }
+    } else if (ne2 > 1 || ne3 > 1) {
+        // dps2 > 1 (src0 broadcast along dim 2 with non-uniform stride) or multiple GEMMs
+        // along dim 3: compute per-GEMM pointers on the device and use a single batched GEMM.
+        GGML_ASSERT(ne3 > 0);
+        GGML_ASSERT(ne2 <= (int64_t) std::numeric_limits<int>::max() / ne3);
+        const int batch_count = (int) (ne2 * ne3);
+
+        ggml_cuda_pool_alloc<const float *> ptrs_a(ctx.pool(), batch_count);
+        ggml_cuda_pool_alloc<const float *> ptrs_b(ctx.pool(), batch_count);
+        ggml_cuda_pool_alloc<      float *> ptrs_c(ctx.pool(), batch_count);
+
+        const dim3 block_dims(16, 16);
+        const dim3 grid_dims((ne2 + block_dims.x - 1)/block_dims.x, (ne3 + block_dims.y - 1)/block_dims.y);
+        k_compute_out_prod_ptrs<<<grid_dims, block_dims, 0, stream>>>(
+            src0_d, src1_d, dst_d,
+            ptrs_a.get(), ptrs_b.get(), ptrs_c.get(),
+            ne2, ne3, dps2, dps3, s02, s03, s12, s13, s2, s3);
+        CUDA_CHECK(cudaGetLastError());
+
+        CUBLAS_CHECK(
+            cublasSgemmBatched(handle, CUBLAS_OP_N, src1_cublas_op,
+                    ne0, ne1, ne01,
+                    &alpha, ptrs_a.get(), lda,
+                            ptrs_b.get(), ldb,
+                    &beta,  ptrs_c.get(), ldc,
+                    batch_count));
    } else {
-        // Fallback: ne2 == 1 (no batching benefit) or dps2 > 1 (src0 broadcast along dim 2
-        // with non-uniform stride; would need cublasSgemmBatched with pointer arrays).
-        for (int64_t i3 = 0; i3 < ne3; ++i3) {
-            for (int64_t i2 = 0; i2 < ne2; ++i2) {
-                CUBLAS_CHECK(
-                    cublasSgemm(handle, CUBLAS_OP_N, src1_cublas_op,
-                            ne0, ne1, ne01,
-                            &alpha, src0_d + (i3/dps3)*s03 + (i2/dps2)*s02, lda,
-                                    src1_d +  i3      *s13 +  i2      *s12, ldb,
-                            &beta,  dst_d  +  i3      *s3  +  i2      *s2,  ldc));
-            }
-        }
+        // ne2 == 1 && ne3 == 1: single GEMM
+        CUBLAS_CHECK(
+            cublasSgemm(handle, CUBLAS_OP_N, src1_cublas_op,
+                    ne0, ne1, ne01,
+                    &alpha, src0_d, lda,
+                            src1_d, ldb,
+                    &beta,  dst_d,  ldc));
    }
 }
@@ -48,6 +48,7 @@
 #define cublasSetMathMode(handle, mode) CUBLAS_STATUS_SUCCESS
 #define cublasSetStream hipblasSetStream
 #define cublasSgemm hipblasSgemm
+#define cublasSgemmBatched hipblasSgemmBatched
 #define cublasSgemmStridedBatched hipblasSgemmStridedBatched
 #define cublasStatus_t hipblasStatus_t
 #define cublasOperation_t hipblasOperation_t
@@ -32,6 +32,7 @@
 #define cublasSetMathMode mublasSetMathMode
 #define cublasSetStream mublasSetStream
 #define cublasSgemm mublasSgemm
+#define cublasSgemmBatched mublasSgemmBatched
 #define cublasSgemmStridedBatched mublasSgemmStridedBatched
 #define cublasStatus_t mublasStatus_t
 #define cublasOperation_t mublasOperation_t
@@ -192,7 +192,10 @@ set(GGML_OPENCL_KERNELS
    mul_mm_f16_f32_kq_kqv
    conv2d
    conv2d_f16_f32
+    flash_attn_pre_f16
    flash_attn_f32_f16
+    flash_attn_f32_q8_0
+    flash_attn_f32_q4_0
    flash_attn_f16
    flash_attn_f32
 )
@@ -0,0 +1,91 @@
+#pragma once
+
+// Flash-attention per-(dk,dv) tile tuning for the Adreno OpenCL backend.
+// Isolated from ggml-opencl.cpp so the tuning numbers are easy to find and
+// edit; the FA dispatch and kernel-compile logic stay in the main file.
+// This header is a file section — it is #included exactly once, at the point
+// in ggml-opencl.cpp where the ggml logging macros are already in scope.
+
+// Flash-attention config for one (dk, dv) pair; shared by dispatch and supports_op.
+// Keep the member order: the tuning tables fill these fields positionally.
+struct ggml_opencl_fa_dim {
+    int dk;                   // lookup key, together with dv
+    int dv;                   // lookup key, together with dk
+    int bm;                   // reported as "bm" in the tune log; presumably the BLOCK_M tile size - confirm
+    int bn;                   // reported as "bn" in the tune log; presumably the BLOCK_N tile size - confirm
+    int n_split;              // split count for the split variant; exact meaning is set by dispatch - confirm there
+    int nkv_split_threshold;  // split variant fires when n_kv >= this value (0 -> always split)
+};
+
+// Default tuning covers Adreno 7xx/8xx mobile and X1-series laptop GPUs.
+// Split variant fires when n_kv >= threshold (threshold=0 -> always split).
+// Columns: dk, dv, bm, bn, n_split, nkv_split_threshold.
+static const ggml_opencl_fa_dim g_fa_dims_adreno_default[] = {
+    { 40,  40, 64, 32,  1,  0},
+    { 64,  64, 64, 32,  2, 64},
+    { 80,  80, 64, 32,  2, 64},
+    { 96,  96, 64, 32,  2, 64},
+    {112, 112, 64, 32,  2, 64},
+    {128, 128, 64, 32,  2, 64},
+    {192, 128, 16, 16,  1,  0},
+    {192, 192, 16, 16,  1,  0},
+    {256, 256, 16, 16, 16,  0},
+};
+
+// (pointer, length) view over an array of FA configs. It does not own the array.
+// begin()/end() expose the range as raw pointers so the table can be iterated directly.
+struct ggml_opencl_fa_dim_table {
+    const ggml_opencl_fa_dim * data;   // first entry of the viewed array
+    size_t                     count;  // number of entries reachable from data
+
+    const ggml_opencl_fa_dim * begin() const {
+        return data;
+    }
+
+    const ggml_opencl_fa_dim * end() const {
+        return begin() + count;
+    }
+};
+
+// Mutable copy of the active table; GGML_OPENCL_FA_TUNE patches entries here
+// at backend init without touching the const source table.
+// Sized to hold exactly as many entries as g_fa_dims_adreno_default.
+static ggml_opencl_fa_dim g_fa_dims_runtime[
+    sizeof(g_fa_dims_adreno_default) / sizeof(g_fa_dims_adreno_default[0])];
+
+// Table currently in effect. It starts out viewing the const defaults;
+// ggml_cl_init_fa_dims_table() repoints it at g_fa_dims_runtime.
+static ggml_opencl_fa_dim_table g_opencl_fa_dims = {
+    g_fa_dims_adreno_default,
+    sizeof(g_fa_dims_adreno_default) / sizeof(g_fa_dims_adreno_default[0]),
+};
+
+// GGML_OPENCL_FA_TUNE=dk:dv:bm:bn:nsplit:thr[,…] — patches matching entries
+// in the active table at backend init, before the first FA kernel compiles.
+// Entries are comma-separated and handled independently. An entry is skipped
+// with a warning when it is malformed, when bm, bn or nsplit is not positive,
+// or when no table row has its (dk,dv) pair.
+// The rows are patched in g_fa_dims_runtime, so this relies on g_opencl_fa_dims
+// already viewing that buffer (count is taken from g_opencl_fa_dims).
+static void ggml_opencl_fa_apply_env_overrides() {
+    const char * e = std::getenv("GGML_OPENCL_FA_TUNE");
+    if (!e || !e[0]) {
+        return;
+    }
+
+    std::string s = e;
+    size_t pos = 0;
+    while (pos < s.size()) {
+        size_t comma = s.find(',', pos);
+        std::string entry = s.substr(pos, comma == std::string::npos ? std::string::npos : comma - pos);
+        int dk, dv, bm, bn, nsplit, thr;
+        if (std::sscanf(entry.c_str(), "%d:%d:%d:%d:%d:%d", &dk, &dv, &bm, &bn, &nsplit, &thr) != 6) {
+            GGML_LOG_WARN("ggml_opencl: FA tune override entry malformed: '%s'\n", entry.c_str());
+        } else if (bm <= 0 || bn <= 0 || nsplit <= 0) {
+            // tile sizes and split counts come straight from the environment; a zero or
+            // negative value must not reach kernel compilation or dispatch
+            GGML_LOG_WARN("ggml_opencl: FA tune override DK=%d DV=%d ignored (bm=%d bn=%d n_split=%d must be positive)\n",
+                          dk, dv, bm, bn, nsplit);
+        } else {
+            bool patched = false;
+            for (size_t i = 0; i < g_opencl_fa_dims.count; ++i) {
+                ggml_opencl_fa_dim & d = g_fa_dims_runtime[i];
+                if (d.dk == dk && d.dv == dv) {
+                    d.bm = bm; d.bn = bn; d.n_split = nsplit; d.nkv_split_threshold = thr;
+                    GGML_LOG_INFO("ggml_opencl: FA tune override DK=%d DV=%d -> bm=%d bn=%d n_split=%d thr=%d\n",
+                                  dk, dv, bm, bn, nsplit, thr);
+                    patched = true;
+                    break;
+                }
+            }
+            if (!patched) {
+                GGML_LOG_WARN("ggml_opencl: FA tune override DK=%d DV=%d ignored (no matching dim)\n", dk, dv);
+            }
+        }
+        // the last entry has no trailing comma
+        if (comma == std::string::npos) break;
+        pos = comma + 1;
+    }
+}
+
+// Seeds the mutable runtime buffer from the default table, makes it the active
+// table, then applies any GGML_OPENCL_FA_TUNE overrides on top. A per-generation
+// table can be added here once it has been tuned on hardware.
+static void ggml_cl_init_fa_dims_table() {
+    const size_t n_dims = sizeof(g_fa_dims_adreno_default) / sizeof(g_fa_dims_adreno_default[0]);
+
+    const ggml_opencl_fa_dim * from = g_fa_dims_adreno_default;
+    ggml_opencl_fa_dim       * to   = g_fa_dims_runtime;
+    for (size_t left = n_dims; left > 0; --left) {
+        *to++ = *from++;
+    }
+
+    g_opencl_fa_dims.data  = g_fa_dims_runtime;
+    g_opencl_fa_dims.count = n_dims;
+
+    // overrides must run after the active table points at the runtime buffer
+    ggml_opencl_fa_apply_env_overrides();
+}
@@ -1582,6 +1582,158 @@ kernel void kernel_restore_block_q8_0(
    }
 }

+// View-aware AoS q8_0 -> f32 dequant (f32/f32 FA path).
+// One work-item converts one block of QK8_0 values:
+//   global id 0 -> block index within the row, id 1 -> row i1, id 2 -> i3*ne2 + i2.
+// src_offset and src_nb1..3 are byte offsets/strides into src; blocks inside a row are
+// read back to back. The output is written densely in (i3, i2, i1, block) order.
+kernel void kernel_dequant_q8_0_f32_view_aos(
+    global char * src,
+    ulong         src_offset,
+    ulong         src_nb1,
+    ulong         src_nb2,
+    ulong         src_nb3,
+    int           nblk0,
+    int           ne1,
+    int           ne2,
+    int           ne3,
+    global float * dst
+) {
+    const int ib  = get_global_id(0);
+    const int row = get_global_id(1);
+    const int bat = get_global_id(2);
+
+    // drop work-items that fall outside the tensor extent
+    if (ib >= nblk0 || row >= ne1) {
+        return;
+    }
+
+    const int i3 = bat / ne2;
+    const int i2 = bat % ne2;
+    if (i3 >= ne3) {
+        return;
+    }
+
+    // a block is a 2-byte half scale followed by QK8_0 signed 8-bit quants
+    const ulong src_byte = src_offset + (ulong)i3*src_nb3 + (ulong)i2*src_nb2 + (ulong)row*src_nb1 + (ulong)ib * (2 + QK8_0);
+    global char * blk    = src + src_byte;
+    const float   scale  = vload_half(0, (global half *)blk);
+    global char * quants = blk + 2;
+
+    const ulong dst_blk = ((ulong)i3 * ne2 * ne1 + (ulong)i2 * ne1 + (ulong)row) * nblk0 + ib;
+    global float * out  = dst + dst_blk * QK8_0;
+
+    for (int k = 0; k < QK8_0; ++k) {
+        out[k] = scale * (float)quants[k];
+    }
+}
+
+// View-aware AoS q8_0 -> f16 dequant. Rows tight, batch strides may be gapped.
+// One work-item converts one block of QK8_0 values:
+//   global id 0 -> block index within the row, id 1 -> row i1, id 2 -> i3*ne2 + i2.
+// src_offset and src_nb1..3 are byte offsets/strides into src. The product is computed
+// in float and converted to half on store; the output is dense in (i3, i2, i1, block) order.
+kernel void kernel_dequant_q8_0_f16_view_aos(
+    global char * src,
+    ulong         src_offset,
+    ulong         src_nb1,
+    ulong         src_nb2,
+    ulong         src_nb3,
+    int           nblk0,
+    int           ne1,
+    int           ne2,
+    int           ne3,
+    global half * dst
+) {
+    const int ib  = get_global_id(0);
+    const int row = get_global_id(1);
+    const int bat = get_global_id(2);
+
+    // drop work-items that fall outside the tensor extent
+    if (ib >= nblk0 || row >= ne1) {
+        return;
+    }
+
+    const int i3 = bat / ne2;
+    const int i2 = bat % ne2;
+    if (i3 >= ne3) {
+        return;
+    }
+
+    // a block is a 2-byte half scale followed by QK8_0 signed 8-bit quants
+    const ulong src_byte = src_offset + (ulong)i3*src_nb3 + (ulong)i2*src_nb2 + (ulong)row*src_nb1 + (ulong)ib * (2 + QK8_0);
+    global char * blk    = src + src_byte;
+    const float   scale  = vload_half(0, (global half *)blk);
+    global char * quants = blk + 2;
+
+    const ulong dst_blk = ((ulong)i3 * ne2 * ne1 + (ulong)i2 * ne1 + (ulong)row) * nblk0 + ib;
+    global half * out   = dst + dst_blk * QK8_0;
+
+    for (int k = 0; k < QK8_0; ++k) {
+        out[k] = (half)(scale * (float)quants[k]);
+    }
+}
+
+// View-aware AoS q4_0 -> f32 dequant (mirrors the q8_0 view variant).
+// One work-item converts one block of QK4_0 values:
+//   global id 0 -> block index within the row, id 1 -> row i1, id 2 -> i3*ne2 + i2.
+// src_offset and src_nb1..3 are byte offsets/strides into src. Each packed byte yields
+// two outputs: the low nibble goes to the first half of the block, the high nibble to
+// the second half; both are shifted by -8 before scaling.
+kernel void kernel_dequant_q4_0_f32_view_aos(
+    global char * src,
+    ulong         src_offset,
+    ulong         src_nb1,
+    ulong         src_nb2,
+    ulong         src_nb3,
+    int           nblk0,
+    int           ne1,
+    int           ne2,
+    int           ne3,
+    global float * dst
+) {
+    const int ib  = get_global_id(0);
+    const int row = get_global_id(1);
+    const int bat = get_global_id(2);
+
+    // drop work-items that fall outside the tensor extent
+    if (ib >= nblk0 || row >= ne1) {
+        return;
+    }
+
+    const int i3 = bat / ne2;
+    const int i2 = bat % ne2;
+    if (i3 >= ne3) {
+        return;
+    }
+
+    // a block is a 2-byte half scale followed by QK4_0/2 bytes of packed 4-bit quants
+    const ulong src_byte  = src_offset + (ulong)i3*src_nb3 + (ulong)i2*src_nb2 + (ulong)row*src_nb1 + (ulong)ib * (2 + QK4_0/2);
+    global char *  blk    = src + src_byte;
+    const float    scale  = vload_half(0, (global half *)blk);
+    global uchar * quants = (global uchar *)(blk + 2);
+
+    const ulong dst_blk = ((ulong)i3 * ne2 * ne1 + (ulong)i2 * ne1 + (ulong)row) * nblk0 + ib;
+    global float * out  = dst + dst_blk * QK4_0;
+
+    for (int k = 0; k < QK4_0/2; ++k) {
+        const uchar packed = quants[k];
+        const int   lo     = (int)(packed & 0x0F) - 8;
+        const int   hi     = (int)(packed >> 4)   - 8;
+        out[k]           = scale * (float)lo;
+        out[k + QK4_0/2] = scale * (float)hi;
+    }
+}
+
+// View-aware AoS q4_0 -> f16 dequant (mirrors the q8_0 view variant).
+// One work-item converts one block of QK4_0 values:
+//   global id 0 -> block index within the row, id 1 -> row i1, id 2 -> i3*ne2 + i2.
+// src_offset and src_nb1..3 are byte offsets/strides into src. Each packed byte yields
+// two outputs: the low nibble goes to the first half of the block, the high nibble to
+// the second half; both are shifted by -8, scaled in float and converted to half on store.
+kernel void kernel_dequant_q4_0_f16_view_aos(
+    global char * src,
+    ulong         src_offset,
+    ulong         src_nb1,
+    ulong         src_nb2,
+    ulong         src_nb3,
+    int           nblk0,
+    int           ne1,
+    int           ne2,
+    int           ne3,
+    global half * dst
+) {
+    const int ib  = get_global_id(0);
+    const int row = get_global_id(1);
+    const int bat = get_global_id(2);
+
+    // drop work-items that fall outside the tensor extent
+    if (ib >= nblk0 || row >= ne1) {
+        return;
+    }
+
+    const int i3 = bat / ne2;
+    const int i2 = bat % ne2;
+    if (i3 >= ne3) {
+        return;
+    }
+
+    // a block is a 2-byte half scale followed by QK4_0/2 bytes of packed 4-bit quants
+    const ulong src_byte  = src_offset + (ulong)i3*src_nb3 + (ulong)i2*src_nb2 + (ulong)row*src_nb1 + (ulong)ib * (2 + QK4_0/2);
+    global char *  blk    = src + src_byte;
+    const float    scale  = vload_half(0, (global half *)blk);
+    global uchar * quants = (global uchar *)(blk + 2);
+
+    const ulong dst_blk = ((ulong)i3 * ne2 * ne1 + (ulong)i2 * ne1 + (ulong)row) * nblk0 + ib;
+    global half * out   = dst + dst_blk * QK4_0;
+
+    for (int k = 0; k < QK4_0/2; ++k) {
+        const uchar packed = quants[k];
+        const int   lo     = (int)(packed & 0x0F) - 8;
+        const int   hi     = (int)(packed >> 4)   - 8;
+        out[k]           = (half)(scale * (float)lo);
+        out[k + QK4_0/2] = (half)(scale * (float)hi);
+    }
+}
+
 kernel void kernel_restore_block_q8_0_trans(
    global uchar * src_q,
    global half  * src_d,
@@ -4,14 +4,26 @@
 #define ACC_TYPE4 float4
 #define DATA_TYPE half
 #define DATA_TYPE4 half4
-#define CONVERT_ACC4(x) convert_float4(x)
-#define CONVERT_DATA4(x) convert_half4(x)
+#define CONVERT_ACC4(x) ((float4)((float)(x).s0, (float)(x).s1, (float)(x).s2, (float)(x).s3))
+#define CONVERT_DATA4(x) ((half4)((half)(x).s0, (half)(x).s1, (half)(x).s2, (half)(x).s3))

 #define DK_VEC (DK/4)
 #define DV_VEC (DV/4)
 #define WG_SIZE (BLOCK_M)
 #define Q1_WG_SIZE 64

+// The kernels are built with -cl-finite-math-only. On some older Adreno GPUs,
+// an infinite operand can cause undefined behavior and miscompilation of exp.
+// Therefore, a large negative finite value is used instead of -INFINITY.
+#define FA_M_INIT (-3.0e38f)
+
+// Skip the unroll pragma when DK >= 192 to stay within the Adreno compiler's host-memory budget.
+#if DK >= 192
+#define FA_UNROLL
+#else
+#define FA_UNROLL _Pragma("unroll")
+#endif
+
 inline float get_alibi_slope(
    const float max_bias, const uint h, const uint n_head_log2, const float m0, const float m1
 ) {
@@ -81,18 +93,18 @@ __kernel void flash_attn_f16(
    if (my_query_row < n_q) {
        const ulong q_row_offset = batch_idx * q_nb3 + head_idx * q_nb2 + my_query_row * q_nb1;
        const global DATA_TYPE4* q_ptr = (const global DATA_TYPE4*)(q_base + q_row_offset);
-        #pragma unroll
+        FA_UNROLL
        for (int i = 0; i < DK_VEC; ++i) {
            q_priv[i] = CONVERT_ACC4(q_ptr[i]);
        }
    }

    ACC_TYPE4 o_acc[DV_VEC];
-    #pragma unroll
+    FA_UNROLL
    for (int i = 0; i < DV_VEC; ++i) {
        o_acc[i] = (ACC_TYPE4)(0.0f);
    }
-    ACC_TYPE m_i = -INFINITY;
+    ACC_TYPE m_i = FA_M_INIT;
    ACC_TYPE l_i = 0.0f;

    float slope = get_alibi_slope(max_bias, head_idx, n_head_log2, m0, m1);
@@ -125,49 +137,72 @@ __kernel void flash_attn_f16(
            continue;
        }

-        for (int j = 0; j < BLOCK_N; j += 2) {
+        for (int j = 0; j < BLOCK_N; j += 4) {
            const int k_row0 = k_start + j;
            const int k_row1 = k_start + j + 1;
+            const int k_row2 = k_start + j + 2;
+            const int k_row3 = k_start + j + 3;

            ACC_TYPE4 dot_acc0 = (ACC_TYPE4)(0.0f);
            ACC_TYPE4 dot_acc1 = (ACC_TYPE4)(0.0f);
-            #pragma unroll
+            ACC_TYPE4 dot_acc2 = (ACC_TYPE4)(0.0f);
+            ACC_TYPE4 dot_acc3 = (ACC_TYPE4)(0.0f);
+            FA_UNROLL
            for (int k = 0; k < DK_VEC; k++) {
-                dot_acc0 = mad(q_priv[k], CONVERT_ACC4(l_k[j][k]), dot_acc0);
-                dot_acc1 = mad(q_priv[k], CONVERT_ACC4(l_k[j+1][k]), dot_acc1);
+                const ACC_TYPE4 qk = q_priv[k];
+                dot_acc0 = mad(qk, CONVERT_ACC4(l_k[j][k]),   dot_acc0);
+                dot_acc1 = mad(qk, CONVERT_ACC4(l_k[j+1][k]), dot_acc1);
+                dot_acc2 = mad(qk, CONVERT_ACC4(l_k[j+2][k]), dot_acc2);
+                dot_acc3 = mad(qk, CONVERT_ACC4(l_k[j+3][k]), dot_acc3);
            }
-            ACC_TYPE score0 = (dot_acc0.s0 + dot_acc0.s1 + dot_acc0.s2 + dot_acc0.s3) * scale;
-            ACC_TYPE score1 = (dot_acc1.s0 + dot_acc1.s1 + dot_acc1.s2 + dot_acc1.s3) * scale;
+            ACC_TYPE s0 = (dot_acc0.s0 + dot_acc0.s1 + dot_acc0.s2 + dot_acc0.s3) * scale;
+            ACC_TYPE s1 = (dot_acc1.s0 + dot_acc1.s1 + dot_acc1.s2 + dot_acc1.s3) * scale;
+            ACC_TYPE s2 = (dot_acc2.s0 + dot_acc2.s1 + dot_acc2.s2 + dot_acc2.s3) * scale;
+            ACC_TYPE s3 = (dot_acc3.s0 + dot_acc3.s1 + dot_acc3.s2 + dot_acc3.s3) * scale;

            if (is_causal) {
-                if (k_row0 > (n_kv - n_q + my_query_row)) score0 = -INFINITY;
-                if (k_row1 > (n_kv - n_q + my_query_row)) score1 = -INFINITY;
+                const int causal_limit = n_kv - n_q + my_query_row;
+                if (k_row0 > causal_limit) s0 = FA_M_INIT;
+                if (k_row1 > causal_limit) s1 = FA_M_INIT;
+                if (k_row2 > causal_limit) s2 = FA_M_INIT;
+                if (k_row3 > causal_limit) s3 = FA_M_INIT;
            }
-
-            if (k_row0 >= n_kv) score0 = -INFINITY;
-            if (k_row1 >= n_kv) score1 = -INFINITY;
+            if (k_row0 >= n_kv) s0 = FA_M_INIT;
+            if (k_row1 >= n_kv) s1 = FA_M_INIT;
+            if (k_row2 >= n_kv) s2 = FA_M_INIT;
+            if (k_row3 >= n_kv) s3 = FA_M_INIT;

            if (mask_base != NULL) {
                const global DATA_TYPE* mask_ptr = (const global DATA_TYPE*)(mask_base + my_query_row * mask_nb1);
-                if (k_row0 < n_kv) score0 += slope * (ACC_TYPE)mask_ptr[k_row0];
-                if (k_row1 < n_kv) score1 += slope * (ACC_TYPE)mask_ptr[k_row1];
+                if (k_row0 < n_kv) s0 += slope * (ACC_TYPE)mask_ptr[k_row0];
+                if (k_row1 < n_kv) s1 += slope * (ACC_TYPE)mask_ptr[k_row1];
+                if (k_row2 < n_kv) s2 += slope * (ACC_TYPE)mask_ptr[k_row2];
+                if (k_row3 < n_kv) s3 += slope * (ACC_TYPE)mask_ptr[k_row3];
            }

            if (logit_softcap > 0.0f) {
-                score0 = logit_softcap * tanh(score0 / logit_softcap);
-                score1 = logit_softcap * tanh(score1 / logit_softcap);
+                s0 = logit_softcap * tanh(s0 / logit_softcap);
+                s1 = logit_softcap * tanh(s1 / logit_softcap);
+                s2 = logit_softcap * tanh(s2 / logit_softcap);
+                s3 = logit_softcap * tanh(s3 / logit_softcap);
            }

-            const ACC_TYPE m_new = max(m_i, max(score0, score1));
-            const ACC_TYPE p0 = exp(score0 - m_new);
-            const ACC_TYPE p1 = exp(score1 - m_new);
-            const ACC_TYPE scale_prev = exp(m_i - m_new);
+            const ACC_TYPE m_new      = max(m_i, max(max(s0, s1), max(s2, s3)));
+            const ACC_TYPE scale_prev = native_exp(m_i - m_new);
+            const ACC_TYPE p0         = native_exp(s0 - m_new);
+            const ACC_TYPE p1         = native_exp(s1 - m_new);
+            const ACC_TYPE p2         = native_exp(s2 - m_new);
+            const ACC_TYPE p3         = native_exp(s3 - m_new);

-            #pragma unroll
+            FA_UNROLL
            for (int i = 0; i < DV_VEC; ++i) {
-                o_acc[i] = o_acc[i] * scale_prev + p0 * CONVERT_ACC4(l_v[j][i]) + p1 * CONVERT_ACC4(l_v[j+1][i]);
+                o_acc[i] = mad(p3, CONVERT_ACC4(l_v[j+3][i]),
+                           mad(p2, CONVERT_ACC4(l_v[j+2][i]),
+                           mad(p1, CONVERT_ACC4(l_v[j+1][i]),
+                           mad(p0, CONVERT_ACC4(l_v[j][i]),
+                           o_acc[i] * scale_prev))));
            }
-            l_i = l_i * scale_prev + p0 + p1;
+            l_i = l_i * scale_prev + p0 + p1 + p2 + p3;
            m_i = m_new;
        }
    }
@@ -179,7 +214,7 @@ __kernel void flash_attn_f16(
            const ACC_TYPE m_final = max(m_i, m_sink);

            const ACC_TYPE scale_o = exp(m_i - m_final);
-            #pragma unroll
+            FA_UNROLL
            for (int i = 0; i < DV_VEC; ++i) {
                o_acc[i] *= scale_o;
            }
@@ -191,12 +226,12 @@ __kernel void flash_attn_f16(
        global DATA_TYPE4 *o_row = (global DATA_TYPE4 *)(o_base + o_row_offset);
        if (l_i > 0.0f) {
            const ACC_TYPE l_inv = 1.0f / l_i;
-            #pragma unroll
+            FA_UNROLL
            for (int i = 0; i < DV_VEC; ++i) {
                o_row[i] = CONVERT_DATA4(o_acc[i] * l_inv);
            }
        } else {
-            #pragma unroll
+            FA_UNROLL
            for (int i = 0; i < DV_VEC; ++i) {
                o_row[i] = (DATA_TYPE4)(0.0f);
            }
@@ -258,7 +293,7 @@ __kernel void flash_attn_f16_q1(
    ACC_TYPE4 q_priv[DK_VEC];
    const ulong q_row_offset = batch_idx * q_nb3 + head_idx * q_nb2;
    const global DATA_TYPE4* q_ptr = (const global DATA_TYPE4*)(q_base + q_row_offset);
-    #pragma unroll
+    FA_UNROLL
    for (int i = 0; i < DK_VEC; ++i) {
        q_priv[i] = CONVERT_ACC4(q_ptr[i]);
    }
@@ -270,12 +305,12 @@ __kernel void flash_attn_f16_q1(
        sinks_ptr = (const global ACC_TYPE*)((const global char*)sinks_void + sinks_offset);
    }

-    ACC_TYPE m_i = (sinks_ptr != NULL) ? sinks_ptr[head_idx] : -INFINITY;
+    ACC_TYPE m_i = (sinks_ptr != NULL) ? sinks_ptr[head_idx] : FA_M_INIT;
    for (int k_idx = tid; k_idx < n_kv; k_idx += Q1_WG_SIZE) {
        const ulong k_row_offset = batch_idx * k_nb3 + head_kv_idx * k_nb2 + k_idx * k_nb1;
        const global DATA_TYPE4* k_ptr = (const global DATA_TYPE4*)(k_base + k_row_offset);
        ACC_TYPE4 dot_acc = (ACC_TYPE4)(0.0f);
-        #pragma unroll
+        FA_UNROLL
        for (int k = 0; k < DK_VEC; k++) {
            dot_acc = mad(q_priv[k], CONVERT_ACC4(k_ptr[k]), dot_acc);
        }
@@ -293,7 +328,7 @@ __kernel void flash_attn_f16_q1(
    __local ACC_TYPE local_m[Q1_WG_SIZE];
    local_m[tid] = m_i;
    barrier(CLK_LOCAL_MEM_FENCE);
-    #pragma unroll
+    FA_UNROLL
    for (int s = Q1_WG_SIZE / 2; s > 0; s >>= 1) {
        if (tid < s) local_m[tid] = max(local_m[tid], local_m[tid + s]);
        barrier(CLK_LOCAL_MEM_FENCE);
@@ -301,7 +336,7 @@ __kernel void flash_attn_f16_q1(
    const ACC_TYPE m_final = local_m[0];

    ACC_TYPE4 o_acc[DV_VEC];
-    #pragma unroll
+    FA_UNROLL
    for (int i = 0; i < DV_VEC; ++i) o_acc[i] = (ACC_TYPE4)(0.0f);
    ACC_TYPE l_i = 0.0f;

@@ -311,7 +346,7 @@ __kernel void flash_attn_f16_q1(
        const global DATA_TYPE4* k_ptr = (const global DATA_TYPE4*)(k_base + k_row_offset);
        const global DATA_TYPE4* v_ptr = (const global DATA_TYPE4*)(v_base + v_row_offset);
        ACC_TYPE4 dot_acc = (ACC_TYPE4)(0.0f);
-        #pragma unroll
+        FA_UNROLL
        for (int k = 0; k < DK_VEC; k++) {
            dot_acc = mad(q_priv[k], CONVERT_ACC4(k_ptr[k]), dot_acc);
        }
@@ -325,7 +360,7 @@ __kernel void flash_attn_f16_q1(
        }
        const ACC_TYPE p = exp(score - m_final);
        l_i += p;
-        #pragma unroll
+        FA_UNROLL
        for (int i = 0; i < DV_VEC; i++) {
            o_acc[i] = mad(p, CONVERT_ACC4(v_ptr[i]), o_acc[i]);
        }
@@ -335,7 +370,7 @@ __kernel void flash_attn_f16_q1(
    __local ACC_TYPE4 local_o_comp[Q1_WG_SIZE];
    local_l[tid] = l_i;
    barrier(CLK_LOCAL_MEM_FENCE);
-    #pragma unroll
+    FA_UNROLL
    for (int s = Q1_WG_SIZE / 2; s > 0; s >>= 1) {
        if (tid < s) local_l[tid] += local_l[tid + s];
        barrier(CLK_LOCAL_MEM_FENCE);
@@ -354,7 +389,7 @@ __kernel void flash_attn_f16_q1(
        for (int i = 0; i < DV_VEC; i++) {
            local_o_comp[tid] = o_acc[i];
            barrier(CLK_LOCAL_MEM_FENCE);
-            #pragma unroll
+            FA_UNROLL
            for (int s = Q1_WG_SIZE / 2; s > 0; s >>= 1) {
                if (tid < s) local_o_comp[tid] += local_o_comp[tid + s];
                barrier(CLK_LOCAL_MEM_FENCE);
@@ -364,7 +399,7 @@ __kernel void flash_attn_f16_q1(
            }
        }
    } else if (tid == 0) {
-        #pragma unroll
+        FA_UNROLL
        for (int i = 0; i < DV_VEC; ++i) o_row[i] = (DATA_TYPE4)(0.0f);
    }
 }
@@ -13,6 +13,18 @@
 #define WG_SIZE (BLOCK_M)
 #define Q1_WG_SIZE 64

+// The kernels are built with -cl-finite-math-only. On some older Adreno GPUs,
+// an infinite operand can cause undefined behavior and miscompilation of exp.
+// Therefore, a large negative finite value is used instead of -INFINITY.
+#define FA_M_INIT (-3.0e38f)
+
+// Skip the unroll pragma when DK >= 192 to stay within the Adreno compiler's host-memory budget.
+#if DK >= 192
+#define FA_UNROLL
+#else
+#define FA_UNROLL _Pragma("unroll")
+#endif
+
 inline float get_alibi_slope(
    const float max_bias, const uint h, const uint n_head_log2, const float m0, const float m1
 ) {
@@ -82,18 +94,18 @@ __kernel void flash_attn_f32(
    if (my_query_row < n_q) {
        const ulong q_row_offset = batch_idx * q_nb3 + head_idx * q_nb2 + my_query_row * q_nb1;
        const global DATA_TYPE4* q_ptr = (const global DATA_TYPE4*)(q_base + q_row_offset);
-        #pragma unroll
+        FA_UNROLL
        for (int i = 0; i < DK_VEC; ++i) {
            q_priv[i] = CONVERT_ACC4(q_ptr[i]);
        }
    }

    ACC_TYPE4 o_acc[DV_VEC];
-    #pragma unroll
+    FA_UNROLL
    for (int i = 0; i < DV_VEC; ++i) {
        o_acc[i] = (ACC_TYPE4)(0.0f);
    }
-    ACC_TYPE m_i = -INFINITY;
+    ACC_TYPE m_i = FA_M_INIT;
    ACC_TYPE l_i = 0.0f;

    float slope = get_alibi_slope(max_bias, head_idx, n_head_log2, m0, m1);
@@ -126,49 +138,72 @@ __kernel void flash_attn_f32(
            continue;
        }

-        for (int j = 0; j < BLOCK_N; j += 2) {
+        for (int j = 0; j < BLOCK_N; j += 4) {
            const int k_row0 = k_start + j;
            const int k_row1 = k_start + j + 1;
+            const int k_row2 = k_start + j + 2;
+            const int k_row3 = k_start + j + 3;

            ACC_TYPE4 dot_acc0 = (ACC_TYPE4)(0.0f);
            ACC_TYPE4 dot_acc1 = (ACC_TYPE4)(0.0f);
-            #pragma unroll
+            ACC_TYPE4 dot_acc2 = (ACC_TYPE4)(0.0f);
+            ACC_TYPE4 dot_acc3 = (ACC_TYPE4)(0.0f);
+            FA_UNROLL
            for (int k = 0; k < DK_VEC; k++) {
-                dot_acc0 = mad(q_priv[k], CONVERT_ACC4(l_k[j][k]), dot_acc0);
-                dot_acc1 = mad(q_priv[k], CONVERT_ACC4(l_k[j+1][k]), dot_acc1);
+                const ACC_TYPE4 qk = q_priv[k];
+                dot_acc0 = mad(qk, CONVERT_ACC4(l_k[j][k]),   dot_acc0);
+                dot_acc1 = mad(qk, CONVERT_ACC4(l_k[j+1][k]), dot_acc1);
+                dot_acc2 = mad(qk, CONVERT_ACC4(l_k[j+2][k]), dot_acc2);
+                dot_acc3 = mad(qk, CONVERT_ACC4(l_k[j+3][k]), dot_acc3);
            }
-            ACC_TYPE score0 = (dot_acc0.s0 + dot_acc0.s1 + dot_acc0.s2 + dot_acc0.s3) * scale;
-            ACC_TYPE score1 = (dot_acc1.s0 + dot_acc1.s1 + dot_acc1.s2 + dot_acc1.s3) * scale;
+            ACC_TYPE s0 = (dot_acc0.s0 + dot_acc0.s1 + dot_acc0.s2 + dot_acc0.s3) * scale;
+            ACC_TYPE s1 = (dot_acc1.s0 + dot_acc1.s1 + dot_acc1.s2 + dot_acc1.s3) * scale;
+            ACC_TYPE s2 = (dot_acc2.s0 + dot_acc2.s1 + dot_acc2.s2 + dot_acc2.s3) * scale;
+            ACC_TYPE s3 = (dot_acc3.s0 + dot_acc3.s1 + dot_acc3.s2 + dot_acc3.s3) * scale;

            if (is_causal) {
-                if (k_row0 > (n_kv - n_q + my_query_row)) score0 = -INFINITY;
-                if (k_row1 > (n_kv - n_q + my_query_row)) score1 = -INFINITY;
+                const int causal_limit = n_kv - n_q + my_query_row;
+                if (k_row0 > causal_limit) s0 = FA_M_INIT;
+                if (k_row1 > causal_limit) s1 = FA_M_INIT;
+                if (k_row2 > causal_limit) s2 = FA_M_INIT;
+                if (k_row3 > causal_limit) s3 = FA_M_INIT;
            }
-
-            if (k_row0 >= n_kv) score0 = -INFINITY;
-            if (k_row1 >= n_kv) score1 = -INFINITY;
+            if (k_row0 >= n_kv) s0 = FA_M_INIT;
+            if (k_row1 >= n_kv) s1 = FA_M_INIT;
+            if (k_row2 >= n_kv) s2 = FA_M_INIT;
+            if (k_row3 >= n_kv) s3 = FA_M_INIT;

            if (mask_base != NULL) {
                const global MASK_DATA_TYPE* mask_ptr = (const global MASK_DATA_TYPE*)(mask_base + my_query_row * mask_nb1);
-                if (k_row0 < n_kv) score0 += slope * (ACC_TYPE)mask_ptr[k_row0];
-                if (k_row1 < n_kv) score1 += slope * (ACC_TYPE)mask_ptr[k_row1];
+                if (k_row0 < n_kv) s0 += slope * (ACC_TYPE)mask_ptr[k_row0];
+                if (k_row1 < n_kv) s1 += slope * (ACC_TYPE)mask_ptr[k_row1];
+                if (k_row2 < n_kv) s2 += slope * (ACC_TYPE)mask_ptr[k_row2];
+                if (k_row3 < n_kv) s3 += slope * (ACC_TYPE)mask_ptr[k_row3];
            }

            if (logit_softcap > 0.0f) {
-                score0 = logit_softcap * tanh(score0 / logit_softcap);
-                score1 = logit_softcap * tanh(score1 / logit_softcap);
+                s0 = logit_softcap * tanh(s0 / logit_softcap);
+                s1 = logit_softcap * tanh(s1 / logit_softcap);
+                s2 = logit_softcap * tanh(s2 / logit_softcap);
+                s3 = logit_softcap * tanh(s3 / logit_softcap);
            }

-            const ACC_TYPE m_new = max(m_i, max(score0, score1));
-            const ACC_TYPE p0 = exp(score0 - m_new);
-            const ACC_TYPE p1 = exp(score1 - m_new);
-            const ACC_TYPE scale_prev = exp(m_i - m_new);
+            const ACC_TYPE m_new      = max(m_i, max(max(s0, s1), max(s2, s3)));
+            const ACC_TYPE scale_prev = native_exp(m_i - m_new);
+            const ACC_TYPE p0         = native_exp(s0 - m_new);
+            const ACC_TYPE p1         = native_exp(s1 - m_new);
+            const ACC_TYPE p2         = native_exp(s2 - m_new);
+            const ACC_TYPE p3         = native_exp(s3 - m_new);

-            #pragma unroll
+            FA_UNROLL
            for (int i = 0; i < DV_VEC; ++i) {
-                o_acc[i] = o_acc[i] * scale_prev + p0 * CONVERT_ACC4(l_v[j][i]) + p1 * CONVERT_ACC4(l_v[j+1][i]);
+                o_acc[i] = mad(p3, CONVERT_ACC4(l_v[j+3][i]),
+                           mad(p2, CONVERT_ACC4(l_v[j+2][i]),
+                           mad(p1, CONVERT_ACC4(l_v[j+1][i]),
+                           mad(p0, CONVERT_ACC4(l_v[j][i]),
+                           o_acc[i] * scale_prev))));
            }
-            l_i = l_i * scale_prev + p0 + p1;
+            l_i = l_i * scale_prev + p0 + p1 + p2 + p3;
            m_i = m_new;
        }
    }
@@ -180,7 +215,7 @@ __kernel void flash_attn_f32(
            const ACC_TYPE m_final = max(m_i, m_sink);

            const ACC_TYPE scale_o = exp(m_i - m_final);
-            #pragma unroll
+            FA_UNROLL
            for (int i = 0; i < DV_VEC; ++i) {
                o_acc[i] *= scale_o;
            }
@@ -192,12 +227,12 @@ __kernel void flash_attn_f32(
        global DATA_TYPE4 *o_row = (global DATA_TYPE4 *)(o_base + o_row_offset);
        if (l_i > 0.0f) {
            const ACC_TYPE l_inv = 1.0f / l_i;
-            #pragma unroll
+            FA_UNROLL
            for (int i = 0; i < DV_VEC; ++i) {
                o_row[i] = CONVERT_DATA4(o_acc[i] * l_inv);
            }
        } else {
-            #pragma unroll
+            FA_UNROLL
            for (int i = 0; i < DV_VEC; ++i) {
                o_row[i] = (DATA_TYPE4)(0.0f);
            }
@@ -259,7 +294,7 @@ __kernel void flash_attn_f32_q1(
    ACC_TYPE4 q_priv[DK_VEC];
    const ulong q_row_offset = batch_idx * q_nb3 + head_idx * q_nb2;
    const global DATA_TYPE4* q_ptr = (const global DATA_TYPE4*)(q_base + q_row_offset);
-    #pragma unroll
+    FA_UNROLL
    for (int i = 0; i < DK_VEC; ++i) {
        q_priv[i] = CONVERT_ACC4(q_ptr[i]);
    }
@@ -271,12 +306,12 @@ __kernel void flash_attn_f32_q1(
        sinks_ptr = (const global ACC_TYPE*)((const global char*)sinks_void + sinks_offset);
    }

-    ACC_TYPE m_i = (sinks_ptr != NULL) ? sinks_ptr[head_idx] : -INFINITY;
+    ACC_TYPE m_i = (sinks_ptr != NULL) ? sinks_ptr[head_idx] : FA_M_INIT;
    for (int k_idx = tid; k_idx < n_kv; k_idx += Q1_WG_SIZE) {
        const ulong k_row_offset = batch_idx * k_nb3 + head_kv_idx * k_nb2 + k_idx * k_nb1;
        const global DATA_TYPE4* k_ptr = (const global DATA_TYPE4*)(k_base + k_row_offset);
        ACC_TYPE4 dot_acc = (ACC_TYPE4)(0.0f);
-        #pragma unroll
+        FA_UNROLL
        for (int k = 0; k < DK_VEC; k++) {
            dot_acc = mad(q_priv[k], CONVERT_ACC4(k_ptr[k]), dot_acc);
        }
@@ -294,7 +329,7 @@ __kernel void flash_attn_f32_q1(
    __local ACC_TYPE local_m[Q1_WG_SIZE];
    local_m[tid] = m_i;
    barrier(CLK_LOCAL_MEM_FENCE);
-    #pragma unroll
+    FA_UNROLL
    for (int s = Q1_WG_SIZE / 2; s > 0; s >>= 1) {
        if (tid < s) local_m[tid] = max(local_m[tid], local_m[tid + s]);
        barrier(CLK_LOCAL_MEM_FENCE);
@@ -302,7 +337,7 @@ __kernel void flash_attn_f32_q1(
    const ACC_TYPE m_final = local_m[0];

    ACC_TYPE4 o_acc[DV_VEC];
-    #pragma unroll
+    FA_UNROLL
    for (int i = 0; i < DV_VEC; ++i) o_acc[i] = (ACC_TYPE4)(0.0f);
    ACC_TYPE l_i = 0.0f;

@@ -312,7 +347,7 @@ __kernel void flash_attn_f32_q1(
        const global DATA_TYPE4* k_ptr = (const global DATA_TYPE4*)(k_base + k_row_offset);
        const global DATA_TYPE4* v_ptr = (const global DATA_TYPE4*)(v_base + v_row_offset);
        ACC_TYPE4 dot_acc = (ACC_TYPE4)(0.0f);
-        #pragma unroll
+        FA_UNROLL
        for (int k = 0; k < DK_VEC; k++) {
            dot_acc = mad(q_priv[k], CONVERT_ACC4(k_ptr[k]), dot_acc);
        }
@@ -326,7 +361,7 @@ __kernel void flash_attn_f32_q1(
        }
        const ACC_TYPE p = exp(score - m_final);
        l_i += p;
-        #pragma unroll
+        FA_UNROLL
        for (int i = 0; i < DV_VEC; i++) {
            o_acc[i] = mad(p, CONVERT_ACC4(v_ptr[i]), o_acc[i]);
        }
@@ -336,7 +371,7 @@ __kernel void flash_attn_f32_q1(
    __local ACC_TYPE4 local_o_comp[Q1_WG_SIZE];
    local_l[tid] = l_i;
    barrier(CLK_LOCAL_MEM_FENCE);
-    #pragma unroll
+    FA_UNROLL
    for (int s = Q1_WG_SIZE / 2; s > 0; s >>= 1) {
        if (tid < s) local_l[tid] += local_l[tid + s];
        barrier(CLK_LOCAL_MEM_FENCE);
@@ -355,7 +390,7 @@ __kernel void flash_attn_f32_q1(
        for (int i = 0; i < DV_VEC; i++) {
            local_o_comp[tid] = o_acc[i];
            barrier(CLK_LOCAL_MEM_FENCE);
-            #pragma unroll
+            FA_UNROLL
            for (int s = Q1_WG_SIZE / 2; s > 0; s >>= 1) {
                if (tid < s) local_o_comp[tid] += local_o_comp[tid + s];
                barrier(CLK_LOCAL_MEM_FENCE);
@@ -365,7 +400,7 @@ __kernel void flash_attn_f32_q1(
            }
        }
    } else if (tid == 0) {
-        #pragma unroll
+        FA_UNROLL
        for (int i = 0; i < DV_VEC; ++i) o_row[i] = (DATA_TYPE4)(0.0f);
    }
 }
@@ -1,5 +1,13 @@
 #pragma OPENCL EXTENSION cl_khr_fp16 : enable

+#ifdef cl_khr_subgroup_shuffle
+#pragma OPENCL EXTENSION cl_khr_subgroup_shuffle : enable
+#define HAS_SUBGROUP_SHUFFLE 1
+#elif defined(cl_qcom_subgroup_shuffle)
+#pragma OPENCL EXTENSION cl_qcom_subgroup_shuffle : enable
+#define HAS_SUBGROUP_SHUFFLE 1
+#endif
+
 #define ACC_TYPE float
 #define ACC_TYPE4 float4
 #define Q_DATA_TYPE4 float4
@@ -12,9 +20,34 @@

 #define DK_VEC (DK/4)
 #define DV_VEC (DV/4)
-#define WG_SIZE (BLOCK_M)
 #define Q1_WG_SIZE 64

+// The kernels are built with -cl-finite-math-only. On some older Adreno GPUs,
+// an infinite operand can cause undefined behavior and miscompilation of exp.
+// Therefore, a large negative finite value is used instead of -INFINITY.
+#define FA_M_INIT (-3.0e38f)
+
+// Skip the unroll pragma when DK >= 192 to stay within the Adreno compiler's host-memory budget.
+#if DK >= 192
+#define FA_UNROLL
+#else
+#define FA_UNROLL _Pragma("unroll")
+#endif
+
+// N_SPLIT>1 splits DK/DV across threads to cut per-thread register use.
+#ifndef N_SPLIT
+#define N_SPLIT 1
+#endif
+
+#define SPLIT_DK_VEC (DK_VEC / N_SPLIT)
+#define SPLIT_DV_VEC (DV_VEC / N_SPLIT)
+
+#if N_SPLIT > 1
+#define WG_SIZE (BLOCK_M * N_SPLIT)
+#else
+#define WG_SIZE (BLOCK_M)
+#endif
+
 inline float get_alibi_slope(
    const float max_bias, const uint h, const uint n_head_log2, const float m0, const float m1
 ) {
@@ -54,19 +87,38 @@ __kernel void flash_attn_f32_f16(
    const int mask_ne2,
    const int mask_ne3,
    const global void* sinks_void,
-    const ulong sinks_offset
+    const ulong sinks_offset,
+    const global void * k_pad_void,
+    const global void * v_pad_void,
+    const global void * mask_pad_void,
+    const global char * blk,
+    const int n_kv_blocks,
+    const ulong mask_pad_nb1,
+    const ulong mask_pad_nb2,
+    const ulong mask_pad_nb3
 ) {
    const int tid = get_local_id(0);
    const int block_q_idx = get_group_id(0);
    const int head_batch_idx = get_global_id(1);

-    const int my_query_row = block_q_idx * BLOCK_M + tid;
+#if N_SPLIT > 1
+    const int q_lane    = tid / N_SPLIT;
+    const int split_idx = tid % N_SPLIT;
+#else
+    const int q_lane    = tid;
+    const int split_idx = 0;
+#endif
+
+    const int my_query_row = block_q_idx * BLOCK_M + q_lane;
+    const int query_valid = my_query_row < n_q;

    const int batch_idx = head_batch_idx / n_head;
    const int head_idx = head_batch_idx % n_head;

    const int gqa_ratio = n_head / n_head_kv;
    const int head_kv_idx = head_idx / gqa_ratio;
+    const int mask_head_idx = mask_void != NULL ? head_idx % mask_ne2 : 0;
+    const int mask_batch_idx = mask_void != NULL ? batch_idx % mask_ne3 : 0;

    const global char* q_base = (const global char*)q_void + q_offset;
    const global char* k_base = (const global char*)k_void + k_offset;
@@ -75,27 +127,41 @@ __kernel void flash_attn_f32_f16(

    const global char* mask_base = NULL;
    if (mask_void != NULL) {
-        const int mask_head_idx = head_idx % mask_ne2;
-        const int mask_batch_idx = batch_idx % mask_ne3;
        mask_base = (const global char*)mask_void + mask_offset + mask_batch_idx * mask_nb3 + mask_head_idx * mask_nb2;
    }
+    const global char* mask_pad_base = NULL;
+    if (mask_pad_void != NULL) {
+        mask_pad_base = (const global char*)mask_pad_void + mask_batch_idx * mask_pad_nb3 + mask_head_idx * mask_pad_nb2;
+    }
+    const global char* blk_base = NULL;
+    if (blk != NULL) {
+        const int n_q_blocks = (n_q + BLOCK_M - 1) / BLOCK_M;
+        blk_base = blk + (((mask_batch_idx * mask_ne2) + mask_head_idx) * n_q_blocks + block_q_idx) * n_kv_blocks;
+    }

-    ACC_TYPE4 q_priv[DK_VEC];
-    if (my_query_row < n_q) {
+    ACC_TYPE4 q_priv[SPLIT_DK_VEC];
+    const int dk_off = split_idx * SPLIT_DK_VEC;
+    if (query_valid) {
        const ulong q_row_offset = batch_idx * q_nb3 + head_idx * q_nb2 + my_query_row * q_nb1;
        const global Q_DATA_TYPE4* q_ptr = (const global Q_DATA_TYPE4*)(q_base + q_row_offset);
-        #pragma unroll
-        for (int i = 0; i < DK_VEC; ++i) {
-            q_priv[i] = CONVERT_Q_ACC4(q_ptr[i]);
+        FA_UNROLL
+        for (int i = 0; i < SPLIT_DK_VEC; ++i) {
+            q_priv[i] = CONVERT_Q_ACC4(q_ptr[dk_off + i]);
+        }
+    } else {
+        FA_UNROLL
+        for (int i = 0; i < SPLIT_DK_VEC; ++i) {
+            q_priv[i] = (ACC_TYPE4)(0.0f);
        }
    }

-    ACC_TYPE4 o_acc[DV_VEC];
-    #pragma unroll
-    for (int i = 0; i < DV_VEC; ++i) {
+    ACC_TYPE4 o_acc[SPLIT_DV_VEC];
+    FA_UNROLL
+    for (int i = 0; i < SPLIT_DV_VEC; ++i) {
        o_acc[i] = (ACC_TYPE4)(0.0f);
    }
-    ACC_TYPE m_i = -INFINITY;
+
+    ACC_TYPE m_i = FA_M_INIT;
    ACC_TYPE l_i = 0.0f;

    float slope = get_alibi_slope(max_bias, head_idx, n_head_log2, m0, m1);
@@ -103,86 +169,369 @@ __kernel void flash_attn_f32_f16(
    __local KV_DATA_TYPE4 l_k[BLOCK_N][DK_VEC];
    __local KV_DATA_TYPE4 l_v[BLOCK_N][DV_VEC];

+#if N_SPLIT > 1 && !defined(HAS_SUBGROUP_SHUFFLE)
+    __local ACC_TYPE local_partial[BLOCK_N][WG_SIZE];
+    __local ACC_TYPE local_p[BLOCK_M][BLOCK_N];
+    __local ACC_TYPE local_softmax_scale[BLOCK_M];
+    __local ACC_TYPE local_l_inv[BLOCK_M];
+#endif
+
    for (int k_start = 0; k_start < n_kv; k_start += BLOCK_N) {
+        char blk_cur = 1;
+        if (blk_base != NULL) {
+            blk_cur = blk_base[k_start / BLOCK_N];
+            if (blk_cur == 0) continue;
+        }
+
+        const int use_kv_pad = k_pad_void != NULL && k_start + BLOCK_N > n_kv;
+        const int k_tile_start = use_kv_pad ? 0 : k_start;
+        const ulong k_tile_nb2 = use_kv_pad ? (ulong) BLOCK_N * k_nb1 : k_nb2;
+        const ulong k_tile_nb3 = use_kv_pad ? (ulong) n_head_kv * k_tile_nb2 : k_nb3;
+        const ulong v_tile_nb2 = use_kv_pad ? (ulong) BLOCK_N * v_nb1 : v_nb2;
+        const ulong v_tile_nb3 = use_kv_pad ? (ulong) n_head_kv * v_tile_nb2 : v_nb3;
+        const global char* k_tile_base = use_kv_pad ? (const global char*) k_pad_void : k_base;
+        const global char* v_tile_base = use_kv_pad ? (const global char*) v_pad_void : v_base;
+
        for (int i = tid; i < BLOCK_N * DK_VEC; i += WG_SIZE) {
            const int row = i / DK_VEC;
            const int col = i % DK_VEC;
-            const int k_row_idx = k_start + row;
-            if (k_row_idx < n_kv) {
-                const ulong k_row_offset = batch_idx * k_nb3 + head_kv_idx * k_nb2 + k_row_idx * k_nb1;
-                l_k[row][col] = ((__global KV_DATA_TYPE4*)(k_base + k_row_offset))[col];
+            const int k_row_idx = k_tile_start + row;
+            if (use_kv_pad || k_row_idx < n_kv) {
+                const ulong k_row_offset = batch_idx * k_tile_nb3 + head_kv_idx * k_tile_nb2 + k_row_idx * k_nb1;
+                l_k[row][col] = ((__global KV_DATA_TYPE4*)(k_tile_base + k_row_offset))[col];
+            } else {
+                l_k[row][col] = (KV_DATA_TYPE4)(0.0h);
            }
        }
        for (int i = tid; i < BLOCK_N * DV_VEC; i += WG_SIZE) {
            const int row = i / DV_VEC;
            const int col = i % DV_VEC;
-            const int v_row_idx = k_start + row;
-            if (v_row_idx < n_kv) {
-                const ulong v_row_offset = batch_idx * v_nb3 + head_kv_idx * v_nb2 + v_row_idx * v_nb1;
-                l_v[row][col] = ((__global KV_DATA_TYPE4*)(v_base + v_row_offset))[col];
+            const int v_row_idx = k_tile_start + row;
+            if (use_kv_pad || v_row_idx < n_kv) {
+                const ulong v_row_offset = batch_idx * v_tile_nb3 + head_kv_idx * v_tile_nb2 + v_row_idx * v_nb1;
+                l_v[row][col] = ((__global KV_DATA_TYPE4*)(v_tile_base + v_row_offset))[col];
+            } else {
+                l_v[row][col] = (KV_DATA_TYPE4)(0.0h);
            }
        }
        barrier(CLK_LOCAL_MEM_FENCE);

-        if (my_query_row >= n_q) {
-            continue;
+#if N_SPLIT > 1 && defined(HAS_SUBGROUP_SHUFFLE)
+        {
+            const int dv_off = split_idx * SPLIT_DV_VEC;
+            for (int j = 0; j < BLOCK_N; j += 2) {
+                const int k_row0 = k_start + j;
+                const int k_row1 = k_start + j + 1;
+
+                ACC_TYPE partial0 = 0.0f;
+                ACC_TYPE partial1 = 0.0f;
+                FA_UNROLL
+                for (int k = 0; k < SPLIT_DK_VEC; k++) {
+                    const ACC_TYPE4 qk = q_priv[k];
+                    ACC_TYPE4 dot0 = qk * CONVERT_KV_ACC4(l_k[j  ][dk_off + k]);
+                    ACC_TYPE4 dot1 = qk * CONVERT_KV_ACC4(l_k[j+1][dk_off + k]);
+                    partial0 += dot0.s0 + dot0.s1 + dot0.s2 + dot0.s3;
+                    partial1 += dot1.s0 + dot1.s1 + dot1.s2 + dot1.s3;
+                }
+
+                FA_UNROLL
+                for (int step = 1; step < N_SPLIT; step <<= 1) {
+                    partial0 += sub_group_shuffle_xor(partial0, step);
+                    partial1 += sub_group_shuffle_xor(partial1, step);
+                }
+
+                ACC_TYPE score0 = partial0 * scale;
+                ACC_TYPE score1 = partial1 * scale;
+
+                if (!query_valid) { score0 = FA_M_INIT; score1 = FA_M_INIT; }
+                if (is_causal) {
+                    if (k_row0 > (n_kv - n_q + my_query_row)) score0 = FA_M_INIT;
+                    if (k_row1 > (n_kv - n_q + my_query_row)) score1 = FA_M_INIT;
+                }
+                if (k_row0 >= n_kv) score0 = FA_M_INIT;
+                if (k_row1 >= n_kv) score1 = FA_M_INIT;
+
+                if (query_valid && mask_base != NULL && blk_cur != 2) {
+                    if (use_kv_pad && mask_pad_base != NULL) {
+                        const global MASK_DATA_TYPE* mask_ptr =
+                            (const global MASK_DATA_TYPE*)(mask_pad_base + my_query_row * mask_pad_nb1);
+                        score0 += slope * (ACC_TYPE)mask_ptr[j];
+                        score1 += slope * (ACC_TYPE)mask_ptr[j + 1];
+                    } else {
+                        const global MASK_DATA_TYPE* mask_ptr =
+                            (const global MASK_DATA_TYPE*)(mask_base + my_query_row * mask_nb1);
+                        if (k_row0 < n_kv) score0 += slope * (ACC_TYPE)mask_ptr[k_row0];
+                        if (k_row1 < n_kv) score1 += slope * (ACC_TYPE)mask_ptr[k_row1];
+                    }
+                }
+
+                if (logit_softcap > 0.0f) {
+                    score0 = logit_softcap * tanh(score0 / logit_softcap);
+                    score1 = logit_softcap * tanh(score1 / logit_softcap);
+                }
+
+                const ACC_TYPE m_new = max(m_i, max(score0, score1));
+                // Whole tile masked (m_new == FA_M_INIT): force the exp() args
+                // far negative so the tile contributes 0, not exp(0)=1.
+                const ACC_TYPE m_exp = (m_new == FA_M_INIT) ? 0.0f : m_new;
+                const ACC_TYPE sp    = native_exp(m_i - m_exp);
+                const ACC_TYPE p0    = native_exp(score0 - m_exp);
+                const ACC_TYPE p1    = native_exp(score1 - m_exp);
+
+                FA_UNROLL
+                for (int i = 0; i < SPLIT_DV_VEC; ++i) {
+                    o_acc[i] = o_acc[i] * sp
+                             + p0 * CONVERT_KV_ACC4(l_v[j  ][dv_off + i])
+                             + p1 * CONVERT_KV_ACC4(l_v[j+1][dv_off + i]);
+                }
+                l_i = l_i * sp + p0 + p1;
+                m_i = m_new;
+            }
        }
-
-        for (int j = 0; j < BLOCK_N; j += 2) {
-            const int k_row0 = k_start + j;
-            const int k_row1 = k_start + j + 1;
-
-            ACC_TYPE4 dot_acc0 = (ACC_TYPE4)(0.0f);
-            ACC_TYPE4 dot_acc1 = (ACC_TYPE4)(0.0f);
-            #pragma unroll
-            for (int k = 0; k < DK_VEC; k++) {
-                dot_acc0 = mad(q_priv[k], CONVERT_KV_ACC4(l_k[j][k]), dot_acc0);
-                dot_acc1 = mad(q_priv[k], CONVERT_KV_ACC4(l_k[j+1][k]), dot_acc1);
+#elif N_SPLIT > 1
+        // N_SPLIT>1 fallback (no shuffle): 3-phase local-memory reduction.
+        // Phase 1 — partial dots for all BLOCK_N tokens.
+        for (int j = 0; j < BLOCK_N; ++j) {
+            ACC_TYPE4 dot_acc = (ACC_TYPE4)(0.0f);
+            FA_UNROLL
+            for (int k = 0; k < SPLIT_DK_VEC; k++) {
+                dot_acc = mad(q_priv[k], CONVERT_KV_ACC4(l_k[j][dk_off + k]), dot_acc);
            }
-            ACC_TYPE score0 = (dot_acc0.s0 + dot_acc0.s1 + dot_acc0.s2 + dot_acc0.s3) * scale;
-            ACC_TYPE score1 = (dot_acc1.s0 + dot_acc1.s1 + dot_acc1.s2 + dot_acc1.s3) * scale;
-
-            if (is_causal) {
-                if (k_row0 > (n_kv - n_q + my_query_row)) score0 = -INFINITY;
-                if (k_row1 > (n_kv - n_q + my_query_row)) score1 = -INFINITY;
-            }
-
-            if (k_row0 >= n_kv) score0 = -INFINITY;
-            if (k_row1 >= n_kv) score1 = -INFINITY;
-
-            if (mask_base != NULL) {
-                const global MASK_DATA_TYPE* mask_ptr = (const global MASK_DATA_TYPE*)(mask_base + my_query_row * mask_nb1);
-                if (k_row0 < n_kv) score0 += slope * (ACC_TYPE)mask_ptr[k_row0];
-                if (k_row1 < n_kv) score1 += slope * (ACC_TYPE)mask_ptr[k_row1];
-            }
-
-            if (logit_softcap > 0.0f) {
-                score0 = logit_softcap * tanh(score0 / logit_softcap);
-                score1 = logit_softcap * tanh(score1 / logit_softcap);
-            }
-
-            const ACC_TYPE m_new = max(m_i, max(score0, score1));
-            const ACC_TYPE p0 = exp(score0 - m_new);
-            const ACC_TYPE p1 = exp(score1 - m_new);
-            const ACC_TYPE scale_prev = exp(m_i - m_new);
-
-            #pragma unroll
-            for (int i = 0; i < DV_VEC; ++i) {
-                o_acc[i] = o_acc[i] * scale_prev + p0 * CONVERT_KV_ACC4(l_v[j][i]) + p1 * CONVERT_KV_ACC4(l_v[j+1][i]);
-            }
-            l_i = l_i * scale_prev + p0 + p1;
-            m_i = m_new;
+            local_partial[j][tid] =
+                dot_acc.s0 + dot_acc.s1 + dot_acc.s2 + dot_acc.s3;
        }
+        barrier(CLK_LOCAL_MEM_FENCE);  // 1 barrier: partial dots visible
+
+        // Phase 2 — split_idx==0 reduces partial sums and computes block softmax.
+        if (split_idx == 0) {
+            if (query_valid) {
+                ACC_TYPE m_new = m_i;
+                for (int j = 0; j < BLOCK_N; ++j) {
+                    const int k_row = k_start + j;
+                    ACC_TYPE score = 0.0f;
+                    FA_UNROLL
+                    for (int s = 0; s < N_SPLIT; s++) {
+                        score += local_partial[j][q_lane * N_SPLIT + s];
+                    }
+                    score *= scale;
+
+                    if (is_causal && k_row > (n_kv - n_q + my_query_row)) score = FA_M_INIT;
+                    if (k_row >= n_kv) score = FA_M_INIT;
+
+                    if (mask_base != NULL && blk_cur != 2) {
+                        if (use_kv_pad && mask_pad_base != NULL) {
+                            const global MASK_DATA_TYPE* mask_ptr =
+                                (const global MASK_DATA_TYPE*)(mask_pad_base + my_query_row * mask_pad_nb1);
+                            score += slope * (ACC_TYPE)mask_ptr[j];
+                        } else {
+                            const global MASK_DATA_TYPE* mask_ptr =
+                                (const global MASK_DATA_TYPE*)(mask_base + my_query_row * mask_nb1);
+                            if (k_row < n_kv) score += slope * (ACC_TYPE)mask_ptr[k_row];
+                        }
+                    }
+
+                    if (logit_softcap > 0.0f) {
+                        score = logit_softcap * tanh(score / logit_softcap);
+                    }
+
+                    m_new = max(m_new, score);
+                    local_p[q_lane][j] = score;
+                }
+
+                const ACC_TYPE m_exp = (m_new == FA_M_INIT) ? 0.0f : m_new;
+                const ACC_TYPE sp = native_exp(m_i - m_exp);
+                ACC_TYPE l_new = l_i * sp;
+                for (int j = 0; j < BLOCK_N; ++j) {
+                    const ACC_TYPE p = native_exp(local_p[q_lane][j] - m_exp);
+                    local_p[q_lane][j] = p;
+                    l_new += p;
+                }
+                local_softmax_scale[q_lane] = sp;
+                l_i = l_new;
+                m_i = m_new;
+            } else {
+                local_softmax_scale[q_lane] = 1.0f;
+                for (int j = 0; j < BLOCK_N; ++j) local_p[q_lane][j] = 0.0f;
+            }
+        }
+        barrier(CLK_LOCAL_MEM_FENCE);
+
+        // Phase 3 — V accumulate using broadcast probabilities.
+        {
+            const ACC_TYPE sp_block = local_softmax_scale[q_lane];
+            const int dv_off = split_idx * SPLIT_DV_VEC;
+            FA_UNROLL
+            for (int i = 0; i < SPLIT_DV_VEC; ++i) {
+                o_acc[i] *= sp_block;
+            }
+            for (int j = 0; j < BLOCK_N; ++j) {
+                const ACC_TYPE p = local_p[q_lane][j];
+                FA_UNROLL
+                for (int i = 0; i < SPLIT_DV_VEC; ++i) {
+                    o_acc[i] = mad(p, CONVERT_KV_ACC4(l_v[j][dv_off + i]), o_acc[i]);
+                }
+            }
+        }
+#else
+        // N_SPLIT==1: j+=4 unroll. Requires BLOCK_N % 4 == 0.
+        if (query_valid) {
+            for (int j = 0; j < BLOCK_N; j += 4) {
+                const int k_row0 = k_start + j;
+                const int k_row1 = k_start + j + 1;
+                const int k_row2 = k_start + j + 2;
+                const int k_row3 = k_start + j + 3;
+
+                ACC_TYPE4 dot_acc0 = (ACC_TYPE4)(0.0f);
+                ACC_TYPE4 dot_acc1 = (ACC_TYPE4)(0.0f);
+                ACC_TYPE4 dot_acc2 = (ACC_TYPE4)(0.0f);
+                ACC_TYPE4 dot_acc3 = (ACC_TYPE4)(0.0f);
+                FA_UNROLL
+                for (int k = 0; k < DK_VEC; k++) {
+                    const ACC_TYPE4 qk = q_priv[k];
+                    dot_acc0 = mad(qk, CONVERT_KV_ACC4(l_k[j][k]),   dot_acc0);
+                    dot_acc1 = mad(qk, CONVERT_KV_ACC4(l_k[j+1][k]), dot_acc1);
+                    dot_acc2 = mad(qk, CONVERT_KV_ACC4(l_k[j+2][k]), dot_acc2);
+                    dot_acc3 = mad(qk, CONVERT_KV_ACC4(l_k[j+3][k]), dot_acc3);
+                }
+                ACC_TYPE s0 = (dot_acc0.s0 + dot_acc0.s1 + dot_acc0.s2 + dot_acc0.s3) * scale;
+                ACC_TYPE s1 = (dot_acc1.s0 + dot_acc1.s1 + dot_acc1.s2 + dot_acc1.s3) * scale;
+                ACC_TYPE s2 = (dot_acc2.s0 + dot_acc2.s1 + dot_acc2.s2 + dot_acc2.s3) * scale;
+                ACC_TYPE s3 = (dot_acc3.s0 + dot_acc3.s1 + dot_acc3.s2 + dot_acc3.s3) * scale;
+
+                if (is_causal) {
+                    const int causal_limit = n_kv - n_q + my_query_row;
+                    if (k_row0 > causal_limit) s0 = FA_M_INIT;
+                    if (k_row1 > causal_limit) s1 = FA_M_INIT;
+                    if (k_row2 > causal_limit) s2 = FA_M_INIT;
+                    if (k_row3 > causal_limit) s3 = FA_M_INIT;
+                }
+                if (k_row0 >= n_kv) s0 = FA_M_INIT;
+                if (k_row1 >= n_kv) s1 = FA_M_INIT;
+                if (k_row2 >= n_kv) s2 = FA_M_INIT;
+                if (k_row3 >= n_kv) s3 = FA_M_INIT;
+
+                if (mask_base != NULL && blk_cur != 2) {
+                    if (use_kv_pad && mask_pad_base != NULL) {
+                        const global MASK_DATA_TYPE* mask_ptr = (const global MASK_DATA_TYPE*)(mask_pad_base + my_query_row * mask_pad_nb1);
+                        s0 += slope * (ACC_TYPE)mask_ptr[j];
+                        s1 += slope * (ACC_TYPE)mask_ptr[j + 1];
+                        s2 += slope * (ACC_TYPE)mask_ptr[j + 2];
+                        s3 += slope * (ACC_TYPE)mask_ptr[j + 3];
+                    } else {
+                        const global MASK_DATA_TYPE* mask_ptr = (const global MASK_DATA_TYPE*)(mask_base + my_query_row * mask_nb1);
+                        if (k_row0 < n_kv) s0 += slope * (ACC_TYPE)mask_ptr[k_row0];
+                        if (k_row1 < n_kv) s1 += slope * (ACC_TYPE)mask_ptr[k_row1];
+                        if (k_row2 < n_kv) s2 += slope * (ACC_TYPE)mask_ptr[k_row2];
+                        if (k_row3 < n_kv) s3 += slope * (ACC_TYPE)mask_ptr[k_row3];
+                    }
+                }
+
+                if (logit_softcap > 0.0f) {
+                    s0 = logit_softcap * tanh(s0 / logit_softcap);
+                    s1 = logit_softcap * tanh(s1 / logit_softcap);
+                    s2 = logit_softcap * tanh(s2 / logit_softcap);
+                    s3 = logit_softcap * tanh(s3 / logit_softcap);
+                }
+
+                const ACC_TYPE m_new      = max(m_i, max(max(s0, s1), max(s2, s3)));
+                // Whole tile masked (m_new == FA_M_INIT): force the exp() args
+                // far negative so the tile contributes 0, not exp(0)=1.
+                const ACC_TYPE m_exp      = (m_new == FA_M_INIT) ? 0.0f : m_new;
+                const ACC_TYPE scale_prev = native_exp(m_i - m_exp);
+                const ACC_TYPE p0         = native_exp(s0 - m_exp);
+                const ACC_TYPE p1         = native_exp(s1 - m_exp);
+                const ACC_TYPE p2         = native_exp(s2 - m_exp);
+                const ACC_TYPE p3         = native_exp(s3 - m_exp);
+
+                FA_UNROLL
+                for (int i = 0; i < DV_VEC; ++i) {
+                    o_acc[i] = mad(p3, CONVERT_KV_ACC4(l_v[j+3][i]),
+                               mad(p2, CONVERT_KV_ACC4(l_v[j+2][i]),
+                               mad(p1, CONVERT_KV_ACC4(l_v[j+1][i]),
+                               mad(p0, CONVERT_KV_ACC4(l_v[j][i]),
+                               o_acc[i] * scale_prev))));
+                }
+                l_i = l_i * scale_prev + p0 + p1 + p2 + p3;
+                m_i = m_new;
+            }
+        }
+#endif
+        // End of tile: every thread must finish reading l_k/l_v before the
+        // next iteration's load overwrites them (WAR hazard on local memory).
+        barrier(CLK_LOCAL_MEM_FENCE);
    }

-    if (my_query_row < n_q) {
+    // Write output.
+#if N_SPLIT > 1 && defined(HAS_SUBGROUP_SHUFFLE)
+    if (query_valid) {
+        ACC_TYPE sinks_sp = 1.0f;
+        if (sinks_void != NULL) {
+            const global ACC_TYPE* sinks_ptr = (const global ACC_TYPE*)((const global char*)sinks_void + sinks_offset);
+            const ACC_TYPE m_sink  = sinks_ptr[head_idx];
+            const ACC_TYPE m_final = max(m_i, m_sink);
+            sinks_sp = exp(m_i - m_final);
+            l_i = l_i * sinks_sp + exp(m_sink - m_final);
+            m_i = m_final;
+        }
+        const ACC_TYPE l_inv = (l_i > 0.0f) ? (1.0f / l_i) : 0.0f;
+        const int dv_off = split_idx * SPLIT_DV_VEC;
+        const ulong o_row_offset = batch_idx * o_nb3 + my_query_row * o_nb2 + head_idx * o_nb1;
+        global O_DATA_TYPE4 *o_row = (global O_DATA_TYPE4 *)(o_base + o_row_offset);
+        if (l_inv > 0.0f) {
+            FA_UNROLL
+            for (int i = 0; i < SPLIT_DV_VEC; ++i) {
+                o_row[dv_off + i] = CONVERT_O_DATA4(o_acc[i] * sinks_sp * l_inv);
+            }
+        } else {
+            FA_UNROLL
+            for (int i = 0; i < SPLIT_DV_VEC; ++i) {
+                o_row[dv_off + i] = (O_DATA_TYPE4)(0.0f);
+            }
+        }
+    }
+#elif N_SPLIT > 1
+    if (split_idx == 0) {
+        ACC_TYPE sinks_sp = 1.0f;
+        if (query_valid && sinks_void != NULL) {
+            const global ACC_TYPE* sinks_ptr = (const global ACC_TYPE*)((const global char*)sinks_void + sinks_offset);
+            const ACC_TYPE m_sink = sinks_ptr[head_idx];
+            const ACC_TYPE m_final = max(m_i, m_sink);
+            sinks_sp = exp(m_i - m_final);
+            l_i = l_i * sinks_sp + exp(m_sink - m_final);
+            m_i = m_final;
+        }
+        local_softmax_scale[q_lane] = sinks_sp;
+        local_l_inv[q_lane] = (query_valid && l_i > 0.0f) ? (1.0f / l_i) : 0.0f;
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if (query_valid) {
+        const ACC_TYPE sinks_sp = local_softmax_scale[q_lane];
+        const ACC_TYPE l_inv    = local_l_inv[q_lane];
+        const int dv_off = split_idx * SPLIT_DV_VEC;
+        const ulong o_row_offset = batch_idx * o_nb3 + my_query_row * o_nb2 + head_idx * o_nb1;
+        global O_DATA_TYPE4 *o_row = (global O_DATA_TYPE4 *)(o_base + o_row_offset);
+        if (l_inv > 0.0f) {
+            FA_UNROLL
+            for (int i = 0; i < SPLIT_DV_VEC; ++i) {
+                o_row[dv_off + i] = CONVERT_O_DATA4(o_acc[i] * sinks_sp * l_inv);
+            }
+        } else {
+            FA_UNROLL
+            for (int i = 0; i < SPLIT_DV_VEC; ++i) {
+                o_row[dv_off + i] = (O_DATA_TYPE4)(0.0f);
+            }
+        }
+    }
+#else
+    if (query_valid) {
        if (sinks_void != NULL) {
            const global ACC_TYPE* sinks_ptr = (const global ACC_TYPE*)((const global char*)sinks_void + sinks_offset);
            const ACC_TYPE m_sink = sinks_ptr[head_idx];
            const ACC_TYPE m_final = max(m_i, m_sink);

            const ACC_TYPE scale_o = exp(m_i - m_final);
-            #pragma unroll
+            FA_UNROLL
            for (int i = 0; i < DV_VEC; ++i) {
                o_acc[i] *= scale_o;
            }
@@ -194,17 +543,18 @@ __kernel void flash_attn_f32_f16(
        global O_DATA_TYPE4 *o_row = (global O_DATA_TYPE4 *)(o_base + o_row_offset);
        if (l_i > 0.0f) {
            const ACC_TYPE l_inv = 1.0f / l_i;
-            #pragma unroll
+            FA_UNROLL
            for (int i = 0; i < DV_VEC; ++i) {
                o_row[i] = CONVERT_O_DATA4(o_acc[i] * l_inv);
            }
        } else {
-            #pragma unroll
+            FA_UNROLL
            for (int i = 0; i < DV_VEC; ++i) {
                o_row[i] = (O_DATA_TYPE4)(0.0f);
            }
        }
    }
+#endif
 }

 __kernel void flash_attn_f32_f16_q1(
@@ -258,13 +608,16 @@ __kernel void flash_attn_f32_f16_q1(
        mask_base = (const global char*)mask_void + mask_offset + mask_batch_idx * mask_nb3 + mask_head_idx * mask_nb2;
    }

-    ACC_TYPE4 q_priv[DK_VEC];
+    // Q is uniform across WG threads (n_q=1). Share via local memory to
+    // avoid per-thread q_priv[DK_VEC] dynamic-indexed private array that
+    // spills to DDR on Adreno.
+    __local ACC_TYPE4 q_shared[DK_VEC];
    const ulong q_row_offset = batch_idx * q_nb3 + head_idx * q_nb2;
    const global Q_DATA_TYPE4* q_ptr = (const global Q_DATA_TYPE4*)(q_base + q_row_offset);
-    #pragma unroll
-    for (int i = 0; i < DK_VEC; ++i) {
-        q_priv[i] = CONVERT_Q_ACC4(q_ptr[i]);
+    for (int i = tid; i < DK_VEC; i += Q1_WG_SIZE) {
+        q_shared[i] = CONVERT_Q_ACC4(q_ptr[i]);
    }
+    barrier(CLK_LOCAL_MEM_FENCE);

    float slope = get_alibi_slope(max_bias, head_idx, n_head_log2, m0, m1);

@@ -273,14 +626,14 @@ __kernel void flash_attn_f32_f16_q1(
        sinks_ptr = (const global ACC_TYPE*)((const global char*)sinks_void + sinks_offset);
    }

-    ACC_TYPE m_i = (sinks_ptr != NULL) ? sinks_ptr[head_idx] : -INFINITY;
+    ACC_TYPE m_i = (sinks_ptr != NULL) ? sinks_ptr[head_idx] : FA_M_INIT;
    for (int k_idx = tid; k_idx < n_kv; k_idx += Q1_WG_SIZE) {
        const ulong k_row_offset = batch_idx * k_nb3 + head_kv_idx * k_nb2 + k_idx * k_nb1;
        const global KV_DATA_TYPE4* k_ptr = (const global KV_DATA_TYPE4*)(k_base + k_row_offset);
        ACC_TYPE4 dot_acc = (ACC_TYPE4)(0.0f);
-        #pragma unroll
+        FA_UNROLL
        for (int k = 0; k < DK_VEC; k++) {
-            dot_acc = mad(q_priv[k], CONVERT_KV_ACC4(k_ptr[k]), dot_acc);
+            dot_acc = mad(q_shared[k], CONVERT_KV_ACC4(k_ptr[k]), dot_acc);
        }
        ACC_TYPE score = (dot_acc.s0 + dot_acc.s1 + dot_acc.s2 + dot_acc.s3) * scale;
        if (mask_base != NULL) {
@@ -296,7 +649,7 @@ __kernel void flash_attn_f32_f16_q1(
    __local ACC_TYPE local_m[Q1_WG_SIZE];
    local_m[tid] = m_i;
    barrier(CLK_LOCAL_MEM_FENCE);
-    #pragma unroll
+    FA_UNROLL
    for (int s = Q1_WG_SIZE / 2; s > 0; s >>= 1) {
        if (tid < s) local_m[tid] = max(local_m[tid], local_m[tid + s]);
        barrier(CLK_LOCAL_MEM_FENCE);
@@ -304,7 +657,7 @@ __kernel void flash_attn_f32_f16_q1(
    const ACC_TYPE m_final = local_m[0];

    ACC_TYPE4 o_acc[DV_VEC];
-    #pragma unroll
+    FA_UNROLL
    for (int i = 0; i < DV_VEC; ++i) o_acc[i] = (ACC_TYPE4)(0.0f);
    ACC_TYPE l_i = 0.0f;

@@ -314,9 +667,9 @@ __kernel void flash_attn_f32_f16_q1(
        const global KV_DATA_TYPE4* k_ptr = (const global KV_DATA_TYPE4*)(k_base + k_row_offset);
        const global KV_DATA_TYPE4* v_ptr = (const global KV_DATA_TYPE4*)(v_base + v_row_offset);
        ACC_TYPE4 dot_acc = (ACC_TYPE4)(0.0f);
-        #pragma unroll
+        FA_UNROLL
        for (int k = 0; k < DK_VEC; k++) {
-            dot_acc = mad(q_priv[k], CONVERT_KV_ACC4(k_ptr[k]), dot_acc);
+            dot_acc = mad(q_shared[k], CONVERT_KV_ACC4(k_ptr[k]), dot_acc);
        }
        ACC_TYPE score = (dot_acc.s0 + dot_acc.s1 + dot_acc.s2 + dot_acc.s3) * scale;
        if (mask_base != NULL) {
@@ -328,7 +681,7 @@ __kernel void flash_attn_f32_f16_q1(
        }
        const ACC_TYPE p = exp(score - m_final);
        l_i += p;
-        #pragma unroll
+        FA_UNROLL
        for (int i = 0; i < DV_VEC; i++) {
            o_acc[i] = mad(p, CONVERT_KV_ACC4(v_ptr[i]), o_acc[i]);
        }
@@ -338,7 +691,7 @@ __kernel void flash_attn_f32_f16_q1(
    __local ACC_TYPE4 local_o_comp[Q1_WG_SIZE];
    local_l[tid] = l_i;
    barrier(CLK_LOCAL_MEM_FENCE);
-    #pragma unroll
+    FA_UNROLL
    for (int s = Q1_WG_SIZE / 2; s > 0; s >>= 1) {
        if (tid < s) local_l[tid] += local_l[tid + s];
        barrier(CLK_LOCAL_MEM_FENCE);
@@ -357,7 +710,7 @@ __kernel void flash_attn_f32_f16_q1(
        for (int i = 0; i < DV_VEC; i++) {
            local_o_comp[tid] = o_acc[i];
            barrier(CLK_LOCAL_MEM_FENCE);
-            #pragma unroll
+            FA_UNROLL
            for (int s = Q1_WG_SIZE / 2; s > 0; s >>= 1) {
                if (tid < s) local_o_comp[tid] += local_o_comp[tid + s];
                barrier(CLK_LOCAL_MEM_FENCE);
@@ -367,7 +720,257 @@ __kernel void flash_attn_f32_f16_q1(
            }
        }
    } else if (tid == 0) {
-        #pragma unroll
+        FA_UNROLL
        for (int i = 0; i < DV_VEC; ++i) o_row[i] = (O_DATA_TYPE4)(0.0f);
    }
 }
+
+// Flash-decoding split pass. gid(2) = q_idx * n_splits + split_idx.
+// Partial record per split: [m, l, O[DV]]. Merge kernel applies sink + norm.
+#define FA_PARTIAL_FLOATS (2 + DV)
+
+// Flash-decoding pass 1: one work-group reduces the KV rows of one split,
+// [split_idx * kv_per_split, min(kv_start + kv_per_split, n_kv)), for a single
+// (batch, head, q_idx) and writes a partial record [m, l, O[DV]] to partial_void:
+// m = split-local score maximum, l = sum of exp(score - m), O = V rows weighted by
+// exp(score - m) (not normalized). An empty split stores only m = FA_M_INIT, l = 0.
+// Global dim 1 is batch * n_head + head, dim 2 is q_idx * n_splits + split_idx.
+// Assumes the local size along dim 0 equals Q1_WG_SIZE (strides/reductions use it).
+__kernel void flash_attn_f32_f16_q1_split(
+    const global void * q_void, ulong q_offset,
+    const global void * k_void, ulong k_offset,
+    const global void * v_void, ulong v_offset,
+    const float scale,
+    const int n_q, const int n_kv, const int n_head,
+    const ulong q_nb1, const ulong q_nb2, const ulong q_nb3,
+    const ulong k_nb1, const ulong k_nb2, const ulong k_nb3,
+    const ulong v_nb1, const ulong v_nb2, const ulong v_nb3,
+    const float max_bias, const float m0, const float m1,
+    const int n_head_log2,
+    const float logit_softcap,
+    const int n_head_kv,
+    const global void * mask_void,
+    const ulong mask_offset,
+    const ulong mask_nb1, const ulong mask_nb2, const ulong mask_nb3,
+    const int mask_ne2, const int mask_ne3,
+    global float * partial_void,
+    const int n_splits,
+    const int kv_per_split
+) {
+    const int tid              = get_local_id(0);
+    const int head_batch_idx   = get_global_id(1);
+    const int split_q_idx      = get_global_id(2);
+    const int split_idx        = split_q_idx % n_splits;
+    const int q_idx            = split_q_idx / n_splits;
+    const int batch_idx        = head_batch_idx / n_head;
+    const int head_idx         = head_batch_idx % n_head;
+    const int gqa_ratio        = n_head / n_head_kv;  // query heads per KV head
+    const int head_kv_idx      = head_idx / gqa_ratio;
+
+    const int kv_start = split_idx * kv_per_split;
+    const int kv_end   = min(kv_start + kv_per_split, n_kv);
+
+    const ulong record_stride = (ulong) FA_PARTIAL_FLOATS;
+    const ulong record_idx    = ((((ulong) batch_idx * n_head + head_idx) * n_q + q_idx)
+                                 * n_splits + split_idx);
+    global float  * rec       = partial_void + record_idx * record_stride;
+    global float  * rec_o     = rec + 2;  // not guaranteed 16-byte aligned -> vstore4 below
+
+    if (kv_start >= kv_end) {
+        // Empty split: leave sentinel partial for merge (the O part is not written).
+        if (tid == 0) {
+            rec[0] = FA_M_INIT;
+            rec[1] = 0.0f;
+        }
+        return;
+    }
+
+    const global char * q_base = (const global char *) q_void + q_offset;
+    const global char * k_base = (const global char *) k_void + k_offset;
+    const global char * v_base = (const global char *) v_void + v_offset;
+
+    const global char * mask_base = NULL;
+    if (mask_void != NULL) {
+        const int mask_head_idx  = head_idx  % mask_ne2;
+        const int mask_batch_idx = batch_idx % mask_ne3;
+        mask_base = (const global char *) mask_void + mask_offset +
+                    mask_batch_idx * mask_nb3 + mask_head_idx * mask_nb2 +
+                    (ulong) q_idx * mask_nb1;
+    }
+
+    // Share Q via local memory (n_q=1 per split -> uniform across WG).
+    __local ACC_TYPE4 q_shared[DK_VEC];
+    const ulong q_row_offset = batch_idx * q_nb3 + head_idx * q_nb2 + (ulong) q_idx * q_nb1;
+    const global Q_DATA_TYPE4 * q_ptr = (const global Q_DATA_TYPE4 *) (q_base + q_row_offset);
+    for (int i = tid; i < DK_VEC; i += Q1_WG_SIZE) {
+        q_shared[i] = CONVERT_Q_ACC4(q_ptr[i]);
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    const float slope = get_alibi_slope(max_bias, head_idx, n_head_log2, m0, m1);
+
+    // Pass 1a — split-local max.
+    ACC_TYPE m_i = FA_M_INIT;
+    for (int k_idx = kv_start + tid; k_idx < kv_end; k_idx += Q1_WG_SIZE) {
+        const ulong k_row_offset = batch_idx * k_nb3 + head_kv_idx * k_nb2 + k_idx * k_nb1;
+        const global KV_DATA_TYPE4 * k_ptr = (const global KV_DATA_TYPE4 *) (k_base + k_row_offset);
+        ACC_TYPE4 dot_acc = (ACC_TYPE4)(0.0f);
+        FA_UNROLL
+        for (int k = 0; k < DK_VEC; ++k) {
+            dot_acc = mad(q_shared[k], CONVERT_KV_ACC4(k_ptr[k]), dot_acc);
+        }
+        ACC_TYPE score = (dot_acc.s0 + dot_acc.s1 + dot_acc.s2 + dot_acc.s3) * scale;
+        if (mask_base != NULL) {
+            const global MASK_DATA_TYPE * mask_ptr = (const global MASK_DATA_TYPE *) (mask_base);
+            score += slope * (ACC_TYPE) mask_ptr[k_idx];
+        }
+        if (logit_softcap > 0.0f) {
+            score = logit_softcap * tanh(score / logit_softcap);
+        }
+        m_i = max(m_i, score);
+    }
+
+    __local ACC_TYPE local_m[Q1_WG_SIZE];
+    local_m[tid] = m_i;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    FA_UNROLL
+    for (int s = Q1_WG_SIZE / 2; s > 0; s >>= 1) {
+        if (tid < s) local_m[tid] = max(local_m[tid], local_m[tid + s]);
+        barrier(CLK_LOCAL_MEM_FENCE);
+    }
+    const ACC_TYPE m_c = local_m[0];  // maximum over the whole split
+
+    // Pass 1b — softmax-weighted V accumulate (scores are recomputed, not cached).
+    ACC_TYPE4 o_acc[DV_VEC];
+    FA_UNROLL
+    for (int i = 0; i < DV_VEC; ++i) o_acc[i] = (ACC_TYPE4)(0.0f);
+    ACC_TYPE l_i = 0.0f;
+
+    for (int k_idx = kv_start + tid; k_idx < kv_end; k_idx += Q1_WG_SIZE) {
+        const ulong k_row_offset = batch_idx * k_nb3 + head_kv_idx * k_nb2 + k_idx * k_nb1;
+        const ulong v_row_offset = batch_idx * v_nb3 + head_kv_idx * v_nb2 + k_idx * v_nb1;
+        const global KV_DATA_TYPE4 * k_ptr = (const global KV_DATA_TYPE4 *) (k_base + k_row_offset);
+        const global KV_DATA_TYPE4 * v_ptr = (const global KV_DATA_TYPE4 *) (v_base + v_row_offset);
+        ACC_TYPE4 dot_acc = (ACC_TYPE4)(0.0f);
+        FA_UNROLL
+        for (int k = 0; k < DK_VEC; ++k) {
+            dot_acc = mad(q_shared[k], CONVERT_KV_ACC4(k_ptr[k]), dot_acc);
+        }
+        ACC_TYPE score = (dot_acc.s0 + dot_acc.s1 + dot_acc.s2 + dot_acc.s3) * scale;
+        if (mask_base != NULL) {
+            const global MASK_DATA_TYPE * mask_ptr = (const global MASK_DATA_TYPE *) (mask_base);
+            score += slope * (ACC_TYPE) mask_ptr[k_idx];
+        }
+        if (logit_softcap > 0.0f) {
+            score = logit_softcap * tanh(score / logit_softcap);
+        }
+        const ACC_TYPE p = exp(score - m_c);
+        l_i += p;
+        FA_UNROLL
+        for (int i = 0; i < DV_VEC; ++i) {
+            o_acc[i] = mad(p, CONVERT_KV_ACC4(v_ptr[i]), o_acc[i]);
+        }
+    }
+
+    __local ACC_TYPE  local_l[Q1_WG_SIZE];
+    __local ACC_TYPE4 local_o[Q1_WG_SIZE];
+    local_l[tid] = l_i;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    FA_UNROLL
+    for (int s = Q1_WG_SIZE / 2; s > 0; s >>= 1) {
+        if (tid < s) local_l[tid] += local_l[tid + s];
+        barrier(CLK_LOCAL_MEM_FENCE);
+    }
+    const ACC_TYPE l_c = local_l[0];
+
+    if (tid == 0) {
+        rec[0] = (float) m_c;
+        rec[1] = (float) l_c;
+    }
+    for (int i = 0; i < DV_VEC; ++i) {  // reduce O one float4 column at a time
+        local_o[tid] = o_acc[i];
+        barrier(CLK_LOCAL_MEM_FENCE);
+        FA_UNROLL
+        for (int s = Q1_WG_SIZE / 2; s > 0; s >>= 1) {
+            if (tid < s) local_o[tid] += local_o[tid + s];
+            barrier(CLK_LOCAL_MEM_FENCE);
+        }
+        if (tid == 0) {
+            vstore4(local_o[0], i, rec_o);  // element-aligned store of column i
+        }
+    }
+}
+
+// FD Pass 2: merge the n_splits partial records [m, l, O[DV]] of one (batch, head,
+// q_idx) into the final output row. Splits whose stored maximum is <= FA_M_INIT
+// (the empty-split sentinel) are skipped explicitly. An optional sink contributes
+// exp(m_sink - m) to the denominator only. Each work-item writes o_row[lane].
+__kernel void flash_attn_f32_merge(
+    const global float * partial_void,
+    global void * o_void, const ulong o_offset,
+    const int n_head, const int n_splits,
+    const ulong o_nb1, const ulong o_nb2, const ulong o_nb3,
+    const global void * sinks_void, const ulong sinks_offset,
+    const int n_q
+) {
+    const int lane           = get_local_id(0);  // 0..DV_VEC-1
+    const int head_batch_idx = get_global_id(1);
+    const int q_idx          = get_global_id(2);
+    const int batch_idx      = head_batch_idx / n_head;
+    const int head_idx       = head_batch_idx % n_head;
+
+    const ulong record_stride = (ulong) FA_PARTIAL_FLOATS;
+    const ulong record_idx_0  = (((ulong) batch_idx * n_head + head_idx) * n_q + q_idx) * n_splits;
+    const global float * rec0 = partial_void + record_idx_0 * record_stride;
+
+    __local ACC_TYPE m_final_shared;
+    __local ACC_TYPE l_final_shared;
+    if (lane == 0) {  // one work-item computes the global max and denominator
+        ACC_TYPE m = FA_M_INIT;
+        for (int c = 0; c < n_splits; ++c) {
+            const ACC_TYPE m_c = rec0[c * record_stride + 0];
+            m = max(m, m_c);
+        }
+        ACC_TYPE m_sink = 0.0f;
+        bool has_sink = false;
+        if (sinks_void != NULL) {
+            const global ACC_TYPE * sinks_ptr =
+                (const global ACC_TYPE *) ((const global char *) sinks_void + sinks_offset);
+            m_sink = sinks_ptr[head_idx];
+            has_sink = true;
+            m = max(m, m_sink);
+        }
+        ACC_TYPE l = 0.0f;
+        for (int c = 0; c < n_splits; ++c) {
+            const ACC_TYPE m_c = rec0[c * record_stride + 0];
+            const ACC_TYPE l_c = rec0[c * record_stride + 1];
+            if (m_c > FA_M_INIT) {  // skip sentinel records
+                l += l_c * exp(m_c - m);
+            }
+        }
+        if (has_sink) {
+            l += exp(m_sink - m);
+        }
+        m_final_shared = m;
+        l_final_shared = l;
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    const ACC_TYPE m_final = m_final_shared;
+    const ACC_TYPE l_final = l_final_shared;
+    const ACC_TYPE l_inv   = (l_final > 0.0f) ? (1.0f / l_final) : 0.0f;
+
+    ACC_TYPE4 o = (ACC_TYPE4)(0.0f);
+    for (int c = 0; c < n_splits; ++c) {
+        const global float * rec_c   = rec0 + c * record_stride;
+        const ACC_TYPE       m_c     = rec_c[0];
+        if (m_c <= FA_M_INIT) continue;
+        const global float * rec_oc  = rec_c + 2;  // not guaranteed 16-byte aligned -> vload4
+        const ACC_TYPE scale_c = exp(m_c - m_final);
+        o = mad((ACC_TYPE4)(scale_c), vload4(lane, rec_oc), o);
+    }
+    o = o * l_inv;  // l_inv == 0 (nothing attended) yields a zero row
+
+    const ulong o_row_offset = (ulong) batch_idx * o_nb3 + (ulong) q_idx * o_nb2 + (ulong) head_idx * o_nb1;
+    global O_DATA_TYPE4 * o_row = (global O_DATA_TYPE4 *) ((global char *) o_void + o_offset + o_row_offset);
+    o_row[lane] = CONVERT_O_DATA4(o);
+}
@@ -0,0 +1,156 @@
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+
+// Copies the trailing partial KV tile (rows starting at n_kv rounded down to a
+// multiple of BLOCK_N) of K and V into dense BLOCK_N-row staging buffers, one
+// tile per (batch, kv head). Tile rows at or beyond n_kv are filled with zero
+// bytes. One work-item moves one row: k_nb1 bytes of K and v_nb1 bytes of V.
+__kernel void flash_attn_kv_pad_f16(
+    const global void * k_void, ulong k_offset,
+    const global void * v_void, ulong v_offset,
+    global void * k_pad_void,
+    global void * v_pad_void,
+    const int n_kv,
+    const int n_head_kv,
+    const int n_batch,
+    const ulong k_nb1, const ulong k_nb2, const ulong k_nb3,
+    const ulong v_nb1, const ulong v_nb2, const ulong v_nb3
+) {
+    const int tile_row = get_global_id(0);
+    const int kv_head  = get_global_id(1);
+    const int batch    = get_global_id(2);
+
+    if (tile_row >= BLOCK_N || kv_head >= n_head_kv || batch >= n_batch) {
+        return;
+    }
+
+    // Source row backing this tile row.
+    const int src_row = (n_kv - (n_kv % BLOCK_N)) + tile_row;
+
+    // Destination tiles are packed back to back, ordered by (batch, kv head).
+    const ulong tile_idx = (ulong) batch * (ulong) n_head_kv + (ulong) kv_head;
+    global char * k_dst = (global char *) k_pad_void + (tile_idx * ((ulong) BLOCK_N * k_nb1) + (ulong) tile_row * k_nb1);
+    global char * v_dst = (global char *) v_pad_void + (tile_idx * ((ulong) BLOCK_N * v_nb1) + (ulong) tile_row * v_nb1);
+
+    if (src_row >= n_kv) {
+        // No source row exists for this tile row: emit an all-zero row.
+        for (ulong b = 0; b < k_nb1; ++b) {
+            k_dst[b] = 0;
+        }
+        for (ulong b = 0; b < v_nb1; ++b) {
+            v_dst[b] = 0;
+        }
+        return;
+    }
+
+    const global char * k_row = (const global char *) k_void + k_offset +
+        ((ulong) batch * k_nb3 + (ulong) kv_head * k_nb2 + (ulong) src_row * k_nb1);
+    const global char * v_row = (const global char *) v_void + v_offset +
+        ((ulong) batch * v_nb3 + (ulong) kv_head * v_nb2 + (ulong) src_row * v_nb1);
+
+    // Plain byte copy, so the row layout is preserved verbatim.
+    for (ulong b = 0; b < k_nb1; ++b) {
+        k_dst[b] = k_row[b];
+    }
+    for (ulong b = 0; b < v_nb1; ++b) {
+        v_dst[b] = v_row[b];
+    }
+}
+
+// Writes the trailing partial KV tile of the f16 mask into a dense buffer laid
+// out as [mask slice][q row][BLOCK_N]. Columns at or beyond n_kv are written
+// as -INFINITY. One work-item produces one output element.
+__kernel void flash_attn_mask_pad_f16(
+    const global void * mask_void, ulong mask_offset,
+    global void * mask_pad_void,
+    const int n_q,
+    const int n_kv,
+    const ulong mask_nb1,
+    const ulong mask_nb2,
+    const ulong mask_nb3,
+    const int mask_ne2,
+    const int mask_ne3
+) {
+    const int tile_col = get_global_id(0);
+    const int q_row    = get_global_id(1);
+    const int slice    = get_global_id(2);
+
+    if (tile_col >= BLOCK_N || q_row >= n_q || slice >= mask_ne2 * mask_ne3) {
+        return;
+    }
+
+    // A slice enumerates (batch, head) pairs with head varying fastest.
+    const int head  = slice % mask_ne2;
+    const int batch = slice / mask_ne2;
+
+    // Source column backing this tile column.
+    const int src_col = (n_kv - (n_kv % BLOCK_N)) + tile_col;
+
+    const ulong dst_row = ((ulong) batch * (ulong) mask_ne2 + (ulong) head) * (ulong) n_q + (ulong) q_row;
+    global half * dst = (global half *) mask_pad_void + (dst_row * (ulong) BLOCK_N + (ulong) tile_col);
+
+    half value = (half) (-INFINITY);
+    if (src_col < n_kv) {
+        const global char * row_bytes = (const global char *) mask_void + mask_offset +
+            (ulong) batch * mask_nb3 +
+            (ulong) head * mask_nb2 +
+            (ulong) q_row * mask_nb1;
+        value = ((const global half *) row_bytes)[src_col];
+    }
+
+    *dst = value;
+}
+
+// Per-KV-tile mask class. 0=all -inf (skip tile), 1=mixed (apply mask),
+// 2=all zero, no -inf (skip mask lookup). Causal diagonal tiles are class 1.
+__kernel void flash_attn_blk_f16(
+    const global void * mask_void, ulong mask_offset,
+    global char * blk,
+    const int n_q,
+    const int n_kv,
+    const ulong mask_nb1,
+    const ulong mask_nb2,
+    const ulong mask_nb3,
+    const int mask_ne2,
+    const int mask_ne3
+) {
+    const int kv_blk = get_global_id(0);
+    const int q_blk  = get_global_id(1);
+    const int slice  = get_global_id(2);
+
+    const int n_q_blocks  = (n_q  + BLOCK_M - 1) / BLOCK_M;
+    const int n_kv_blocks = (n_kv + BLOCK_N - 1) / BLOCK_N;
+    if (kv_blk >= n_kv_blocks || q_blk >= n_q_blocks || slice >= mask_ne2 * mask_ne3) {
+        return;
+    }
+
+    // Tile extent, clipped at the right/bottom edges of the mask.
+    const int q_first = q_blk  * BLOCK_M;
+    const int k_first = kv_blk * BLOCK_N;
+    const int q_rows  = min(BLOCK_M, n_q  - q_first);
+    const int k_cols  = min(BLOCK_N, n_kv - k_first);
+
+    // A slice enumerates (batch, head) pairs with head varying fastest.
+    const int head  = slice % mask_ne2;
+    const int batch = slice / mask_ne2;
+    const global char * mask_base = (const global char *) mask_void + mask_offset +
+        (ulong) batch * mask_nb3 +
+        (ulong) head * mask_nb2;
+
+    // Anything at or below the most negative finite half counts as masked (covers -inf).
+    const half masked_limit = (half) (-65504.0f);
+    char saw_masked   = 0;
+    char saw_unmasked = 0;
+    char saw_nonzero  = 0;
+
+    for (int q = q_first; q < q_first + q_rows; ++q) {
+        const global half * row = (const global half *) (mask_base + (ulong) q * mask_nb1) + k_first;
+        for (int k = 0; k < k_cols; ++k) {
+            const half v = row[k];
+            if (v <= masked_limit) {
+                saw_masked = 1;
+                continue;
+            }
+            saw_unmasked = 1;
+            if (v != (half) 0.0f) {
+                saw_nonzero = 1;
+            }
+        }
+        // Once a tile holds both kinds it is class 1; remaining rows cannot change that.
+        if (saw_masked && saw_unmasked) {
+            break;
+        }
+    }
+
+    // Default to class 2 (all zero), then downgrade based on what was seen.
+    char tile_class = 2;
+    if (!saw_unmasked) {
+        tile_class = 0;
+    } else if (saw_masked || saw_nonzero) {
+        tile_class = 1;
+    }
+
+    // Output is indexed [slice][q block][kv block].
+    const ulong out_idx = ((ulong) slice * (ulong) n_q_blocks + (ulong) q_blk) * (ulong) n_kv_blocks + (ulong) kv_blk;
+    blk[out_idx] = tile_class;
+}
@@ -158,6 +158,239 @@ kernel void kernel_set_rows_f32_i32(
    }
 }

+// f32 -> q8_0 quantize set_rows. Block = half d + char qs[32].
+#define QK8_0 32
+
+// Quantizes QK8_0 consecutive floats at x into one q8_0 block: the scale
+// max|x| / 127 is stored as fp16 at d_out and the rounded, scaled values are
+// stored as int8 in qs[0..QK8_0).
+inline void quantize_q8_0_block(global float * x, global char * qs, global half * d_out) {
+    float abs_max = 0.0f;
+    for (int k = 0; k < QK8_0; ++k) {
+        abs_max = fmax(abs_max, fabs(x[k]));
+    }
+
+    const float scale = abs_max / 127.0f;
+
+    // A zero scale gets a zero multiplier, so every quant becomes 0 and no division by zero occurs.
+    float inv_scale = 0.0f;
+    if (scale != 0.0f) {
+        inv_scale = 127.0f / abs_max;
+    }
+
+    vstore_half(scale, 0, d_out);
+
+    for (int k = 0; k < QK8_0; ++k) {
+        qs[k] = (char)((int)round(x[k] * inv_scale));
+    }
+}
+
+// set_rows with f32 -> q8_0 quantization; row indices in src1 are 64-bit.
+// Source row i01 is quantized into the dst row selected through src1 and written
+// as consecutive (2 + QK8_0)-byte blocks: fp16 scale followed by QK8_0 int8 quants.
+// Work-items along local dim 0 stride over the nblk0 blocks of one row.
+kernel void kernel_set_rows_q8_0_i64(
+        global char * src0,
+        ulong         offset0,
+        global char * src1,
+        ulong         offset1,
+        global char * dst,
+        ulong         offsetd,
+        int           ne01,
+        ulong         nb01,
+        ulong         nb02,
+        ulong         nb03,
+        uint4         ne11,
+        uint4         ne12,
+        ulong         nb10,
+        ulong         nb11,
+        ulong         nb12,
+        int           nblk0,
+        ulong         nb1,
+        ulong         nb2,
+        ulong         nb3
+) {
+    // Apply the buffer byte offsets once up front.
+    src0 += offset0;
+    src1 += offset1;
+    dst  += offsetd;
+
+    const int i03 = get_group_id(2);
+    const int i02 = get_group_id(1);
+    const int i01 = get_group_id(0)*get_local_size(1) + get_local_id(1);
+    if (i01 >= ne01) {
+        return;
+    }
+
+    // Coordinates into the index tensor. fastmod() is defined outside this block;
+    // presumably it reduces i02/i03 modulo the src1 extents — TODO confirm.
+    const int i12 = fastmod(i03, ne12);
+    const int i11 = fastmod(i02, ne11);
+
+    global long * row_idx_ptr = (global long *)(src1 + i01*nb10 + i11*nb11 + i12*nb12);
+    const long    dst_row_idx = row_idx_ptr[0];
+
+    global char  * dst_row = dst + dst_row_idx*nb1 + i02*nb2 + i03*nb3;
+    global float * src_row = (global float *)(src0 + i01*nb01 + i02*nb02 + i03*nb03);
+
+    for (int b = get_local_id(0); b < nblk0; b += get_local_size(0)) {
+        global char * blk_out = dst_row + b * (2 + QK8_0);
+
+        // The scale occupies the first 2 bytes of the block; the quants follow.
+        quantize_q8_0_block(src_row + b * QK8_0, blk_out + 2, (global half *) blk_out);
+    }
+}
+
+// set_rows with f32 -> q8_0 quantization; row indices in src1 are 32-bit.
+// Source row i01 is quantized into the dst row selected through src1 and written
+// as consecutive (2 + QK8_0)-byte blocks: fp16 scale followed by QK8_0 int8 quants.
+// Work-items along local dim 0 stride over the nblk0 blocks of one row.
+kernel void kernel_set_rows_q8_0_i32(
+        global char * src0,
+        ulong         offset0,
+        global char * src1,
+        ulong         offset1,
+        global char * dst,
+        ulong         offsetd,
+        int           ne01,
+        ulong         nb01,
+        ulong         nb02,
+        ulong         nb03,
+        uint4         ne11,
+        uint4         ne12,
+        ulong         nb10,
+        ulong         nb11,
+        ulong         nb12,
+        int           nblk0,
+        ulong         nb1,
+        ulong         nb2,
+        ulong         nb3
+) {
+    // Apply the buffer byte offsets once up front.
+    src0 += offset0;
+    src1 += offset1;
+    dst  += offsetd;
+
+    const int i03 = get_group_id(2);
+    const int i02 = get_group_id(1);
+    const int i01 = get_group_id(0)*get_local_size(1) + get_local_id(1);
+    if (i01 >= ne01) {
+        return;
+    }
+
+    // Coordinates into the index tensor. fastmod() is defined outside this block;
+    // presumably it reduces i02/i03 modulo the src1 extents — TODO confirm.
+    const int i12 = fastmod(i03, ne12);
+    const int i11 = fastmod(i02, ne11);
+
+    global int * row_idx_ptr = (global int *)(src1 + i01*nb10 + i11*nb11 + i12*nb12);
+    const int    dst_row_idx = row_idx_ptr[0];
+
+    global char  * dst_row = dst + dst_row_idx*nb1 + i02*nb2 + i03*nb3;
+    global float * src_row = (global float *)(src0 + i01*nb01 + i02*nb02 + i03*nb03);
+
+    for (int b = get_local_id(0); b < nblk0; b += get_local_size(0)) {
+        global char * blk_out = dst_row + b * (2 + QK8_0);
+
+        // The scale occupies the first 2 bytes of the block; the quants follow.
+        quantize_q8_0_block(src_row + b * QK8_0, blk_out + 2, (global half *) blk_out);
+    }
+}
+
+// SoA q8_0 variants. dst_q: int8[QK8_0] per block; dst_d: fp16 scale per block.
+// Layout matches kernel_convert_block_q8_0; block index follows dst element order.
+kernel void kernel_set_rows_q8_0_soa_i64(
+        global char * src0,
+        ulong         offset0,
+        global char * src1,
+        ulong         offset1,
+        global char * dst_q,
+        ulong         offset_q,
+        global char * dst_d,
+        ulong         offset_d,
+        int           ne01,
+        ulong         nb01,
+        ulong         nb02,
+        ulong         nb03,
+        uint4         ne11,
+        uint4         ne12,
+        ulong         nb10,
+        ulong         nb11,
+        ulong         nb12,
+        int           nblk0,
+        int           ne1_dst,
+        int           ne2_dst,
+        int           ne3_dst
+) {
+    // Apply the buffer byte offsets once up front.
+    src0  += offset0;
+    src1  += offset1;
+    dst_q += offset_q;
+    dst_d += offset_d;
+
+    const int i03 = get_group_id(2);
+    const int i02 = get_group_id(1);
+    const int i01 = get_group_id(0)*get_local_size(1) + get_local_id(1);
+    if (i01 >= ne01) {
+        return;
+    }
+
+    // Coordinates into the index tensor. fastmod() is defined outside this block;
+    // presumably it reduces i02/i03 modulo the src1 extents — TODO confirm.
+    const int i12 = fastmod(i03, ne12);
+    const int i11 = fastmod(i02, ne11);
+
+    // 64-bit destination row index for this source row.
+    global long * row_idx_ptr = (global long *)(src1 + i01*nb10 + i11*nb11 + i12*nb12);
+    const long    dst_row_idx = row_idx_ptr[0];
+
+    // Linear index of the destination row's first block: rows are numbered
+    // (i03, i02, dst_row_idx) and each row holds nblk0 blocks. ne3_dst is not used.
+    const long first_blk = ((long)i03 * ne2_dst * ne1_dst + (long)i02 * ne1_dst + dst_row_idx) * nblk0;
+
+    global half  * scales  = (global half *) dst_d + first_blk;
+    global char  * quants  = dst_q + first_blk * QK8_0;
+    global float * src_row = (global float *)(src0 + i01*nb01 + i02*nb02 + i03*nb03);
+
+    for (int b = get_local_id(0); b < nblk0; b += get_local_size(0)) {
+        // Block b of the row: QK8_0 quant bytes in dst_q, one fp16 scale in dst_d.
+        quantize_q8_0_block(src_row + b * QK8_0, quants + b * QK8_0, scales + b);
+    }
+}
+
+// set_rows with f32 -> q8_0 quantization into split (SoA) buffers; row indices
+// in src1 are 32-bit. Quants go to dst_q (QK8_0 bytes per block) and fp16 scales
+// to dst_d (one per block).
+kernel void kernel_set_rows_q8_0_soa_i32(
+        global char * src0,
+        ulong         offset0,
+        global char * src1,
+        ulong         offset1,
+        global char * dst_q,
+        ulong         offset_q,
+        global char * dst_d,
+        ulong         offset_d,
+        int           ne01,
+        ulong         nb01,
+        ulong         nb02,
+        ulong         nb03,
+        uint4         ne11,
+        uint4         ne12,
+        ulong         nb10,
+        ulong         nb11,
+        ulong         nb12,
+        int           nblk0,
+        int           ne1_dst,
+        int           ne2_dst,
+        int           ne3_dst
+) {
+    // Apply the buffer byte offsets once up front.
+    src0  += offset0;
+    src1  += offset1;
+    dst_q += offset_q;
+    dst_d += offset_d;
+
+    const int i03 = get_group_id(2);
+    const int i02 = get_group_id(1);
+    const int i01 = get_group_id(0)*get_local_size(1) + get_local_id(1);
+    if (i01 >= ne01) {
+        return;
+    }
+
+    // Coordinates into the index tensor. fastmod() is defined outside this block;
+    // presumably it reduces i02/i03 modulo the src1 extents — TODO confirm.
+    const int i12 = fastmod(i03, ne12);
+    const int i11 = fastmod(i02, ne11);
+
+    // 32-bit destination row index for this source row.
+    global int * row_idx_ptr = (global int *)(src1 + i01*nb10 + i11*nb11 + i12*nb12);
+    const int    dst_row_idx = row_idx_ptr[0];
+
+    // Linear index of the destination row's first block: rows are numbered
+    // (i03, i02, dst_row_idx) and each row holds nblk0 blocks. ne3_dst is not used.
+    const long first_blk = ((long)i03 * ne2_dst * ne1_dst + (long)i02 * ne1_dst + dst_row_idx) * nblk0;
+
+    global half  * scales  = (global half *) dst_d + first_blk;
+    global char  * quants  = dst_q + first_blk * QK8_0;
+    global float * src_row = (global float *)(src0 + i01*nb01 + i02*nb02 + i03*nb03);
+
+    for (int b = get_local_id(0); b < nblk0; b += get_local_size(0)) {
+        // Block b of the row: QK8_0 quant bytes in dst_q, one fp16 scale in dst_d.
+        quantize_q8_0_block(src_row + b * QK8_0, quants + b * QK8_0, scales + b);
+    }
+}
+
 kernel void kernel_set_rows_f16_i32(
        global char * src0,
        ulong         offset0,
@@ -206,3 +439,270 @@ kernel void kernel_set_rows_f16_i32(
        dst_row[ind] = src_row[ind];
    }
 }
+
+// f32 -> q4_0 quantize set_rows. Block = half d + uchar qs[16] (shuffled
+// nibbles: qs[j] low/high = elem j / j+16).
+// Dequant: val[i] = d * (nibble_i - 8)
+// nblk0 = number of q4_0 blocks per row = ne00 / 32.
+#define QK4_0 32
+#define Q4_0_BLOCK_SIZE 18
+
+inline void quantize_q4_0_block(global float * x, global uchar * qs, global half * d_out) {
+    // Locate the element with the largest magnitude and keep its sign (matches ggml ref).
+    float signed_peak = 0.0f;
+    float peak_mag    = 0.0f;
+    for (int k = 0; k < QK4_0; ++k) {
+        const float v   = x[k];
+        const float mag = fabs(v);
+        if (mag > peak_mag) {
+            peak_mag    = mag;
+            signed_peak = v;
+        }
+    }
+
+    // The peak element maps to level -8, i.e. nibble 0 after the +8 offset below.
+    const float scale = signed_peak / -8.0f;
+    float inv_scale = 0.0f;
+    if (scale != 0.0f) {
+        inv_scale = 1.0f / scale;
+    }
+
+    vstore_half(scale, 0, d_out);
+
+    // Byte k packs element k (low nibble) with element k + QK4_0/2 (high nibble).
+    for (int k = 0; k < QK4_0/2; ++k) {
+        const float lo_scaled = x[k]           * inv_scale;
+        const float hi_scaled = x[k + QK4_0/2] * inv_scale;
+
+        // +8 recentres to the unsigned range; +0.5 lets the truncating cast round.
+        int lo = (int)(lo_scaled + 8.5f);
+        int hi = (int)(hi_scaled + 8.5f);
+
+        // Keep both values inside a 4-bit nibble.
+        lo = (lo < 0) ? 0 : ((lo > 15) ? 15 : lo);
+        hi = (hi < 0) ? 0 : ((hi > 15) ? 15 : hi);
+
+        qs[k] = (uchar)lo | ((uchar)hi << 4);
+    }
+}
+
+// set_rows with f32 -> q4_0 quantization; row indices in src1 are 64-bit.
+// Source row i01 is quantized into the dst row selected through src1 and written
+// as consecutive Q4_0_BLOCK_SIZE-byte blocks: fp16 scale followed by packed nibbles.
+// Work-items along local dim 0 stride over the nblk0 blocks of one row.
+kernel void kernel_set_rows_q4_0_i64(
+        global char * src0,
+        ulong         offset0,
+        global char * src1,
+        ulong         offset1,
+        global char * dst,
+        ulong         offsetd,
+        int           ne01,
+        ulong         nb01,
+        ulong         nb02,
+        ulong         nb03,
+        uint4         ne11,
+        uint4         ne12,
+        ulong         nb10,
+        ulong         nb11,
+        ulong         nb12,
+        int           nblk0,
+        ulong         nb1,
+        ulong         nb2,
+        ulong         nb3
+) {
+    // Apply the buffer byte offsets once up front.
+    src0 += offset0;
+    src1 += offset1;
+    dst  += offsetd;
+
+    const int i03 = get_group_id(2);
+    const int i02 = get_group_id(1);
+    const int i01 = get_group_id(0)*get_local_size(1) + get_local_id(1);
+    if (i01 >= ne01) {
+        return;
+    }
+
+    // Coordinates into the index tensor. fastmod() is defined outside this block;
+    // presumably it reduces i02/i03 modulo the src1 extents — TODO confirm.
+    const int i12 = fastmod(i03, ne12);
+    const int i11 = fastmod(i02, ne11);
+
+    global long * row_idx_ptr = (global long *)(src1 + i01*nb10 + i11*nb11 + i12*nb12);
+    const long    dst_row_idx = row_idx_ptr[0];
+
+    global char  * dst_row = dst + dst_row_idx*nb1 + i02*nb2 + i03*nb3;
+    global float * src_row = (global float *)(src0 + i01*nb01 + i02*nb02 + i03*nb03);
+
+    for (int b = get_local_id(0); b < nblk0; b += get_local_size(0)) {
+        global char * blk_out = dst_row + b * Q4_0_BLOCK_SIZE;
+
+        // The scale occupies the first 2 bytes of the block; the packed nibbles follow.
+        quantize_q4_0_block(src_row + b * QK4_0, (global uchar *)(blk_out + 2), (global half *)(blk_out));
+    }
+}
+
+// set_rows with f32 -> q4_0 quantization; row indices in src1 are 32-bit.
+// Source row i01 is quantized into the dst row selected through src1 and written
+// as consecutive Q4_0_BLOCK_SIZE-byte blocks: fp16 scale followed by packed nibbles.
+// Work-items along local dim 0 stride over the nblk0 blocks of one row.
+kernel void kernel_set_rows_q4_0_i32(
+        global char * src0,
+        ulong         offset0,
+        global char * src1,
+        ulong         offset1,
+        global char * dst,
+        ulong         offsetd,
+        int           ne01,
+        ulong         nb01,
+        ulong         nb02,
+        ulong         nb03,
+        uint4         ne11,
+        uint4         ne12,
+        ulong         nb10,
+        ulong         nb11,
+        ulong         nb12,
+        int           nblk0,
+        ulong         nb1,
+        ulong         nb2,
+        ulong         nb3
+) {
+    // Apply the buffer byte offsets once up front.
+    src0 += offset0;
+    src1 += offset1;
+    dst  += offsetd;
+
+    const int i03 = get_group_id(2);
+    const int i02 = get_group_id(1);
+    const int i01 = get_group_id(0)*get_local_size(1) + get_local_id(1);
+    if (i01 >= ne01) {
+        return;
+    }
+
+    // Coordinates into the index tensor. fastmod() is defined outside this block;
+    // presumably it reduces i02/i03 modulo the src1 extents — TODO confirm.
+    const int i12 = fastmod(i03, ne12);
+    const int i11 = fastmod(i02, ne11);
+
+    global int * row_idx_ptr = (global int *)(src1 + i01*nb10 + i11*nb11 + i12*nb12);
+    const int    dst_row_idx = row_idx_ptr[0];
+
+    global char  * dst_row = dst + dst_row_idx*nb1 + i02*nb2 + i03*nb3;
+    global float * src_row = (global float *)(src0 + i01*nb01 + i02*nb02 + i03*nb03);
+
+    for (int b = get_local_id(0); b < nblk0; b += get_local_size(0)) {
+        global char * blk_out = dst_row + b * Q4_0_BLOCK_SIZE;
+
+        // The scale occupies the first 2 bytes of the block; the packed nibbles follow.
+        quantize_q4_0_block(src_row + b * QK4_0, (global uchar *)(blk_out + 2), (global half *)(blk_out));
+    }
+}
+
+// SoA variants for q4_0 dst. Used when the backend has split block_q4_0 records
+// into separate quant (dst_q) and scale (dst_d) sub-buffers — same pattern as
+// the q8_0 SoA variants above.
+//
+// Layout (matches kernel_convert_block_q4_0, the "shuffled" variant):
+//   dst_q: contiguous 16 packed nibbles per block, block i at offset i * 16 bytes.
+//   dst_d: contiguous fp16 scales, block i at offset i * 2 bytes.
+// Nibble layout inside each byte is unchanged from AoS: qs[j] low nibble = element j,
+// qs[j] high nibble = element j+16. kernel_restore_block_q4_0 copies bytes as-is.
+kernel void kernel_set_rows_q4_0_soa_i64(
+        global char * src0,
+        ulong         offset0,
+        global char * src1,
+        ulong         offset1,
+        global char * dst_q,
+        ulong         offset_q,
+        global char * dst_d,
+        ulong         offset_d,
+        int           ne01,
+        ulong         nb01,
+        ulong         nb02,
+        ulong         nb03,
+        uint4         ne11,
+        uint4         ne12,
+        ulong         nb10,
+        ulong         nb11,
+        ulong         nb12,
+        int           nblk0,
+        int           ne1_dst,
+        int           ne2_dst,
+        int           ne3_dst
+) {
+    // Apply the buffer byte offsets once up front.
+    src0  += offset0;
+    src1  += offset1;
+    dst_q += offset_q;
+    dst_d += offset_d;
+
+    const int i03 = get_group_id(2);
+    const int i02 = get_group_id(1);
+    const int i01 = get_group_id(0)*get_local_size(1) + get_local_id(1);
+    if (i01 >= ne01) {
+        return;
+    }
+
+    // Coordinates into the index tensor. fastmod() is defined outside this block;
+    // presumably it reduces i02/i03 modulo the src1 extents — TODO confirm.
+    const int i12 = fastmod(i03, ne12);
+    const int i11 = fastmod(i02, ne11);
+
+    // 64-bit destination row index for this source row.
+    global long * row_idx_ptr = (global long *)(src1 + i01*nb10 + i11*nb11 + i12*nb12);
+    const long    dst_row_idx = row_idx_ptr[0];
+
+    // Linear index of the destination row's first block: rows are numbered
+    // (i03, i02, dst_row_idx) and each row holds nblk0 blocks. ne3_dst is not used.
+    const long first_blk = ((long)i03 * ne2_dst * ne1_dst + (long)i02 * ne1_dst + dst_row_idx) * nblk0;
+
+    global half  * scales  = (global half  *) dst_d + first_blk;
+    global uchar * nibbles = (global uchar *) dst_q + first_blk * (QK4_0/2);
+    global float * src_row = (global float *)(src0 + i01*nb01 + i02*nb02 + i03*nb03);
+
+    for (int b = get_local_id(0); b < nblk0; b += get_local_size(0)) {
+        // Block b of the row: QK4_0/2 packed bytes in dst_q, one fp16 scale in dst_d.
+        quantize_q4_0_block(src_row + b * QK4_0, nibbles + b * (QK4_0/2), scales + b);
+    }
+}
+
+// set_rows with f32 -> q4_0 quantization into split (SoA) buffers; row indices
+// in src1 are 32-bit. Packed nibbles go to dst_q (QK4_0/2 bytes per block) and
+// fp16 scales to dst_d (one per block).
+kernel void kernel_set_rows_q4_0_soa_i32(
+        global char * src0,
+        ulong         offset0,
+        global char * src1,
+        ulong         offset1,
+        global char * dst_q,
+        ulong         offset_q,
+        global char * dst_d,
+        ulong         offset_d,
+        int           ne01,
+        ulong         nb01,
+        ulong         nb02,
+        ulong         nb03,
+        uint4         ne11,
+        uint4         ne12,
+        ulong         nb10,
+        ulong         nb11,
+        ulong         nb12,
+        int           nblk0,
+        int           ne1_dst,
+        int           ne2_dst,
+        int           ne3_dst
+) {
+    // Apply the buffer byte offsets once up front.
+    src0  += offset0;
+    src1  += offset1;
+    dst_q += offset_q;
+    dst_d += offset_d;
+
+    const int i03 = get_group_id(2);
+    const int i02 = get_group_id(1);
+    const int i01 = get_group_id(0)*get_local_size(1) + get_local_id(1);
+    if (i01 >= ne01) {
+        return;
+    }
+
+    // Coordinates into the index tensor. fastmod() is defined outside this block;
+    // presumably it reduces i02/i03 modulo the src1 extents — TODO confirm.
+    const int i12 = fastmod(i03, ne12);
+    const int i11 = fastmod(i02, ne11);
+
+    // 32-bit destination row index for this source row.
+    global int * row_idx_ptr = (global int *)(src1 + i01*nb10 + i11*nb11 + i12*nb12);
+    const int    dst_row_idx = row_idx_ptr[0];
+
+    // Linear index of the destination row's first block: rows are numbered
+    // (i03, i02, dst_row_idx) and each row holds nblk0 blocks. ne3_dst is not used.
+    const long first_blk = ((long)i03 * ne2_dst * ne1_dst + (long)i02 * ne1_dst + dst_row_idx) * nblk0;
+
+    global half  * scales  = (global half  *) dst_d + first_blk;
+    global uchar * nibbles = (global uchar *) dst_q + first_blk * (QK4_0/2);
+    global float * src_row = (global float *)(src0 + i01*nb01 + i02*nb02 + i03*nb03);
+
+    for (int b = get_local_id(0); b < nblk0; b += get_local_size(0)) {
+        // Block b of the row: QK4_0/2 packed bytes in dst_q, one fp16 scale in dst_d.
+        quantize_q4_0_block(src_row + b * QK4_0, nibbles + b * (QK4_0/2), scales + b);
+    }
+}
@@ -1270,77 +1270,14 @@ void GgmlOvDecoder::visit_subgraph(std::function<void(std::shared_ptr<GgmlDecode
 }

 std::string GgmlOvDecoder::compute_op_type(const ggml_tensor * node) {
-    static const std::map<ggml_op, std::string> ops = {
-        {GGML_OP_NONE,            "GGML_OP_NONE"           },
-        {GGML_OP_ACC,             "GGML_OP_ACC"            },
-        {GGML_OP_ADD,             "GGML_OP_ADD"            },
-        {GGML_OP_ADD1,            "GGML_OP_ADD1"           },
-        {GGML_OP_ADD_ID,          "GGML_OP_ADD_ID"         },
-        {GGML_OP_CONCAT,          "GGML_OP_CONCAT"         },
-        {GGML_OP_CONT,            "GGML_OP_CONT"           },
-        {GGML_OP_DIV,             "GGML_OP_DIV"            },
-        {GGML_OP_DUP,             "GGML_OP_DUP"            },
-        {GGML_OP_GET_ROWS,        "GGML_OP_GET_ROWS"       },
-        {GGML_OP_MUL,             "GGML_OP_MUL"            },
-        {GGML_OP_MUL_MAT,         "GGML_OP_MUL_MAT"        },
-        {GGML_OP_MUL_MAT_ID,      "GGML_OP_MUL_MAT_ID"     },
-        {GGML_OP_PERMUTE,         "GGML_OP_PERMUTE"        },
-        {GGML_OP_RESHAPE,         "GGML_OP_RESHAPE"        },
-        {GGML_OP_RMS_NORM,        "GGML_OP_RMS_NORM"       },
-        {GGML_OP_NORM,            "GGML_OP_NORM"           },
-        {GGML_OP_ROPE,            "GGML_OP_ROPE"           },
-        {GGML_OP_SCALE,           "GGML_OP_SCALE"          },
-        {GGML_OP_SOFT_MAX,        "GGML_OP_SOFT_MAX"       },
-        {GGML_OP_SUM_ROWS,        "GGML_OP_SUM_ROWS"       },
-        {GGML_OP_SUB,             "GGML_OP_SUB"            },
-        {GGML_OP_TRANSPOSE,       "GGML_OP_TRANSPOSE"      },
-        {GGML_OP_VIEW,            "GGML_OP_VIEW"           },
-        {GGML_OP_SET_ROWS,        "GGML_OP_SET_ROWS"       },
-        {GGML_OP_CPY,             "GGML_OP_CPY"            },
-        {GGML_OP_FLASH_ATTN_EXT,  "GGML_OP_FLASH_ATTN_EXT" },
-        {GGML_OP_L2_NORM,         "GGML_OP_L2_NORM"        },
-        {GGML_OP_CLAMP,           "GGML_OP_CLAMP"          },
-        {GGML_OP_PAD,             "GGML_OP_PAD"            },
-        {GGML_OP_SSM_CONV,        "GGML_OP_SSM_CONV"       },
-        {GGML_OP_GATED_DELTA_NET, "GGML_OP_GATED_DELTA_NET"},
-        {GGML_OP_ARGSORT,         "GGML_OP_ARGSORT"        },
-        {GGML_OP_REPEAT,          "GGML_OP_REPEAT"         },
-        {GGML_OP_IM2COL,          "GGML_OP_IM2COL"         }
-    };
-    static const std::map<ggml_unary_op, std::string> unary_ops = {
-        {GGML_UNARY_OP_ABS,         "GGML_UNARY_OP_ABS"        },
-        {GGML_UNARY_OP_SGN,         "GGML_UNARY_OP_SGN"        },
-        {GGML_UNARY_OP_NEG,         "GGML_UNARY_OP_NEG"        },
-        {GGML_UNARY_OP_STEP,        "GGML_UNARY_OP_STEP"       },
-        {GGML_UNARY_OP_TANH,        "GGML_UNARY_OP_TANH"       },
-        {GGML_UNARY_OP_ELU,         "GGML_UNARY_OP_ELU"        },
-        {GGML_UNARY_OP_RELU,        "GGML_UNARY_OP_RELU"       },
-        {GGML_UNARY_OP_SIGMOID,     "GGML_UNARY_OP_SIGMOID"    },
-        {GGML_UNARY_OP_GELU,        "GGML_UNARY_OP_GELU"       },
-        {GGML_UNARY_OP_GELU_QUICK,  "GGML_UNARY_OP_GELU_QUICK" },
-        {GGML_UNARY_OP_SILU,        "GGML_UNARY_OP_SILU"       },
-        {GGML_UNARY_OP_SOFTPLUS,    "GGML_UNARY_OP_SOFTPLUS"   },
-        {GGML_UNARY_OP_HARDSWISH,   "GGML_UNARY_OP_HARDSWISH"  },
-        {GGML_UNARY_OP_HARDSIGMOID, "GGML_UNARY_OP_HARDSIGMOID"},
-        {GGML_UNARY_OP_EXP,         "GGML_UNARY_OP_EXP"        },
-        {GGML_UNARY_OP_COUNT,       "GGML_UNARY_OP_COUNT"      }
-    };
-    static const std::map<ggml_glu_op, std::string> glu_ops = {
-        {GGML_GLU_OP_SWIGLU, "GGML_GLU_OP_SWIGLU"},
-        {GGML_GLU_OP_GEGLU,  "GGML_GLU_OP_GEGLU" },
-        {GGML_GLU_OP_REGLU,  "GGML_GLU_OP_REGLU" }
-    };
-
    switch (node->op) {
    case GGML_OP_UNARY:
-        return unary_ops.at(ggml_get_unary_op(node));
+        return std::string("GGML_UNARY_OP_") + ggml_unary_op_name(ggml_get_unary_op(node));  // unary nodes are typed by their sub-op; assumes the name helper yields the enum suffix (e.g. "ABS") — TODO confirm
    case GGML_OP_GLU:
-        return glu_ops.at(ggml_get_glu_op(node));
+        return std::string("GGML_GLU_OP_") + ggml_glu_op_name(ggml_get_glu_op(node));  // GLU nodes are typed by their GLU variant rather than by GGML_OP_GLU
    default:
-        return ops.at(node->op);
+        return std::string("GGML_OP_") + ggml_op_name(node->op);  // every other op: prefix + ggml's own op name, so no per-op table is kept here
    }
-    static const std::string unknown_op = "UNKNOWN_GGML_OP";
-    return unknown_op;
 }

 const std::string & GgmlOvDecoder::get_op_type(int node_idx) const {
@@ -1053,6 +1053,10 @@ static bool is_op_unsupported_case(const ggml_tensor * op) {
            (op->ne[0] == 2 && op->ne[1] == 4 && op->ne[2] == 3 && op->ne[3] == 2)) {
            return true;
        }
+        // CPY into a strided view of a larger buffer (recurrent-state snapshots) not supported
+        if (op->view_src && ggml_nbytes(op) != ggml_nbytes(op->view_src)) {
+            return true;
+        }
        break;
    }
    case GGML_OP_MUL_MAT: {
@@ -17,6 +17,22 @@ namespace frontend {
 namespace ggml {
 namespace op {

+// Returns `input` untouched when either its own partial shape or the
+// caller-supplied `input_shape` has static rank 2. Otherwise it is reshaped to
+// whatever get_dimensions() builds from the input's runtime shape and `dims`.
+static ov::Output<ov::Node> reshape_add_id_input_to_2d(const ov::Output<ov::Node> & input,
+                                                       const ov::PartialShape & input_shape,
+                                                       const std::vector<int> & dims) {
+    const auto has_static_rank_2 = [](const ov::PartialShape & shape) {
+        return shape.rank().is_static() && shape.rank().get_length() == 2;
+    };
+
+    // The graph-side shape is checked first, then the shape reported by the caller.
+    if (has_static_rank_2(input.get_partial_shape()) || has_static_rank_2(input_shape)) {
+        return input;
+    }
+
+    auto runtime_shape = std::make_shared<ov::op::v3::ShapeOf>(input, ov::element::i64);
+    return std::make_shared<ov::op::v1::Reshape>(input, get_dimensions(runtime_shape, dims), false);
+}
+
 OutputVector translate_add_id(const NodeContext & context) {
    num_inputs_check(context, 3, 3);

@@ -28,11 +44,9 @@ OutputVector translate_add_id(const NodeContext & context) {
    //   input: [1, n_token, n_used, n_embd]
    //   bias:  [1, 1, n_expert, n_embd]
    //   ids:   [1, 1, n_token, n_used]
-    auto bias_shape_4d = std::make_shared<ov::op::v3::ShapeOf>(bias, ov::element::i64);
-    auto ids_shape_4d = std::make_shared<ov::op::v3::ShapeOf>(ids, ov::element::i64);
-
-    bias = std::make_shared<ov::op::v1::Reshape>(bias, get_dimensions(bias_shape_4d, {2, 3}), false);
-    ids = std::make_shared<ov::op::v1::Reshape>(ids, get_dimensions(ids_shape_4d, {2, 3}), false);
+    // Model bias constants may already be stored as [n_expert, n_embd].
+    bias = reshape_add_id_input_to_2d(bias, context.get_input_shape(1), {2, 3});
+    ids = reshape_add_id_input_to_2d(ids, context.get_input_shape(2), {2, 3});

    if (ids.get_element_type() != ov::element::i32 && ids.get_element_type() != ov::element::i64) {
        ids = std::make_shared<ov::op::v0::Convert>(ids, ov::element::i32);
@@ -3,8 +3,11 @@
 #include "../utils.h"

 #include <cstdint>
+#include <limits>
 #include <memory>
 #include <openvino/core/node_output.hpp>
+#include <openvino/op/add.hpp>
+#include <openvino/op/clamp.hpp>
 #include <openvino/op/constant.hpp>
 #include <openvino/op/multiply.hpp>
 #include <openvino/op/sigmoid.hpp>
@@ -15,7 +18,7 @@ namespace frontend {
 namespace ggml {
 namespace op {

-OutputVector translate_glu_swiglu(const NodeContext & context) {
+static std::pair<ov::Output<ov::Node>, ov::Output<ov::Node>> get_glu_inputs(const NodeContext & context) {
    num_inputs_check(context, 1, 2);

    ov::Output<ov::Node> src0;
@@ -52,6 +55,12 @@ OutputVector translate_glu_swiglu(const NodeContext & context) {
        std::swap(src0, src1);
    }

+    return {src0, src1};
+}
+
+OutputVector translate_glu_swiglu(const NodeContext & context) {
+    auto [src0, src1] = get_glu_inputs(context);
+
    auto sigmoid = std::make_shared<ov::op::v0::Sigmoid>(src0);
    auto silu = std::make_shared<ov::op::v1::Multiply>(src0, sigmoid);
    auto res = std::make_shared<ov::op::v1::Multiply>(silu, src1);
@@ -59,6 +68,27 @@ OutputVector translate_glu_swiglu(const NodeContext & context) {
    return rename_outputs_with_suffix({res}, context.get_name());
 }

+// Translates the SWIGLU_OAI GLU variant:
+//   gate = clamp(src0, -inf, limit)
+//   up   = clamp(src1, -limit, limit)
+//   res  = gate * sigmoid(alpha * gate) * (up + 1)
+// alpha and limit are read as floats from op-param slots 2 and 3.
+OutputVector translate_glu_swiglu_oai(const NodeContext & context) {
+    auto [src0, src1] = get_glu_inputs(context);
+
+    // The op params are stored as int32 words; slots 2 and 3 are reinterpreted as floats.
+    const int32_t * raw_params = context.get_output_op_params();
+    const float * f32_params = reinterpret_cast<const float *>(raw_params);
+    const float alpha = f32_params[2];
+    const float limit = f32_params[3];
+
+    // Gate branch: only the upper bound is clamped.
+    const float no_lower_bound = -std::numeric_limits<float>::infinity();
+    auto gate = std::make_shared<ov::op::v0::Clamp>(src0, no_lower_bound, limit);
+    auto alpha_const = ov::op::v0::Constant::create(ov::element::f32, {}, {alpha});
+    auto scaled_gate = std::make_shared<ov::op::v1::Multiply>(gate, alpha_const);
+    auto sigmoid = std::make_shared<ov::op::v0::Sigmoid>(scaled_gate);
+    auto out_glu = std::make_shared<ov::op::v1::Multiply>(gate, sigmoid);
+
+    // Up branch: clamped on both sides, then shifted by one.
+    auto up = std::make_shared<ov::op::v0::Clamp>(src1, -limit, limit);
+    auto one = ov::op::v0::Constant::create(ov::element::f32, {}, {1.0f});
+    auto up_plus_one = std::make_shared<ov::op::v1::Add>(up, one);
+
+    auto res = std::make_shared<ov::op::v1::Multiply>(out_glu, up_plus_one);
+    return rename_outputs_with_suffix({res}, context.get_name());
+}
+
 }  // namespace op
 }  // namespace ggml
 }  // namespace frontend
@@ -2,23 +2,135 @@
 #include "../op_table.h"
 #include "../utils.h"

+#include <cstdint>
+#include <cstring>
+#include <limits>
 #include <memory>
+#include <openvino/op/bitwise_and.hpp>
+#include <openvino/op/bitwise_right_shift.hpp>
 #include <openvino/op/broadcast.hpp>
 #include <openvino/op/concat.hpp>
 #include <openvino/op/constant.hpp>
 #include <openvino/op/convert.hpp>
 #include <openvino/op/gather.hpp>
 #include <openvino/op/matmul.hpp>
+#include <openvino/op/multiply.hpp>
 #include <openvino/op/reshape.hpp>
 #include <openvino/op/shape_of.hpp>
-#include <openvino/op/squeeze.hpp>
+#include <openvino/op/slice.hpp>
 #include <openvino/op/unsqueeze.hpp>
+#include <vector>

 namespace ov {
 namespace frontend {
 namespace ggml {
 namespace op {

+namespace {
+
+// Wraps `values` in a 1-D i64 Constant node with one element per entry.
+std::shared_ptr<ov::op::v0::Constant> const_i64(const std::vector<int64_t> & values) {
+    const ov::Shape shape_1d{values.size()};
+    return ov::op::v0::Constant::create(ov::element::i64, shape_1d, values);
+}
+
+// Returns input[begin:end] taken along a single `axis` with step 1 (Slice-8).
+ov::Output<ov::Node> slice_axis(const ov::Output<ov::Node> & input, int64_t axis, int64_t begin, int64_t end) {
+    auto start_node = const_i64({begin});
+    auto stop_node = const_i64({end});
+    auto step_node = const_i64({1});
+    auto axes_node = const_i64({axis});
+    return std::make_shared<ov::op::v8::Slice>(input, start_node, stop_node, step_node, axes_node);
+}
+
+// MUL_MAT_ID lowering for expert weights that arrive as raw packed bytes.
+//
+// The weights are expected as a rank-5 tensor [1, n_expert, m, k_blocks, 17]. Each 17-byte
+// group is treated as one block: byte 0 indexes a 256-entry scale table and bytes 1..16 each
+// carry two 4-bit indices into a 16-entry value table, giving 32 values per block. The blocks
+// of the experts selected by `ids` are dequantized to f32 inside the graph and then multiplied
+// with the activations.
+//
+// context        - only queried for the requested output element type.
+// expert_weights - packed weights, shape as above.
+// activations    - rank-4 input; its dims 1..3 are kept (the caller documents the layout as
+//                  [1, n_tokens, n_used_or_1, k]).
+// ids            - rank-4 expert indices; dims 2..3 are kept.
+//                  NOTE(review): presumably [n_tokens, n_used] - confirm against the caller.
+// Returns the product reshaped to [1, ids_dim0, ids_dim1, m], converted to the output type
+// when it differs.
 ov::Output<ov::Node> translate_mul_mat_id_mxfp4_packed(const NodeContext & context,
                                                        ov::Output<ov::Node> expert_weights,
                                                        ov::Output<ov::Node> activations,
                                                        ov::Output<ov::Node> ids) {
+    // NOTE(review): to_shape() presumably needs a fully static shape; the caller only checks
+    // that the rank is static - confirm dynamic dims cannot reach this point.
     auto packed_shape = expert_weights.get_partial_shape().to_shape();
     FRONT_END_OP_CONVERSION_CHECK(packed_shape.size() == 5 && packed_shape[4] == 17,
                                   "Expected packed MXFP4 expert weights with shape [1, n_expert, m, k_blocks, 17]");
 
     const int64_t n_expert = static_cast<int64_t>(packed_shape[1]);
     const int64_t rows = static_cast<int64_t>(packed_shape[2]);
     const int64_t k_blocks = static_cast<int64_t>(packed_shape[3]);
+    // Values per block: 16 packed bytes x 2 nibbles each.
     const int64_t qk = 32;
     const int64_t cols = k_blocks * qk;
 
+    // Drop the leading unit dim so axis 0 is the expert axis used by the Gather below.
     auto packed_shape_4d = const_i64({n_expert, rows, k_blocks, 17});
     expert_weights = std::make_shared<ov::op::v1::Reshape>(expert_weights, packed_shape_4d, false);
 
+    // get_dimensions() is assumed to pick the listed axes out of a shape tensor - TODO confirm.
     auto activations_shape_4d = std::make_shared<ov::op::v3::ShapeOf>(activations, ov::element::i64);
     auto ids_shape_4d = std::make_shared<ov::op::v3::ShapeOf>(ids, ov::element::i64);
     auto activations_shape_3d = get_dimensions(activations_shape_4d, {1, 2, 3});
     auto ids_shape_2d = get_dimensions(ids_shape_4d, {2, 3});
 
     activations = std::make_shared<ov::op::v1::Reshape>(activations, activations_shape_3d, false);
     ids = std::make_shared<ov::op::v1::Reshape>(ids, ids_shape_2d, false);
+    // Any index type other than i32/i64 is converted to i32 before it is used for gathering.
     if (ids.get_element_type() != ov::element::i32 && ids.get_element_type() != ov::element::i64) {
         ids = std::make_shared<ov::op::v0::Convert>(ids, ov::element::i32);
     }
 
+    // Scalar axis 0, shared by all three Gather ops in this function.
     auto gather_axis = ov::op::v0::Constant::create(ov::element::i32, ov::Shape{}, {0});
 
+    // Value table indexed by a 4-bit nibble; entries 8..15 are the negated entries 0..7.
     static const std::vector<float> f4e2m1_lut = {0.0f,  0.5f,  1.0f,  1.5f,  2.0f,  3.0f,  4.0f,  6.0f,
                                                   -0.0f, -0.5f, -1.0f, -1.5f, -2.0f, -3.0f, -4.0f, -6.0f};
+    // Scale table: entry i is the float whose bit pattern is i << 23, i.e. i placed in the
+    // binary32 exponent field with a zero mantissa.
     std::vector<float> e8m0_lut(256);
     for (size_t i = 0; i < e8m0_lut.size(); ++i) {
         uint32_t bits = static_cast<uint32_t>(i) << 23;
         memcpy(&e8m0_lut[i], &bits, sizeof(float));
     }
+    // The two end entries are overridden: 0 becomes half the smallest normal float instead of
+    // 0.0, and 255 becomes NaN instead of the +inf its bit pattern would produce.
     e8m0_lut[0] = std::numeric_limits<float>::min() / 2.0f;
     e8m0_lut[255] = std::numeric_limits<float>::quiet_NaN();
 
     auto f4_lut = ov::op::v0::Constant::create(ov::element::f32, ov::Shape{f4e2m1_lut.size()}, f4e2m1_lut);
     auto scale_lut = ov::op::v0::Constant::create(ov::element::f32, ov::Shape{e8m0_lut.size()}, e8m0_lut);
 
+    // Gathering on the expert axis with 2-D ids yields [ids0, ids1, rows, k_blocks, 17], so the
+    // per-block byte axis is axis 4.
     auto selected_packed_weights = std::make_shared<ov::op::v8::Gather>(expert_weights, ids, gather_axis);
     auto scale_byte = slice_axis(selected_packed_weights, 4, 0, 1);
     auto qs = slice_axis(selected_packed_weights, 4, 1, 17);
+    // Split every packed byte into its low and high nibble. All 16 low nibbles are placed
+    // before the 16 high nibbles along the last axis.
     auto low = std::make_shared<ov::op::v13::BitwiseAnd>(
         qs, ov::op::v0::Constant::create(ov::element::u8, ov::Shape{}, {0x0F}), ov::op::AutoBroadcastType::NUMPY);
     auto high_shift = std::make_shared<ov::op::v15::BitwiseRightShift>(
         qs, ov::op::v0::Constant::create(ov::element::u8, ov::Shape{}, {4}), ov::op::AutoBroadcastType::NUMPY);
     auto nibbles = std::make_shared<ov::op::v0::Concat>(ov::OutputVector{low, high_shift}, 4);
     auto nibble_indices = std::make_shared<ov::op::v0::Convert>(nibbles, ov::element::i32);
     auto weights_f32 = std::make_shared<ov::op::v8::Gather>(f4_lut, nibble_indices, gather_axis);
 
+    // One scale per block (last dim 1), broadcast across the block's 32 values.
     auto scale_indices = std::make_shared<ov::op::v0::Convert>(scale_byte, ov::element::i32);
     auto scales_f32 = std::make_shared<ov::op::v8::Gather>(scale_lut, scale_indices, gather_axis);
     ov::Output<ov::Node> selected_weights = std::make_shared<ov::op::v1::Multiply>(weights_f32, scales_f32,
                                                                                   ov::op::AutoBroadcastType::NUMPY);
 
+    // Merge [k_blocks, 32] into a single dimension of length cols.
     auto ids_shape = std::make_shared<ov::op::v3::ShapeOf>(ids, ov::element::i64);
     auto selected_weights_target_dims = std::make_shared<ov::op::v0::Concat>(
         ov::OutputVector{get_dimensions(ids_shape, {0, 1}), const_i64({rows, cols})}, 0);
     selected_weights = std::make_shared<ov::op::v1::Reshape>(selected_weights, selected_weights_target_dims, false);
 
+    // Expand the activations' middle dimension to ids dim 1 so each selected expert gets its
+    // own activation row.
     auto activations_shape = std::make_shared<ov::op::v3::ShapeOf>(activations, ov::element::i64);
     ov::Output<ov::Node> acts_target_dims = std::make_shared<ov::op::v0::Concat>(
         ov::OutputVector{
             get_dimensions(activations_shape, {0}),
             get_dimensions(ids_shape, {1}),
             get_dimensions(activations_shape, {2}),
         },
         0);
     ov::Output<ov::Node> acts_broadcasted =
         std::make_shared<ov::op::v3::Broadcast>(activations, acts_target_dims, ov::op::BroadcastType::BIDIRECTIONAL);
 
+    // [.., 1, cols] x transpose([.., rows, cols]) -> [.., 1, rows]
     auto activations_expanded = std::make_shared<ov::op::v0::Unsqueeze>(acts_broadcasted, const_i64({2}));
     ov::Output<ov::Node> result =
         std::make_shared<ov::op::v0::MatMul>(activations_expanded, selected_weights, false, true);
 
+    // Restore a leading unit dim and drop the unit matmul dim: [1, ids0, ids1, rows].
     auto batch_dim = ov::op::v0::Constant::create(ov::element::i64, {1}, {1});
     auto row_dim = ov::op::v0::Constant::create(ov::element::i64, {1}, {rows});
     auto result_target_dims = std::make_shared<ov::op::v0::Concat>(
         ov::OutputVector{batch_dim, get_dimensions(ids_shape, {0, 1}), row_dim}, 0);
     result = std::make_shared<ov::op::v1::Reshape>(result, result_target_dims, false);
 
     const auto output_type = context.get_output_type();
     if (result.get_element_type() != output_type) {
         result = std::make_shared<ov::op::v0::Convert>(result, output_type);
     }
     return result;
 }
+
+}  // namespace
+
 OutputVector translate_mul_mat_id(const NodeContext & context) {
    num_inputs_check(context, 3, 3);

@@ -26,6 +138,12 @@ OutputVector translate_mul_mat_id(const NodeContext & context) {
    auto activations = process_view_input_new(context, 1);
    auto ids = process_view_input_new(context, 2);

+    if (expert_weights.get_element_type() == ov::element::u8 && expert_weights.get_partial_shape().rank().is_static() &&
+        expert_weights.get_partial_shape().rank().get_length() == 5) {
+        return rename_outputs_with_suffix({translate_mul_mat_id_mxfp4_packed(context, expert_weights, activations, ids)},
+                                          context.get_name());
+    }
+
    // OpenVINO sees GGML tensors in reversed dimension order:
    //   weights: [1, n_expert, m, k]
    //   activations: [1, n_tokens, n_used_or_1, k]
@@ -6,12 +6,16 @@
 #include <cstdint>
 #include <cstring>
 #include <memory>
+#include <openvino/op/broadcast.hpp>
 #include <openvino/frontend/exception.hpp>
 #include <openvino/op/add.hpp>
+#include <openvino/op/concat.hpp>
 #include <openvino/op/constant.hpp>
 #include <openvino/op/convert.hpp>
 #include <openvino/op/multiply.hpp>
 #include <openvino/op/reshape.hpp>
+#include <openvino/op/shape_of.hpp>
+#include <openvino/op/slice.hpp>
 #include <openvino/op/softmax.hpp>
 #include <vector>

@@ -20,12 +24,31 @@ namespace frontend {
 namespace ggml {
 namespace op {

+// True only for a dimension that is statically known and equal to 1.
+static bool is_static_one(const ov::Dimension & dim) {
+    if (!dim.is_static()) {
+        return false;
+    }
+    return dim.get_length() == 1;
+}
+
+// True only when both dimensions are statically known and have the same length.
+static bool same_static_dim(const ov::Dimension & lhs, const ov::Dimension & rhs) {
+    if (!lhs.is_static() || !rhs.is_static()) {
+        return false;
+    }
+    return lhs.get_length() == rhs.get_length();
+}
+
+// Shape test used to recognise an attention-sinks input: both shapes must have static rank 4,
+// `candidate` must be [1, 1, 1, N], and N must statically equal dimension 1 of `logits_shape`.
+static bool is_attention_sinks_input_shape(const ov::PartialShape & candidate, const ov::PartialShape & logits_shape) {
+    const auto candidate_rank = candidate.rank();
+    const auto logits_rank = logits_shape.rank();
+
+    // Ranks must be known before get_length() may be queried.
+    if (candidate_rank.is_dynamic() || logits_rank.is_dynamic()) {
+        return false;
+    }
+    if (candidate_rank.get_length() != 4 || logits_rank.get_length() != 4) {
+        return false;
+    }
+
+    // The three leading dimensions must all be a static 1.
+    for (int axis = 0; axis < 3; ++axis) {
+        if (!is_static_one(candidate[axis])) {
+            return false;
+        }
+    }
+
+    return same_static_dim(candidate[3], logits_shape[1]);
+}
+
 // Reimplementation of GGML_OP_SOFT_MAX semantics for OpenVINO backend:
 // 1) logits = src0 * scale
 // 2) logits += mask (if provided)
-// 3) softmax over the last dimension
+// 3) append attention sinks as hidden logits (if provided)
+// 4) softmax over the last dimension and remove the hidden sink column
 OutputVector translate_soft_max(const NodeContext & context) {
-    num_inputs_check(context, 1, 2);
+    num_inputs_check(context, 1, 3);

    float scale = 1.0f;
    float max_bias = 0.0f;
@@ -33,6 +56,11 @@ OutputVector translate_soft_max(const NodeContext & context) {
    memcpy(&max_bias, (float *) context.get_output_op_params() + 1, sizeof(float));

    ov::Output<ov::Node> logits = context.get_input(0);
+    const bool second_input_is_sinks =
+        context.get_input_size() == 2 && is_attention_sinks_input_shape(context.get_input_shape(1), context.get_output_shape());
+    const bool has_mask = context.get_input_size() > 1 && !second_input_is_sinks;
+    const bool has_sinks = second_input_is_sinks || context.get_input_size() > 2;
+    const size_t sinks_input_idx = second_input_is_sinks ? 1 : 2;

    // Apply scale first: logits = src0 * scale
    if (scale != 1.0f) {
@@ -41,12 +69,12 @@ OutputVector translate_soft_max(const NodeContext & context) {
        logits = std::make_shared<ov::op::v1::Multiply>(logits, scale_const);
    }

-    FRONT_END_CHECK_IMPLEMENTED(!(max_bias > 0.0f && context.get_input_size() < 2),
+    FRONT_END_CHECK_IMPLEMENTED(!(max_bias > 0.0f && !has_mask),
                                "OpenVINO softmax ALiBi path requires mask input");

    // Optional mask add: logits += mask
    // For max_bias > 0 (ALiBi), apply per-head slope to mask before adding.
-    if (context.get_input_size() > 1) {
+    if (has_mask) {
        ov::Output<ov::Node> mask = context.get_input(1);

        // For stateful
@@ -94,8 +122,40 @@ OutputVector translate_soft_max(const NodeContext & context) {
        logits = std::make_shared<ov::op::v1::Add>(logits, mask);
    }

+    ov::Output<ov::Node> softmax_input = logits;
+    if (has_sinks) {
+        ov::Output<ov::Node> sinks = context.get_input(sinks_input_idx);
+        if (sinks.get_element_type() != logits.get_element_type()) {
+            sinks = std::make_shared<ov::op::v0::Convert>(sinks, logits.get_element_type());
+        }
+
+        auto sink_shape = ov::op::v0::Constant::create(ov::element::i64, {4}, {1, -1, 1, 1});
+        auto sinks_4d = std::make_shared<ov::op::v1::Reshape>(sinks, sink_shape, false);
+
+        auto logits_shape = std::make_shared<ov::op::v3::ShapeOf>(logits, ov::element::i64);
+        auto zero = ov::op::v0::Constant::create(ov::element::i64, {1}, {0});
+        auto one = ov::op::v0::Constant::create(ov::element::i64, {1}, {1});
+        auto three = ov::op::v0::Constant::create(ov::element::i64, {1}, {3});
+        auto four = ov::op::v0::Constant::create(ov::element::i64, {1}, {4});
+        auto shape_axis = ov::op::v0::Constant::create(ov::element::i64, {1}, {0});
+
+        auto sink_prefix_shape = std::make_shared<ov::op::v8::Slice>(logits_shape, zero, three, one, shape_axis);
+        auto sink_last_dim = ov::op::v0::Constant::create(ov::element::i64, {1}, {1});
+        auto sink_broadcast_shape = std::make_shared<ov::op::v0::Concat>(
+            ov::OutputVector{sink_prefix_shape, sink_last_dim}, 0);
+        auto sink_column = std::make_shared<ov::op::v3::Broadcast>(sinks_4d, sink_broadcast_shape,
+                                                                   ov::op::BroadcastType::BIDIRECTIONAL);
+        softmax_input = std::make_shared<ov::op::v0::Concat>(ov::OutputVector{logits, sink_column}, 3);
+
+        auto softmax_with_sink = std::make_shared<ov::op::v8::Softmax>(softmax_input, -1);
+        auto original_last_dim = std::make_shared<ov::op::v8::Slice>(logits_shape, three, four, one, shape_axis);
+        auto res = std::make_shared<ov::op::v8::Slice>(softmax_with_sink, zero, original_last_dim, one, three);
+
+        return rename_outputs_with_suffix({res}, context.get_name());
+    }
+
    // Softmax along last dimension (equivalent to ggml softmax over ne[0]).
-    auto res = std::make_shared<ov::op::v8::Softmax>(logits, -1);
+    auto res = std::make_shared<ov::op::v8::Softmax>(softmax_input, -1);

    return rename_outputs_with_suffix({res}, context.get_name());
 }
@@ -47,6 +47,7 @@ std::unordered_map<std::string, CreatorFunction> get_supported_ops() {
        {"GGML_UNARY_OP_TANH",      op::translate_1to1_match_1_input<v0::Tanh>     },
        {"GGML_OP_VIEW",            op::translate_view                             },
        {"GGML_GLU_OP_SWIGLU",      op::translate_glu_swiglu                       },
+        {"GGML_GLU_OP_SWIGLU_OAI",  op::translate_glu_swiglu_oai                   },
        {"GGML_GLU_OP_GEGLU",       op::translate_glu_geglu                        },
        {"GGML_OP_SET_ROWS",        op::translate_set_rows                         },
        {"GGML_OP_CPY",             op::translate_cpy                              },
@@ -32,6 +32,7 @@ GGML_OP_CONVERTER(translate_soft_max);
 GGML_OP_CONVERTER(translate_transpose);
 GGML_OP_CONVERTER(translate_view);
 GGML_OP_CONVERTER(translate_glu_swiglu);
+GGML_OP_CONVERTER(translate_glu_swiglu_oai);
 GGML_OP_CONVERTER(translate_glu_geglu);
 GGML_OP_CONVERTER(translate_set_rows);
 GGML_OP_CONVERTER(translate_cpy);
@@ -2,8 +2,10 @@
 #include "ggml-sycl/common.hpp"
 #include "ggml-sycl/presets.hpp"

-static void norm_f32(const float* x, float* dst, const int ncols, const int64_t stride_row, const int64_t stride_channel,
-        const int64_t stride_sample, const float eps, const sycl::nd_item<3>& item_ct1, sycl::float2* s_sum, int block_size) {
+static void norm_f32(const float* x, float* dst, const int ncols,
+    const int64_t src_stride_col, const int64_t src_stride_row, const int64_t src_stride_channel, const int64_t src_stride_sample,
+    const int64_t dst_stride_col, const int64_t dst_stride_row, const int64_t dst_stride_channel, const int64_t dst_stride_sample,
+    const float eps, const sycl::nd_item<3>& item_ct1, sycl::float2* s_sum, int block_size) {

    const int nrows = item_ct1.get_group_range(2);
    const int nchannels = item_ct1.get_group_range(1);
@@ -16,16 +18,16 @@ static void norm_f32(const float* x, float* dst, const int ncols, const int64_t
    const int tid = item_ct1.get_local_id(2);
    const int nwarps = nthreads / WARP_SIZE;

-    const auto strided_offset = calculate_offset<3>({stride_sample, stride_channel, stride_row}, {sample, channel, row});
-    const auto packed_offset = calculate_offset<3>({nchannels * nrows * ncols, nrows * ncols, ncols}, {sample, channel, row});
+    const auto src_offset = calculate_offset<3>({src_stride_sample, src_stride_channel, src_stride_row}, {sample, channel, row});
+    const auto dst_offset = calculate_offset<3>({dst_stride_sample, dst_stride_channel, dst_stride_row}, {sample, channel, row});

-    x += strided_offset;
-    dst += packed_offset;
+    x += src_offset;
+    dst += dst_offset;

    sycl::float2 mean_var = sycl::float2(0.f, 0.f);

    for (int col = tid; col < ncols; col += block_size) {
-        const float xi = x[col];
+        const float xi = x[col * src_stride_col];
        mean_var.x() += xi;
        mean_var.y() += xi * xi;
    }
@@ -54,7 +56,7 @@ static void norm_f32(const float* x, float* dst, const int ncols, const int64_t
    const float inv_std = sycl::rsqrt(var + eps);

    for (int col = tid; col < ncols; col += block_size) {
-        dst[col] = (x[col] - mean) * inv_std;
+        dst[col * dst_stride_col] = (x[col * src_stride_col] - mean) * inv_std;
    }
 }

@@ -145,8 +147,10 @@ static void group_norm_f32(const float* x, float* dst, const int group_size, con
    }
 }

-static void rms_norm_f32(const float* x, float* dst, const int ncols, const int64_t stride_row, const int64_t stride_channel,
-        const int64_t stride_sample, const float eps, const sycl::nd_item<3>& item_ct1, float* s_sum, int block_size) {
+static void rms_norm_f32(const float* x, float* dst, const int ncols,
+    const int64_t src_stride_col, const int64_t src_stride_row, const int64_t src_stride_channel, const int64_t src_stride_sample,
+    const int64_t dst_stride_col, const int64_t dst_stride_row, const int64_t dst_stride_channel, const int64_t dst_stride_sample,
+    const float eps, const sycl::nd_item<3>& item_ct1, float* s_sum, int block_size) {

    const int nrows = item_ct1.get_group_range(2);
    const int nchannels = item_ct1.get_group_range(1);
@@ -160,17 +164,17 @@ static void rms_norm_f32(const float* x, float* dst, const int ncols, const int6
    const int tid = item_ct1.get_local_id(2);
    const int nwarps = nthreads / WARP_SIZE;

-    const auto strided_offset = calculate_offset<3>({stride_sample, stride_channel, stride_row}, {sample, channel, row});
-    const auto packed_offset = calculate_offset<3>({nchannels * nrows * ncols, nrows * ncols, ncols}, {sample, channel, row});
+    const auto src_offset = calculate_offset<3>({src_stride_sample, src_stride_channel, src_stride_row}, {sample, channel, row});
+    const auto dst_offset = calculate_offset<3>({dst_stride_sample, dst_stride_channel, dst_stride_row}, {sample, channel, row});

-    x   += strided_offset;
-    dst += packed_offset;
+    x   += src_offset;
+    dst += dst_offset;


    float tmp = 0.0f; // partial sum for thread in warp

    for (int col = tid; col < ncols; col += block_size) {
-        const float xi = x[col];
+        const float xi = x[col * src_stride_col];
        tmp += xi * xi;
    }

@@ -198,14 +202,15 @@ static void rms_norm_f32(const float* x, float* dst, const int ncols, const int6
    const float scale = sycl::rsqrt(mean + eps);

    for (int col = tid; col < ncols; col += block_size) {
-        dst[col] = scale * x[col];
+        dst[col * dst_stride_col] = scale * x[col * src_stride_col];
    }
 }

 template<int warp_size>
 static void l2_norm_f32(const float * x, float * dst, const int ncols,
-    const int64_t stride_row, const int64_t stride_channel,
-    const int64_t stride_sample, const float eps,
+    const int64_t src_stride_col, const int64_t src_stride_row, const int64_t src_stride_channel,
+    const int64_t src_stride_sample, const int64_t dst_stride_col, const int64_t dst_stride_row,
+    const int64_t dst_stride_channel, const int64_t dst_stride_sample, const float eps,
    const sycl::nd_item<3>& item_ct1, float* s_sum, const int block_size) {
    const int nrows     = item_ct1.get_group_range(2);
    const int nchannels = item_ct1.get_group_range(1);
@@ -215,13 +220,13 @@ static void l2_norm_f32(const float * x, float * dst, const int ncols,
    const int sample  = item_ct1.get_group(0);
    const int tid     = item_ct1.get_local_id(2);

-    x   += sample*stride_sample + channel*stride_channel + row*stride_row;
-    dst += ((sample*nchannels + channel)*nrows + row)*ncols;
+    x   += sample*src_stride_sample + channel*src_stride_channel + row*src_stride_row;
+    dst += sample*dst_stride_sample + channel*dst_stride_channel + row*dst_stride_row;

    float tmp = 0.0f; // partial sum for thread in warp

    for (int col = tid; col < ncols; col += block_size) {
-        const float xi = x[col];
+        const float xi = x[col * src_stride_col];
        tmp += xi * xi;
    }

@@ -229,12 +234,13 @@ static void l2_norm_f32(const float * x, float * dst, const int ncols,
    const float scale = sycl::rsqrt(sycl::fmax(tmp, eps * eps));

    for (int col = tid; col < ncols; col += block_size) {
-        dst[col] = scale * x[col];
+        dst[col * dst_stride_col] = scale * x[col * src_stride_col];
    }
 }

 static void norm_f32_sycl(const float * x, float * dst, const int ncols, const int nrows, const int nchannels, const int nsamples,
-        const int64_t stride_row, const int64_t stride_channel, const int64_t stride_sample,
+    const int64_t src_stride_col, const int64_t src_stride_row, const int64_t src_stride_channel, const int64_t src_stride_sample,
+    const int64_t dst_stride_col, const int64_t dst_stride_row, const int64_t dst_stride_channel, const int64_t dst_stride_sample,
        const float eps, queue_ptr stream, int device) {

    const sycl::range<3> global_dims(nsamples, nchannels, nrows);
@@ -245,7 +251,10 @@ static void norm_f32_sycl(const float * x, float * dst, const int ncols, const i
                sycl::nd_range<3>(global_dims * block_dims, block_dims),
                [=](sycl::nd_item<3> item_ct1)
                [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
-                    norm_f32(x, dst, ncols, stride_row, stride_channel, stride_sample, eps, item_ct1, nullptr, WARP_SIZE);
+                    norm_f32(x, dst, ncols,
+                        src_stride_col, src_stride_row, src_stride_channel, src_stride_sample,
+                        dst_stride_col, dst_stride_row, dst_stride_channel, dst_stride_sample,
+                        eps, item_ct1, nullptr, WARP_SIZE);
                });
            });
    }
@@ -265,7 +274,10 @@ static void norm_f32_sycl(const float * x, float * dst, const int ncols, const i
                sycl::nd_range<3>(global_dims * block_dims, block_dims),
                [=](sycl::nd_item<3> item_ct1)
                [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
-                    norm_f32(x, dst, ncols, stride_row, stride_channel, stride_sample, eps, item_ct1, get_pointer(s_sum_acc_ct1), work_group_size);
+                    norm_f32(x, dst, ncols,
+                        src_stride_col, src_stride_row, src_stride_channel, src_stride_sample,
+                        dst_stride_col, dst_stride_row, dst_stride_channel, dst_stride_sample,
+                        eps, item_ct1, get_pointer(s_sum_acc_ct1), work_group_size);
                });
            });
    }
@@ -319,7 +331,9 @@ static void group_norm_f32_sycl(const float* x, float* dst,
 }

 static void rms_norm_f32_sycl(const float* x, float* dst, const int ncols, const int nrows, const int nchannels, const int nsamples,
-        const int64_t stride_row, const int64_t stride_channel, const int64_t stride_sample, const float eps, queue_ptr stream, int device) {
+    const int64_t src_stride_col, const int64_t src_stride_row, const int64_t src_stride_channel, const int64_t src_stride_sample,
+    const int64_t dst_stride_col, const int64_t dst_stride_row, const int64_t dst_stride_channel, const int64_t dst_stride_sample,
+    const float eps, queue_ptr stream, int device) {
    // printf("%s ncols=%d, nrows=%d, WARP_SIZE=%d\n", __func__, ncols, nrows, WARP_SIZE);

    const sycl::range<3> global_dims(nsamples, nchannels, nrows);
@@ -330,7 +344,10 @@ static void rms_norm_f32_sycl(const float* x, float* dst, const int ncols, const
                sycl::nd_range<3>(global_dims * block_dims, block_dims),
                [=](sycl::nd_item<3> item_ct1)
                [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
-                    rms_norm_f32(x, dst, ncols, stride_row, stride_channel, stride_sample, eps, item_ct1, nullptr, WARP_SIZE);
+                    rms_norm_f32(x, dst, ncols,
+                        src_stride_col, src_stride_row, src_stride_channel, src_stride_sample,
+                        dst_stride_col, dst_stride_row, dst_stride_channel, dst_stride_sample,
+                        eps, item_ct1, nullptr, WARP_SIZE);
                });
            });
    }
@@ -350,7 +367,10 @@ static void rms_norm_f32_sycl(const float* x, float* dst, const int ncols, const
                sycl::nd_range<3>(global_dims * block_dims, block_dims),
                [=](sycl::nd_item<3> item_ct1)
                [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
-                    rms_norm_f32(x, dst, ncols, stride_row, stride_channel, stride_sample, eps, item_ct1, get_pointer(s_sum_acc_ct1), work_group_size);
+                    rms_norm_f32(x, dst, ncols,
+                        src_stride_col, src_stride_row, src_stride_channel, src_stride_sample,
+                        dst_stride_col, dst_stride_row, dst_stride_channel, dst_stride_sample,
+                        eps, item_ct1, get_pointer(s_sum_acc_ct1), work_group_size);
                });
            });
    }
@@ -363,9 +383,14 @@ static void l2_norm_f32_sycl(const float *   x,
                             const int       nrows,
                             const int       nchannels,
                             const int       nsamples,
-                             const int64_t   stride_row,
-                             const int64_t   stride_channel,
-                             const int64_t   stride_sample,
+                             const int64_t   src_stride_col,
+                             const int64_t   src_stride_row,
+                             const int64_t   src_stride_channel,
+                             const int64_t   src_stride_sample,
+                             const int64_t   dst_stride_col,
+                             const int64_t   dst_stride_row,
+                             const int64_t   dst_stride_channel,
+                             const int64_t   dst_stride_sample,
                             const float     eps,
                             queue_ptr       stream,
                             int             device) {
@@ -379,7 +404,10 @@ static void l2_norm_f32_sycl(const float *   x,
                    block_dims),
                [=](sycl::nd_item<3> item_ct1)
                [[sycl::reqd_sub_group_size(warp_size)]] {
-                    l2_norm_f32<warp_size>(x, dst, ncols, stride_row, stride_channel, stride_sample, eps, item_ct1,
+                    l2_norm_f32<warp_size>(x, dst, ncols,
+                        src_stride_col, src_stride_row, src_stride_channel, src_stride_sample,
+                        dst_stride_col, dst_stride_row, dst_stride_channel, dst_stride_sample,
+                        eps, item_ct1,
                        nullptr, warp_size);
                });
            });
@@ -398,7 +426,9 @@ static void l2_norm_f32_sycl(const float *   x,
                    block_dims),
                [=](sycl::nd_item<3> item_ct1)
                [[sycl::reqd_sub_group_size(warp_size)]] {
-                    l2_norm_f32<warp_size>(x, dst, ncols, stride_row, stride_channel, stride_sample,
+                    l2_norm_f32<warp_size>(x, dst, ncols,
+                        src_stride_col, src_stride_row, src_stride_channel, src_stride_sample,
+                        dst_stride_col, dst_stride_row, dst_stride_channel, dst_stride_sample,
                        eps, item_ct1, get_pointer(s_sum_acc_ct1), work_group_size);
                });
            });
@@ -421,12 +451,20 @@ void ggml_sycl_op_norm(ggml_backend_sycl_context& ctx, ggml_tensor* dst) {
    memcpy(&eps, dst->op_params, sizeof(float));
    GGML_ASSERT(eps >= 0.0f);
    const size_t ts0 = ggml_type_size(src0->type);
-    GGML_ASSERT(nb00 == ts0);
-    const int64_t s01 = nb01 / ts0;
-    const int64_t s02 = nb02 / ts0;
-    const int64_t s03 = nb03 / ts0;
+    const size_t tdst = ggml_type_size(dst->type);
+    GGML_ASSERT(nb00 % ts0 == 0 && nb01 % ts0 == 0 && nb02 % ts0 == 0 && nb03 % ts0 == 0);
+    GGML_ASSERT(nb0 % tdst == 0 && nb1 % tdst == 0 && nb2 % tdst == 0 && nb3 % tdst == 0);
+    const int64_t ss0 = nb00 / ts0;
+    const int64_t ss1 = nb01 / ts0;
+    const int64_t ss2 = nb02 / ts0;
+    const int64_t ss3 = nb03 / ts0;
+    const int64_t ds0 = nb0 / tdst;
+    const int64_t ds1 = nb1 / tdst;
+    const int64_t ds2 = nb2 / tdst;
+    const int64_t ds3 = nb3 / tdst;

-    norm_f32_sycl(src0_dd, dst_dd, ne00, ne01, ne02, ne03, s01, s02, s03, eps, main_stream, ctx.device);
+    norm_f32_sycl(src0_dd, dst_dd, ne00, ne01, ne02, ne03,
+        ss0, ss1, ss2, ss3, ds0, ds1, ds2, ds3, eps, main_stream, ctx.device);
 }

 void ggml_sycl_op_group_norm(ggml_backend_sycl_context& ctx, ggml_tensor* dst) {
@@ -465,11 +503,19 @@ void ggml_sycl_op_rms_norm(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {

    GGML_TENSOR_UNARY_OP_LOCALS
    const size_t ts0 = ggml_type_size(src0->type);
-    GGML_ASSERT(nb00 == ts0);
-    const int64_t s01 = nb01 / ts0;
-    const int64_t s02 = nb02 / ts0;
-    const int64_t s03 = nb03 / ts0;
-    rms_norm_f32_sycl(src0_dd, dst_dd, ne00, ne01, ne02, ne03, s01, s02, s03, eps, main_stream, ctx.device);
+    const size_t tdst = ggml_type_size(dst->type);
+    GGML_ASSERT(nb00 % ts0 == 0 && nb01 % ts0 == 0 && nb02 % ts0 == 0 && nb03 % ts0 == 0);
+    GGML_ASSERT(nb0 % tdst == 0 && nb1 % tdst == 0 && nb2 % tdst == 0 && nb3 % tdst == 0);
+    const int64_t ss0 = nb00 / ts0;
+    const int64_t ss1 = nb01 / ts0;
+    const int64_t ss2 = nb02 / ts0;
+    const int64_t ss3 = nb03 / ts0;
+    const int64_t ds0 = nb0 / tdst;
+    const int64_t ds1 = nb1 / tdst;
+    const int64_t ds2 = nb2 / tdst;
+    const int64_t ds3 = nb3 / tdst;
+    rms_norm_f32_sycl(src0_dd, dst_dd, ne00, ne01, ne02, ne03,
+        ss0, ss1, ss2, ss3, ds0, ds1, ds2, ds3, eps, main_stream, ctx.device);
 }

 void ggml_sycl_op_rms_norm_back(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
@@ -644,13 +690,21 @@ void ggml_sycl_op_l2_norm(ggml_backend_sycl_context& ctx, ggml_tensor* dst) {
    GGML_ASSERT(eps >= 0.0f);

    const size_t ts0 = ggml_type_size(src0->type);
-    GGML_ASSERT(nb00 == ts0);
-    const int64_t s01 = nb01 / ts0;
-    const int64_t s02 = nb02 / ts0;
-    const int64_t s03 = nb03 / ts0;
+    const size_t tdst = ggml_type_size(dst->type);
+    GGML_ASSERT(nb00 % ts0 == 0 && nb01 % ts0 == 0 && nb02 % ts0 == 0 && nb03 % ts0 == 0);
+    GGML_ASSERT(nb0 % tdst == 0 && nb1 % tdst == 0 && nb2 % tdst == 0 && nb3 % tdst == 0);
+    const int64_t ss0 = nb00 / ts0;
+    const int64_t ss1 = nb01 / ts0;
+    const int64_t ss2 = nb02 / ts0;
+    const int64_t ss3 = nb03 / ts0;
+    const int64_t ds0 = nb0 / tdst;
+    const int64_t ds1 = nb1 / tdst;
+    const int64_t ds2 = nb2 / tdst;
+    const int64_t ds3 = nb3 / tdst;

    /*support both WARP_SIZE or WARP_32_SIZE in code
      choose by hardware for better performance
    */
-    l2_norm_f32_sycl<WARP_SIZE>(src0_d, dst_d, ne00, ne01, ne02, ne03, s01, s02, s03, eps, stream, ctx.device);
+    l2_norm_f32_sycl<WARP_SIZE>(src0_d, dst_d, ne00, ne01, ne02, ne03,
+            ss0, ss1, ss2, ss3, ds0, ds1, ds2, ds3, eps, stream, ctx.device);
 }
@@ -126,7 +126,7 @@ static void soft_max_f32(const float *         x,
            break;
        }

-        const float val = sycl::native::exp(vals[col] - max_val);
+        const float val = sycl::native::exp(sycl::max(vals[col] - max_val, -80.0f));
        tmp += val;
        vals[col] = val;
    }
@@ -154,7 +154,7 @@ static void soft_max_f32(const float *         x,
        tmp = warp_reduce_sum<WARP_SIZE>(tmp);
    }
    if (sinks) {
-        tmp += sycl::native::exp(sinks[i02] - max_val);
+        tmp += sycl::native::exp(sycl::max(sinks[i02] - max_val, -80.0f));
    }
    const float inv_sum = 1.0f / tmp;

@@ -308,6 +308,7 @@ enum vk_device_architecture {
    AMD_RDNA1,
    AMD_RDNA2,
    AMD_RDNA3,
+    INTEL_XE1,
    INTEL_XE2,
    NVIDIA_PRE_TURING,
    NVIDIA_TURING,
@@ -365,21 +366,26 @@ static vk_device_architecture get_device_architecture(const vk::PhysicalDevice&
        const std::vector<vk::ExtensionProperties> ext_props = device.enumerateDeviceExtensionProperties();

        bool subgroup_size_control = false;
+        bool integer_dot_product = false;

        for (const auto& properties : ext_props) {
            if (strcmp("VK_EXT_subgroup_size_control", properties.extensionName) == 0) {
                subgroup_size_control = true;
+            } else if (strcmp("VK_KHR_shader_integer_dot_product", properties.extensionName) == 0) {
+                integer_dot_product = true;
            }
        }

-        if (!subgroup_size_control) {
+        if (!subgroup_size_control || !integer_dot_product) {
            return vk_device_architecture::OTHER;
        }

        vk::PhysicalDeviceProperties2 props2;
        vk::PhysicalDeviceSubgroupSizeControlPropertiesEXT subgroup_size_control_props;
+        vk::PhysicalDeviceShaderIntegerDotProductPropertiesKHR integer_dot_props;

        props2.pNext = &subgroup_size_control_props;
+        subgroup_size_control_props.pNext = &integer_dot_props;
        device.getProperties2(&props2);

        if (subgroup_size_control_props.minSubgroupSize == 16) {
@@ -388,6 +394,9 @@ static vk_device_architecture get_device_architecture(const vk::PhysicalDevice&
            // https://www.intel.com/content/www/us/en/content-details/824434/2024-intel-tech-tour-xe2-and-lunar-lake-s-gpu.html
            // https://www.intel.com/content/www/us/en/docs/oneapi/optimization-guide-gpu/2025-0/intel-xe-gpu-architecture.html
            return vk_device_architecture::INTEL_XE2;
+        } else if (subgroup_size_control_props.minSubgroupSize == 8 &&
+                 integer_dot_product && integer_dot_props.integerDotProduct4x8BitPackedSignedAccelerated) {
+            return vk_device_architecture::INTEL_XE1;
        }
    } else if (props.vendorID == VK_VENDOR_ID_NVIDIA) {
        const std::vector<vk::ExtensionProperties> ext_props = device.enumerateDeviceExtensionProperties();
@@ -3837,7 +3846,7 @@ static void ggml_vk_load_shaders(vk_device& device, vk_pipeline requested) {
            l_warptile = { 256, 128, 128, 16, subgroup_size_8, 64, 2, tm_m, tn_m, tk_m, subgroup_size_8 };
            l_warptile_mmq = l_warptile_mmq_int = { 256, 128, 128, 32, subgroup_size_8, 64, 2, tm_m, tn_m, tk_m, subgroup_size_8 };
            l_warptile_mmq_int_k = { 256, 128, 128, 32, subgroup_size_16, 64, 1, 4, 2, 1, subgroup_size_16 };
-        } else if (device->vendor_id == VK_VENDOR_ID_INTEL && device->coopmat_support && device->architecture == INTEL_XE2) {
+        } else if (device->vendor_id == VK_VENDOR_ID_INTEL && device->coopmat_support) {
            // Xe2/Xe3 with coopmat enabled - warptile performance tuning
            l_warptile = { 512, 128, 128, 16, subgroup_size_8, 32, 2, tm_m, tn_m, tk_m, subgroup_size_8 };
            l_warptile_mmq = { 512, 128, 128, 32, subgroup_size_8, 32, 2, tm_m, tn_m, tk_m, subgroup_size_8 };
@@ -4710,7 +4719,7 @@ static void ggml_vk_load_shaders(vk_device& device, vk_pipeline requested) {
    }
    uint32_t rm_iq = 2 * rm_kq;

-    const bool use_subgroups = device->subgroup_arithmetic && device->architecture != vk_device_architecture::AMD_GCN;
+    const bool use_subgroups = device->subgroup_arithmetic;
    // Ensure a subgroup size >= 16 is available
    const bool use_subgroups16 = use_subgroups && subgroup_min_size_16;

@@ -6361,9 +6370,8 @@ static vk_device ggml_vk_get_device(size_t idx) {
                break;
            case VK_VENDOR_ID_INTEL: {
                // Current Windows driver does not expose BF16 support.
-                // We only want to use l_warptile if coopmat is available and is Xe2+
-                const bool xe2_with_coopmat = device->coopmat_support && device->architecture == INTEL_XE2;
-                const bool use_l_warptile = (i == GGML_TYPE_BF16) ? (device->coopmat_bf16_support && xe2_with_coopmat) : xe2_with_coopmat;
+                // We only want to use l_warptile if coopmat is available
+                const bool use_l_warptile = (i == GGML_TYPE_BF16) ? (device->coopmat_bf16_support && device->coopmat_support) : device->coopmat_support;
                device->mul_mat_l[i] = use_l_warptile;
                device->mul_mat_id_l[i] = use_l_warptile;
                device->mul_mat_m[i] = true;
@@ -17890,9 +17898,9 @@ static bool ggml_vk_device_is_supported(const vk::PhysicalDevice & vkdev) {
 static bool ggml_vk_khr_cooperative_matrix_support(const vk::PhysicalDeviceProperties& props, const vk::PhysicalDeviceDriverProperties& driver_props, vk_device_architecture arch) {
    switch (props.vendorID) {
    case VK_VENDOR_ID_INTEL:
-        // Only allowing Xe2 GPU at the moment since Xe2 GPU can gain significant performance boost,
-        // while some older hardware (ex. Arc A770) has performance regressions
-        return arch == vk_device_architecture::INTEL_XE2;
+        // Only allowing Xe2/Xe3 GPU and integrated Xe GPUs at the moment since older hardware (ex. Arc A770) has performance regressions.
+        return (arch == vk_device_architecture::INTEL_XE2) ||
+            (arch == vk_device_architecture::INTEL_XE1 && props.deviceType == vk::PhysicalDeviceType::eIntegratedGpu && driver_props.driverID == vk::DriverId::eIntelProprietaryWindows);
    case VK_VENDOR_ID_AMD:
        if (driver_props.driverID == vk::DriverId::eAmdProprietary || driver_props.driverID == vk::DriverId::eAmdOpenSource) {
            // Workaround for AMD proprietary driver reporting support on all GPUs
@@ -17940,6 +17948,8 @@ static uint32_t ggml_vk_intel_shader_core_count(const vk::PhysicalDevice& vkdev)
    case 0xE20B:  // B580
    case 0xE211:  // Pro B60
        return 20;
+    case 0xB080:  // PTL Xe3 LPG 2x6 (12 subslices)
+        return 12;
    default:
        return 0;
    }
@@ -158,7 +158,7 @@ const uint32_t Csh_stride = BS_NPQ;
 #ifdef COOPMAT
 const uint32_t Csh_len    = BS_K * Csh_stride;
 #else
-const uint32_t Csh_len    = csh_store != 0 ? BS_K * Csh_stride : 1;
+const uint32_t Csh_len    = csh_store != 0 ? BS_K * Csh_stride : 8; // 8 to workaround compiler bug
 #endif
 shared SHMEM_TYPE Csh[Csh_len];  // K x NPQ
 #endif
@@ -144,7 +144,7 @@ const uint32_t Csh_stride = BS_NPQ;
 #ifdef COOPMAT
 const uint32_t Csh_len    = BS_K * Csh_stride;
 #else
-const uint32_t Csh_len    = csh_store != 0 ? BS_K * Csh_stride : 1;
+const uint32_t Csh_len    = csh_store != 0 ? BS_K * Csh_stride : 8; // 8 to workaround compiler bug
 #endif
 shared SHMEM_TYPE Csh[Csh_len];  // K x NPQ
 #endif
@@ -28,13 +28,10 @@ vec2 cache_b_ds;

 #include "mul_mat_vecq_funcs.glsl"

-void iter(inout FLOAT_TYPE temp[NUM_COLS][NUM_ROWS], const uint first_row, const uint num_rows, const uint tid, const uint i) {
+void iter(inout FLOAT_TYPE temp[NUM_COLS][NUM_ROWS], const uint first_row, const uint num_rows, const uint col, const uint b_qs_idx) {
    [[unroll]] for (uint j = 0; j < NUM_COLS; ++j) {
-        const uint col = i*BLOCK_SIZE + tid*K_PER_ITER;
-
        // Preload data_b block
        const uint b_block_idx = (j*p.batch_stride_b + col) / QUANT_K_Q8_1 + b_offset;
-        const uint b_qs_idx = tid % (32 / K_PER_ITER);
        const uint b_block_idx_outer = b_block_idx / 4;
        const uint b_block_idx_inner = b_block_idx % 4;
        cache_b_ds = vec2(data_b[b_block_idx_outer].ds[b_block_idx_inner]);
@@ -91,35 +88,35 @@ void compute_outputs(const uint32_t first_row, const uint32_t num_rows) {
        }
    }

-    uint num_iters = p.ncols / (K_PER_ITER * BLOCK_SIZE);
-    if (num_iters * K_PER_ITER * BLOCK_SIZE + K_PER_ITER*tid < p.ncols) {
+    const uint col_stride = K_PER_ITER * BLOCK_SIZE;
+    uint num_iters = p.ncols / col_stride;
+    if (num_iters * col_stride + K_PER_ITER * tid < p.ncols) {
        num_iters++;
    }
-    int unroll_count = 4;
-    uint unrolled_iters = num_iters & ~(unroll_count - 1);

-    uint i = 0;
-    while (i < unrolled_iters) {
+    const uint b_qs_idx = tid % (32 / K_PER_ITER);
+    uint col = tid * K_PER_ITER;
+    while (num_iters >= 4) {
        // Manually partially unroll the loop
-        [[unroll]] for (uint k = 0; k < unroll_count; ++k) {
-            iter(temp, first_row, num_rows, tid, i*K_PER_ITER);
-            i++;
+        [[unroll]] for (uint k = 0; k < 4; ++k) {
+            iter(temp, first_row, num_rows, col, b_qs_idx);
+            col += col_stride;
        }
+
+        num_iters -= 4;
    }

-    unroll_count = 2;
-    unrolled_iters = num_iters & ~(unroll_count - 1);
-
-    while (i < unrolled_iters) {
+    if (num_iters >= 2) {
        // Manually partially unroll the loop
-        [[unroll]] for (uint k = 0; k < unroll_count; ++k) {
-            iter(temp, first_row, num_rows, tid, i*K_PER_ITER);
-            i++;
-        }
+        iter(temp, first_row, num_rows, col, b_qs_idx);
+        col += col_stride;
+        iter(temp, first_row, num_rows, col, b_qs_idx);
+        col += col_stride;
+        num_iters -= 2;
    }
-    while (i < num_iters) {
-        iter(temp, first_row, num_rows, tid, i*K_PER_ITER);
-        i++;
+
+    if (num_iters > 0) {
+        iter(temp, first_row, num_rows, col, b_qs_idx);
    }

    reduce_result(temp, d_offset, first_row, num_rows, tid);
@@ -42,7 +42,7 @@ float op_leaky_relu(float x) {
 }

 float op_step(float x) {
-    return x >= 0.0f ? 1.0f : 0.0f;
+    return x > 0.0f ? 1.0f : 0.0f;
 }

 float op_tanh(float x) {
@@ -1 +1 @@
-707321c4cf6d21cb4bc831aa8b687dbf01a521ce
+eced84c86f8b012c752c016f7fe789adea168e1e
@@ -256,7 +256,7 @@ llama_context::llama_context(
    LLAMA_LOG_INFO("%s: n_outputs_max = %u\n",   __func__, cparams.n_outputs_max);

    if (cparams.n_ctx_seq < hparams.n_ctx_train) {
-        LLAMA_LOG_WARN("%s: n_ctx_seq (%u) < n_ctx_train (%u) -- the full capacity of the model will not be utilized\n",
+        LLAMA_LOG_INFO("%s: n_ctx_seq (%u) < n_ctx_train (%u) -- the full capacity of the model will not be utilized\n",
                __func__, cparams.n_ctx_seq, hparams.n_ctx_train);
    }

@@ -169,7 +169,6 @@ ggml_tensor * llm_build_mamba_base::build_mamba2_layer(llm_graph_input_rs * inp,
    GGML_ASSERT(ubatch.equal_seqs());
    GGML_ASSERT(ubatch.n_tokens == n_seq_tokens * n_seqs);
    GGML_ASSERT(d_inner % n_head  == 0);
-    GGML_ASSERT(d_inner % d_state == 0);
    GGML_ASSERT(d_inner % n_group == 0);

    ggml_tensor * conv_states_all = mctx_cur->get_r_l(il);
@@ -39,10 +39,11 @@ void llama_model_mamba2::load_arch_tensors(llama_model_loader &) {
    const int64_t d_inner = hparams.ssm_d_inner;
    const int64_t d_state = hparams.ssm_d_state;
    const int64_t n_group = hparams.ssm_n_group;
-    const int64_t d_in_proj = 2*d_inner + 2*n_group*d_state + n_head;
+    const int64_t dt_rank  = hparams.ssm_dt_rank;
+
+    const int64_t conv_dim = d_inner + 2 * n_group * d_state;
+    const int64_t d_in_proj = d_inner + conv_dim + dt_rank;

-    // only an expansion factor of 2 is supported for now
-    GGML_ASSERT(2 * n_embd == d_inner);

    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);

@@ -68,11 +69,11 @@ void llama_model_mamba2::load_arch_tensors(llama_model_loader &) {
        layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), {d_conv, d_inner + 2*n_group*d_state}, 0);
        layer.ssm_conv1d_b = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "bias", i), {d_inner + 2*n_group*d_state}, 0);

-        layer.ssm_dt_b = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), {n_head}, 0);
+        layer.ssm_dt_b = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), {dt_rank}, 0);

        // no "weight" suffix for these
-        layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), {1, n_head}, 0);
-        layer.ssm_d = create_tensor(tn(LLM_TENSOR_SSM_D, i), {1, n_head}, 0);
+        layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), {1, dt_rank}, 0);
+        layer.ssm_d = create_tensor(tn(LLM_TENSOR_SSM_D, i), {1, dt_rank}, 0);

        layer.ssm_norm = create_tensor(tn(LLM_TENSOR_SSM_NORM, "weight", i), {d_inner / n_group, n_group}, 0);

@@ -302,9 +302,9 @@ target_link_libraries(${TEST_TARGET} PRIVATE llama)
 llama_build_and_test(test-alloc.cpp)
 target_include_directories(test-alloc PRIVATE ${PROJECT_SOURCE_DIR}/ggml/src)

-llama_build(export-graph-ops.cpp)
-target_include_directories(export-graph-ops PRIVATE ${PROJECT_SOURCE_DIR}/ggml/src)
+llama_build(test-export-graph-ops.cpp)
+target_include_directories(test-export-graph-ops PRIVATE ${PROJECT_SOURCE_DIR}/ggml/src)
 if (TARGET gguf-model-data)
-    target_link_libraries(export-graph-ops PRIVATE gguf-model-data)
-    target_compile_definitions(export-graph-ops PRIVATE LLAMA_HF_FETCH)
+    target_link_libraries(test-export-graph-ops PRIVATE gguf-model-data)
+    target_compile_definitions(test-export-graph-ops PRIVATE LLAMA_HF_FETCH)
 endif()
@@ -2890,12 +2890,17 @@ struct test_cpy : public test_case {
    const std::array<int64_t, 4> ne_dst;
    const std::array<int64_t, 4> permute_src;
    const std::array<int64_t, 4> permute_dst;
+    const std::array<int64_t, 4> dst_alloc; // if set, dst is a view into a larger buffer (strided)
    bool _src_use_permute;
    bool _dst_use_permute;
    bool _src_transpose;
    bool _use_dst_shape;
+    bool _use_dst_alloc;

    std::string vars() override {
+        if (_use_dst_alloc) {
+            return VARS_TO_STR8(type_src, type_dst, ne_src, ne_dst, permute_src, permute_dst, _src_transpose, dst_alloc);
+        }
        if (_use_dst_shape) {
            return VARS_TO_STR7(type_src, type_dst, ne_src, ne_dst, permute_src, permute_dst, _src_transpose);
        }
@@ -2943,12 +2948,15 @@ struct test_cpy : public test_case {
            std::array<int64_t, 4> ne_dst = {-1, -1, -1, -1},
            std::array<int64_t, 4> permute_src = {0, 0, 0, 0},
            std::array<int64_t, 4> permute_dst = {0, 0, 0, 0},
-            bool transpose_src = false)
+            bool transpose_src = false,
+            std::array<int64_t, 4> dst_alloc = {0, 0, 0, 0})
        : type_src(type_src), type_dst(type_dst), ne_src(ne_src), ne_dst(ne_dst), permute_src(permute_src), permute_dst(permute_dst),
+          dst_alloc(dst_alloc),
          _src_use_permute(permute_src[0] + permute_src[1] + permute_src[2] + permute_src[3] > 0),
          _dst_use_permute(permute_dst[0] + permute_dst[1] + permute_dst[2] + permute_dst[3] > 0),
          _src_transpose(transpose_src),
-          _use_dst_shape(ne_dst[0] >= 0 && ne_dst[1] >= 0 && ne_dst[2] >= 0 && ne_dst[3] >= 0){}
+          _use_dst_shape(ne_dst[0] >= 0 && ne_dst[1] >= 0 && ne_dst[2] >= 0 && ne_dst[3] >= 0),
+          _use_dst_alloc(dst_alloc[0] > 0){}

    ggml_tensor * build_graph(ggml_context * ctx) override {
        ggml_tensor * src = ggml_new_tensor(ctx, type_src, 4, ne_src.data());
@@ -2966,12 +2974,23 @@ struct test_cpy : public test_case {
        }

        std::array<int64_t, 4> dst_ne = _use_dst_shape ? ne_dst : std::array<int64_t, 4>{src->ne[0], src->ne[1], src->ne[2], src->ne[3]};
-        ggml_tensor * dst = ggml_new_tensor(ctx, type_dst, 4, dst_ne.data());
-        ggml_set_name(dst, "dst");
+        ggml_tensor * dst;

-        if (_dst_use_permute) {
-            dst = ggml_permute(ctx, dst, permute_dst[0], permute_dst[1], permute_dst[2], permute_dst[3]);
-            ggml_set_name(dst, "dst_permuted");
+        if (_use_dst_alloc) {
+            // view a sub-block of a larger buffer -> strided dst
+            ggml_tensor * dst_buf = ggml_new_tensor(ctx, type_dst, 4, dst_alloc.data());
+            ggml_set_name(dst_buf, "dst_buf");
+            dst = ggml_view_4d(ctx, dst_buf, dst_ne[0], dst_ne[1], dst_ne[2], dst_ne[3],
+                dst_buf->nb[1], dst_buf->nb[2], dst_buf->nb[3], 0);
+            ggml_set_name(dst, "dst_view");
+        } else {
+            dst = ggml_new_tensor(ctx, type_dst, 4, dst_ne.data());
+            ggml_set_name(dst, "dst");
+
+            if (_dst_use_permute) {
+                dst = ggml_permute(ctx, dst, permute_dst[0], permute_dst[1], permute_dst[2], permute_dst[3]);
+                ggml_set_name(dst, "dst_permuted");
+            }
        }

        ggml_tensor * out = ggml_cpy(ctx, src, dst);
@@ -7973,6 +7992,9 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
            }
        }
    }
+    for (auto kernel_type : {GGML_TYPE_F32, GGML_TYPE_F16}) {
+        test_cases.emplace_back(new test_conv_2d({ 256, 256, 192, 1 }, { 3, 3, 192, 96 }, kernel_type, 1, 1, 1, 1, 1, 1, false));
+    }

    // sycl backend will limit task global_range < MAX_INT
    // test cases for 2D im2col with large input W and H (occurs in stable-diffusion)
@@ -8178,6 +8200,8 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
    test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {256, 1, 4, 1}, {-1,-1,-1,-1}, {1, 2, 0, 3}, {0, 0, 0, 0}));
    test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {2, 2097121, 1, 1}, {-1,-1,-1,-1}, {1, 0, 2, 3}));
    test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {2, 2, 524281, 1}, {-1,-1,-1,-1}, {1, 0, 2, 3}));
+    test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {128, 2, 3, 1}, {128, 2, 3, 1}, {0, 0, 0, 0}, {0, 0, 0, 0}, false, {128, 4, 3, 1})); // strided dst
+    test_cases.emplace_back(new test_cpy(GGML_TYPE_F16, GGML_TYPE_F16, {128, 2, 3, 1}, {128, 2, 3, 1}, {0, 0, 0, 0}, {0, 0, 0, 0}, false, {128, 4, 3, 1})); // strided dst

    // CPY - different src/dst shapes (reshaping via CPY)
    // Use permutations of {3, 5, 7, 32}. Total elements: 3*5*7*32 = 3360.
@@ -8672,6 +8696,12 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
                                                  256, 16, 16, {ne2, 1}, {1, 1}));
    }

+    // nr2 sweep to cover the cublasSgemmBatched pointer-array path (dps2 > 1)
+    for (int64_t nr2 : {8, 16, 32}) {
+        test_cases.emplace_back(new test_out_prod(GGML_TYPE_F32, GGML_TYPE_F32,
+                                                  256, 16, 16, {1, 1}, {nr2, 1}));
+    }
+
    // add_id
    for (ggml_type type_a : {GGML_TYPE_F32}) {
        for (ggml_type type_b : {GGML_TYPE_F32}) {
@@ -9934,7 +9964,7 @@ static void usage(char ** argv) {
    printf("    --output specifies output format (default: console, options: console, sql, csv)\n");
    printf("    --list-ops lists all available GGML operations\n");
    printf("    --show-coverage shows test coverage\n");
-    printf("    --test-file reads test operators from a test file generated by llama-export-graph-ops\n");
+    printf("    --test-file reads test operators from a test file generated by test-export-graph-ops\n");
    printf("    -j <n> runs tests using <n> parallel worker threads (default: 1, test mode only)\n");
 }

@@ -135,7 +135,7 @@ int main(int argc, char ** argv) {
            output_path = args[i + 1];
            i++;
        } else if (args[i] == "--no-common") {
-            use_common = true;
+            use_common = false;
        } else if (tmpl_path.empty()) {
            tmpl_path = args[i];
        } else {
@@ -185,7 +185,7 @@ int main(int argc, char ** argv) {
            return 1;
        }
 #else
-        LOG_ERR("export-graph-ops compiled without HF fetch support\n");
+        LOG_ERR("test-export-graph-ops compiled without HF fetch support\n");
        return 1;
 #endif
    }
@@ -102,21 +102,34 @@ static float dot_product_error(const ggml_type_traits * qfns, const ggml_type_tr
    return fabsf(result - dot_ref) / test_size;
 }

-int main(int argc, char * argv[]) {
-    bool verbose = false;
-    const size_t test_size = 32 * 128;
+static int test_vec_dot_f32(bool verbose) {
+    const auto * f32 = ggml_get_type_traits_cpu(GGML_TYPE_F32);
+    int num_failed = 0;
+    for (int n : {1, 2, 3, 5, 7, 8, 15, 16, 17, 31, 33, 63, 67, 127, 129, 193, 255, 1023}) {
+        std::vector<float> a(n);
+        std::vector<float> b(n);
+        generate_data(0.0, n, a.data());
+        generate_data(1.0, n, b.data());

-    std::string arg;
-    for (int i = 1; i < argc; i++) {
-        arg = argv[i];
+        float result = 0.0f;
+        f32->vec_dot(n, &result, 0, a.data(), 0, b.data(), 0, 1);
+        const float ref = dot_product(a.data(), b.data(), n);
+        const float error = fabsf(result - ref) / n;

-        if (arg == "-v") {
-            verbose = true;
-        } else {
-            fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
-            return 1;
+        const bool failed = !(error < MAX_QUANTIZATION_REFERENCE_ERROR);
+        num_failed += failed;
+        if (failed || verbose) {
+            printf(" f32 vec_dot n=%4d:                 %s (ref=%f got=%f err=%f)\n",
+                   n, RESULT_STR[failed], ref, result, error);
        }
    }
+    return num_failed;
+}
+
+static int test_vec_dot_q(bool verbose) {
+    int num_failed = 0;
+
+    const size_t test_size = 32 * 128;

    std::vector<float> test_data(test_size);
    std::vector<float> test_data2(test_size);
@@ -124,11 +137,6 @@ int main(int argc, char * argv[]) {
    generate_data(0.0, test_data.size(), test_data.data());
    generate_data(1.0, test_data2.size(), test_data2.data());

-    ggml_cpu_init();
-
-    int num_failed = 0;
-    bool failed = false;
-
    for (int i = 0; i < GGML_TYPE_COUNT; i++) {
        ggml_type type = (ggml_type) i;
        const auto * qfns = ggml_get_type_traits(type);
@@ -156,7 +164,7 @@ int main(int argc, char * argv[]) {
                type == GGML_TYPE_IQ3_S   ? MAX_QUANTIZATION_TOTAL_ERROR_3BITS :
                type == GGML_TYPE_IQ3_XXS ? MAX_QUANTIZATION_TOTAL_ERROR_3BITS_XXS :
                type == GGML_TYPE_NVFP4   ? MAX_QUANTIZATION_TOTAL_ERROR_FP4 : MAX_QUANTIZATION_TOTAL_ERROR;
-            failed = !(total_error < max_quantization_error);
+            bool failed = !(total_error < max_quantization_error);
            num_failed += failed;
            if (failed || verbose) {
                printf("%5s absolute quantization error:    %s (%f)\n", ggml_type_name(type), RESULT_STR[failed], total_error);
@@ -171,15 +179,15 @@ int main(int argc, char * argv[]) {

            const float vec_dot_error = dot_product_error(qfns, qfns_cpu, test_size, test_data.data(), test_data2.data());
            const float max_allowed_error = type == GGML_TYPE_Q2_K || type == GGML_TYPE_IQ2_XS || type == GGML_TYPE_IQ2_XXS ||
-                                            type == GGML_TYPE_IQ3_XXS || type == GGML_TYPE_IQ3_S || type == GGML_TYPE_IQ2_S
-                                          ? MAX_DOT_PRODUCT_ERROR_LOWBIT
-                                          : type == GGML_TYPE_Q1_0
-                                          ? MAX_DOT_PRODUCT_ERROR_BINARY
-                                          : type == GGML_TYPE_TQ1_0 || type == GGML_TYPE_TQ2_0
-                                          ? MAX_DOT_PRODUCT_ERROR_TERNARY
-                                          : type == GGML_TYPE_NVFP4
-                                          ? MAX_DOT_PRODUCT_ERROR_FP4
-                                          : MAX_DOT_PRODUCT_ERROR;
+                type == GGML_TYPE_IQ3_XXS || type == GGML_TYPE_IQ3_S || type == GGML_TYPE_IQ2_S
+                ? MAX_DOT_PRODUCT_ERROR_LOWBIT
+                : type == GGML_TYPE_Q1_0
+                ? MAX_DOT_PRODUCT_ERROR_BINARY
+                : type == GGML_TYPE_TQ1_0 || type == GGML_TYPE_TQ2_0
+                ? MAX_DOT_PRODUCT_ERROR_TERNARY
+                : type == GGML_TYPE_NVFP4
+                ? MAX_DOT_PRODUCT_ERROR_FP4
+                : MAX_DOT_PRODUCT_ERROR;
            failed = !(vec_dot_error < max_allowed_error);
            num_failed += failed;
            if (failed || verbose) {
@@ -188,6 +196,31 @@ int main(int argc, char * argv[]) {
        }
    }

+    return num_failed;
+}
+
+int main(int argc, char * argv[]) {
+    bool verbose = false;
+
+    std::string arg;
+    for (int i = 1; i < argc; i++) {
+        arg = argv[i];
+
+        if (arg == "-v") {
+            verbose = true;
+        } else {
+            fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
+            return 1;
+        }
+    }
+
+    ggml_cpu_init();
+
+    int num_failed = 0;
+
+    num_failed += test_vec_dot_f32(verbose);
+    num_failed += test_vec_dot_q(verbose);
+
    if (num_failed || verbose) {
        printf("%d tests failed\n", num_failed);
    }
@@ -55,8 +55,7 @@ struct clip_hparams {
    int32_t n_head = 0;
    int32_t n_head_kv = 0;
    int32_t n_layer = 0;
-    // idefics3
-    int32_t n_merge = 0; // number of patch merges **per-side**
+    int32_t n_merge = 1; // number of patch merges **per-side**

    // for preprocessor
    int32_t image_longest_edge = 0;
@@ -135,8 +134,7 @@ struct clip_hparams {
    int32_t custom_image_max_tokens = -1;

    void set_limit_image_tokens(int n_tokens_min, int n_tokens_max) {
-        const int cur_merge = n_merge == 0 ? 1 : n_merge;
-        const int patch_area = patch_size * patch_size * cur_merge * cur_merge;
+        const int patch_area = patch_size * patch_size * n_merge * n_merge;
        image_min_pixels = (custom_image_min_tokens > 0 ? custom_image_min_tokens : n_tokens_min) * patch_area;
        image_max_pixels = (custom_image_max_tokens > 0 ? custom_image_max_tokens : n_tokens_max) * patch_area;
        warmup_image_size = static_cast<int>(std::sqrt(image_max_pixels));
@@ -145,8 +143,7 @@ struct clip_hparams {
    void set_warmup_n_tokens(int n_tokens) {
        int n_tok_per_side = static_cast<int>(std::sqrt(n_tokens));
        GGML_ASSERT(n_tok_per_side * n_tok_per_side == n_tokens && "n_tokens must be n*n");
-        const int cur_merge = n_merge == 0 ? 1 : n_merge;
-        warmup_image_size = n_tok_per_side * patch_size * cur_merge;
+        warmup_image_size = n_tok_per_side * patch_size * n_merge;
        // TODO: support warmup size for custom token numbers
    }
    // sam vit deepseek-ocr
@@ -1210,6 +1210,9 @@ struct clip_model_loader {
            {
                std::vector<int> pinpoints;
                get_arr_int(KEY_IMAGE_GRID_PINPOINTS, pinpoints, false);
+                if (pinpoints.size() % 2 != 0) {
+                    throw std::runtime_error(string_format("%s: image_grid_pinpoints must have an even number of elements, got %zu\n", __func__, pinpoints.size()));
+                }
                if (!pinpoints.empty()) {
                    for (size_t i = 0; i < pinpoints.size(); i += 2) {
                        hparams.image_res_candidates.push_back({
@@ -1252,15 +1255,16 @@ struct clip_model_loader {
            }

            if (is_vision) {
-                int idx_mean = gguf_find_key(ctx_gguf.get(), KEY_IMAGE_MEAN);
-                int idx_std  = gguf_find_key(ctx_gguf.get(), KEY_IMAGE_STD);
-                GGML_ASSERT(idx_mean >= 0 && "image_mean not found");
-                GGML_ASSERT(idx_std >= 0  && "image_std not found");
-                const float * mean_data = (const float *) gguf_get_arr_data(ctx_gguf.get(), idx_mean);
-                const float * std_data  = (const float *) gguf_get_arr_data(ctx_gguf.get(), idx_std);
+                std::vector<float> image_mean;
+                std::vector<float> image_std;
+                get_arr_f32(KEY_IMAGE_MEAN, image_mean, false);
+                get_arr_f32(KEY_IMAGE_STD , image_std, false);
+                if (image_mean.size() < 3 || image_std.size() < 3) {
+                    throw std::runtime_error(string_format("%s: image_mean/image_std arrays must have at least 3 elements, got %zu and %zu\n", __func__, image_mean.size(), image_std.size()));
+                }
                for (int i = 0; i < 3; ++i) {
-                    hparams.image_mean[i] = mean_data[i];
-                    hparams.image_std[i]  = std_data[i];
+                    hparams.image_mean[i] = image_mean[i];
+                    hparams.image_std[i]  = image_std[i];
                }
            }

@@ -1686,8 +1690,8 @@ struct clip_model_loader {
                if (hparams.image_size > 65536) {
                    throw std::runtime_error(string_format("%s: image_size (%d) is too large (max 65536)\n", __func__, hparams.image_size));
                }
-                if (hparams.patch_size <= 0) {
-                    throw std::runtime_error(string_format("%s: patch_size (%d) must be greater than 0\n", __func__, hparams.patch_size));
+                if (hparams.patch_size <= 0 || hparams.patch_size >= 65536) {
+                    throw std::runtime_error(string_format("%s: patch_size (%d) must be positive and less than 65536\n", __func__, hparams.patch_size));
                }
                if (hparams.n_embd <= 0) {
                    throw std::runtime_error(string_format("%s: n_embd (%d) must be greater than 0\n", __func__, hparams.n_embd));
@@ -1695,6 +1699,9 @@ struct clip_model_loader {
                if (hparams.image_max_pixels < hparams.image_min_pixels) {
                    throw std::runtime_error(string_format("%s: image_max_pixels (%d) is less than image_min_pixels (%d)\n", __func__, hparams.image_max_pixels, hparams.image_min_pixels));
                }
+                if (hparams.n_merge <= 0 || hparams.n_merge >= 65536) { // n_merge is used as a divisor (no more "0 means 1" fallback), so 0 must be rejected too
+                    throw std::runtime_error(string_format("%s: n_merge (%d) must be greater than 0 and less than 65536\n", __func__, hparams.n_merge));
+                }
            }

            LOG_INF("%s: projector:          %s\n", __func__, proj_type.c_str());
@@ -3067,6 +3074,29 @@ struct clip_model_loader {
        output = gguf_get_val_f32(ctx_gguf.get(), i);
    }

+    void get_arr_f32(const std::string & key, std::vector<float> & output, bool required = true) const {
+        const int i = gguf_find_key(ctx_gguf.get(), key.c_str());
+        if (i < 0) {
+            if (required) {
+                throw std::runtime_error("Key not found: " + key);
+            }
+            return;
+        }
+        const auto type = gguf_get_arr_type(ctx_gguf.get(), i);
+        if (type != GGUF_TYPE_FLOAT32) {
+            throw std::runtime_error(string_format("%s: array '%s' has type %d, expected %d (GGUF_TYPE_FLOAT32)\n", __func__, key.c_str(), type, GGUF_TYPE_FLOAT32));
+        }
+        const size_t n = gguf_get_arr_n(ctx_gguf.get(), i);
+        if (n > (size_t) std::numeric_limits<int>::max()) {
+            throw std::runtime_error(string_format("%s: array '%s' is too large (%zu elements)\n", __func__, key.c_str(), n));
+        }
+        output.resize(n);
+        const float * values = (const float *)gguf_get_arr_data(ctx_gguf.get(), i);
+        for (size_t j = 0; j < n; ++j) {
+            output[j] = values[j];
+        }
+    }
+
    void get_string(const std::string & key, std::string & output, bool required = true) const {
        const int i = gguf_find_key(ctx_gguf.get(), key.c_str());
        if (i < 0) {
@@ -3086,11 +3116,18 @@ struct clip_model_loader {
            }
            return;
        }
-        int n = gguf_get_arr_n(ctx_gguf.get(), i);
+        const auto type = gguf_get_arr_type(ctx_gguf.get(), i);
+        if (type != GGUF_TYPE_INT32) {
+            throw std::runtime_error(string_format("%s: array '%s' has type %d, expected %d (GGUF_TYPE_INT32)\n", __func__, key.c_str(), type, GGUF_TYPE_INT32));
+        }
+        const size_t n = gguf_get_arr_n(ctx_gguf.get(), i);
+        if (n > (size_t) std::numeric_limits<int>::max()) {
+            throw std::runtime_error(string_format("%s: array '%s' is too large (%zu elements)\n", __func__, key.c_str(), n));
+        }
        output.resize(n);
        const int32_t * values = (const int32_t *)gguf_get_arr_data(ctx_gguf.get(), i);
-        for (int i = 0; i < n; ++i) {
-            output[i] = values[i];
+        for (size_t j = 0; j < n; ++j) {
+            output[j] = values[j];
        }
    }

@@ -3364,8 +3401,8 @@ int clip_n_output_tokens(const clip_ctx * ctx, const clip_image_f32 * img) {
            {
                // dynamic size
                int n_merge = ctx->model.hparams.n_merge;
-                int n_patches_x = img->nx() / patch_size / (n_merge > 0 ? n_merge : 1);
-                int n_patches_y = img->ny() / patch_size / (n_merge > 0 ? n_merge : 1);
+                int n_patches_x = img->nx() / patch_size / n_merge;
+                int n_patches_y = img->ny() / patch_size / n_merge;
                if (ctx->model.token_embd_img_break) {
                    n_patches = n_patches_y * n_patches_x + n_patches_y - 1; // + one [IMG_BREAK] per row, except the last row
                } else {
@@ -63,8 +63,8 @@ ggml_cgraph * clip_graph_pixtral::build() {
        // and then concatenate the [IMG_BREAK] token to the end of each row, aka n_patches_per_row dimension
        // after the concatenation, we have a tensor with shape [n_embd, n_patches_per_row + 1, n_rows]

-        const int p_y             = n_merge > 0 ? n_patches_y / n_merge : n_patches_y;
-        const int p_x             = n_merge > 0 ? n_patches_x / n_merge : n_patches_x;
+        const int p_y             = n_patches_y / n_merge;
+        const int p_x             = n_patches_x / n_merge;
        const int p_total         = p_x * p_y;
        const int n_embd_text     = cur->ne[0];
        const int n_tokens_output = p_total + p_y - 1; // one [IMG_BREAK] per row, except the last row
@@ -628,7 +628,7 @@ mtmd_image_preproc_out mtmd_image_preprocessor_llava_uhd::preprocess(const clip_
 mtmd_image_preprocessor_llava_uhd::slice_instructions mtmd_image_preprocessor_llava_uhd::get_slice_instructions(const clip_image_size & original_size) {
    mtmd_image_preprocessor_llava_uhd::slice_instructions res;
    // align slices by patch_size * n_merge so an integer number of merger output tokens fits per slice
-    const int n_merge         = hparams.n_merge > 0 ? hparams.n_merge : 1;
+    const int n_merge         = hparams.n_merge;
    const int patch_size      = hparams.patch_size * n_merge;
    const int slice_size      = hparams.image_size;
    const int original_width  = original_size.width;
@@ -894,7 +894,7 @@ mtmd_image_preproc_out mtmd_image_preprocessor_dyn_size::preprocess(const clip_i
    clip_image_u8 resized_image;
    const clip_image_size original_size = img.get_size();
    // the original pixtral model doesn't have n_merge
-    const int cur_merge = hparams.n_merge == 0 ? 1 : hparams.n_merge;
+    const int cur_merge = hparams.n_merge;
    const clip_image_size target_size = img_tool::calc_size_preserved_ratio(
        original_size,
        hparams.patch_size * cur_merge,
@@ -1,4 +1,4 @@
-set(TARGET rpc-server)
+set(TARGET ggml-rpc-server)
 add_executable(${TARGET} rpc-server.cpp)
 target_link_libraries(${TARGET} PRIVATE ggml)
 target_compile_features(${TARGET} PRIVATE cxx_std_17)
@@ -4,8 +4,8 @@
 > This example and the RPC backend are currently in a proof-of-concept development stage. As such, the functionality is fragile and
 > insecure. **Never run the RPC server on an open network or in a sensitive environment!**

-The `rpc-server` allows exposing `ggml` devices on a remote host.
-The RPC backend communicates with one or several instances of `rpc-server` and offloads computations to them.
+The `ggml-rpc-server` allows exposing `ggml` devices on a remote host.
+The RPC backend communicates with one or several instances of `ggml-rpc-server` and offloads computations to them.
 This can be used for distributed LLM inference with `llama.cpp` in the following way:

 ```mermaid
@@ -14,15 +14,15 @@ flowchart TD
    rpcb<-->|TCP|srvb
    rpcb<-.->|TCP|srvn
    subgraph hostn[Host N]
-    srvn[rpc-server]<-.->dev4["CUDA0"]
-    srvn[rpc-server]<-.->dev5["CPU"]
+    srvn[ggml-rpc-server]<-.->dev4["CUDA0"]
+    srvn[ggml-rpc-server]<-.->dev5["CPU"]
    end
    subgraph hostb[Host B]
-    srvb[rpc-server]<-->dev3["Metal"]
+    srvb[ggml-rpc-server]<-->dev3["Metal"]
    end
    subgraph hosta[Host A]
-    srva[rpc-server]<-->dev["CUDA0"]
-    srva[rpc-server]<-->dev2["CUDA1"]
+    srva[ggml-rpc-server]<-->dev["CUDA0"]
+    srva[ggml-rpc-server]<-->dev2["CUDA1"]
    end
    subgraph host[Main Host]
    local["Local devices"]<-->ggml[llama-cli]
@@ -33,7 +33,7 @@ flowchart TD
    class local,dev,dev2,dev3,dev4,dev5 devcls
 ```

-By default, `rpc-server` exposes all available accelerator devices on the host.
+By default, `ggml-rpc-server` exposes all available accelerator devices on the host.
 If there are no accelerators, it exposes a single `CPU` device.

 ## Usage
@@ -41,7 +41,7 @@ If there are no accelerators, it exposes a single `CPU` device.
 ### Remote hosts

 On each remote host, build the backends for each accelerator by adding `-DGGML_RPC=ON` to the build options.
-For example, to build the `rpc-server` with support for CUDA accelerators:
+For example, to build the `ggml-rpc-server` with support for CUDA accelerators:

 ```bash
 mkdir build-rpc-cuda
@@ -50,10 +50,10 @@ cmake .. -DGGML_CUDA=ON -DGGML_RPC=ON
 cmake --build . --config Release
 ```

-When started, the `rpc-server` will detect and expose all available `CUDA` devices:
+When started, the `ggml-rpc-server` will detect and expose all available `CUDA` devices:

 ```bash
-$ bin/rpc-server
+$ bin/ggml-rpc-server
 ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
 ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
 ggml_cuda_init: found 1 CUDA devices:
@@ -67,14 +67,14 @@ Devices:

 You can control the set of exposed CUDA devices with the `CUDA_VISIBLE_DEVICES` environment variable or the `--device` command line option. The following two commands have the same effect:
 ```bash
-$ CUDA_VISIBLE_DEVICES=0 bin/rpc-server -p 50052
-$ bin/rpc-server --device CUDA0 -p 50052
+$ CUDA_VISIBLE_DEVICES=0 bin/ggml-rpc-server -p 50052
+$ bin/ggml-rpc-server --device CUDA0 -p 50052
 ```

 ### Main host

 On the main host build `llama.cpp` with the backends for the local devices and add `-DGGML_RPC=ON` to the build options.
-Finally, when running `llama-cli` or `llama-server`, use the `--rpc` option to specify the host and port of each `rpc-server`:
+Finally, when running `llama-cli` or `llama-server`, use the `--rpc` option to specify the host and port of each `ggml-rpc-server`:

 ```bash
 $ llama-cli -hf ggml-org/gemma-3-1b-it-GGUF -ngl 99 --rpc 192.168.88.10:50052,192.168.88.11:50052
@@ -90,7 +90,7 @@ This can speed up model loading significantly, especially when using large model
 To enable the cache, use the `-c` option:

 ```bash
-$ bin/rpc-server -c
+$ bin/ggml-rpc-server -c
 ```

 By default, the cache is stored in the `$HOME/.cache/llama.cpp/rpc` directory and can be controlled via the `LLAMA_CACHE` environment variable.
@@ -103,8 +103,8 @@ RDMA is enabled by default when `libibverbs` is found at build time.

 ### Troubleshooting

-Use the `GGML_RPC_DEBUG` environment variable to enable debug messages from `rpc-server`:
+Use the `GGML_RPC_DEBUG` environment variable to enable debug messages from `ggml-rpc-server`:
 ```bash
-$ GGML_RPC_DEBUG=1 bin/rpc-server
+$ GGML_RPC_DEBUG=1 bin/ggml-rpc-server
 ```

@@ -15,6 +15,8 @@ add_library(${TARGET} STATIC
    server-common.h
    server-context.cpp
    server-context.h
+    server-stream.cpp
+    server-stream.h
    server-tools.cpp
    server-tools.h
    server-schema.cpp
@@ -57,6 +57,7 @@ The core architecture consists of the following components:
 - `server_tokens`: Unified representation of token sequences (supports both text and multimodal tokens); used by `server_task` and `server_slot`.
 - `server_prompt_checkpoint`: For recurrent (e.g., RWKV) and SWA models, stores snapshots of KV cache state. Enables reuse when subsequent requests share the same prompt prefix, saving redundant computation.
 - `server_models`: Standalone component for managing multiple backend instances (used in router mode). It is completely independent of `server_context`.
+- `stream_session_manager`: Process-wide owner of resumable SSE stream sessions (`g_stream_sessions`), keyed by conversation id. Backs the replay buffer that lets a client reattach to a generation after an HTTP disconnect. See the "Resumable streaming" section below.

 ```mermaid
 graph TD
@@ -117,6 +118,58 @@ Here is an example trace of an API request for text completion:
 - As the response is stateless, `server_res_generator` calls `response->update()` to update the response with the current state.
 - `server_res_generator` then calls `response->to_json()` and passes the response to the HTTP layer.

+### Resumable streaming (SSE replay buffer)
+
+By default, a streaming generation is bound to its HTTP socket: when the socket drops (page refresh, tab close, mobile backgrounding, transient network failure), the generation aborts and the live stream is lost. This feature keeps the generation running server-side and lets a client reattach.
+
+It is opt-in via the `X-Conversation-Id` header on `POST /v1/chat/completions`. Without the header, the OAI strict path is unchanged. The conversation id is the only identity used end to end (server map key, client localStorage key, route path), with an optional `::model` suffix for direct routing in router mode.
+
+The feature lives entirely in `server-stream.{h,cpp}` and rests on three types:
+
+- `stream_session`: a bounded ring buffer (4 MiB cap, oldest bytes drop first) plus a condition variable. `append` pushes raw SSE bytes, `read_from` drains from any offset and blocks until live bytes arrive or the session is finalized, `finalize` wakes readers, `cancel` stops the producer. One conversation maps to at most one live session.
+- `stream_session_manager` (`g_stream_sessions`): owns all sessions keyed by conversation id, enforces the one-session-per-conversation invariant via `create_or_replace`, and runs a GC thread that drops completed sessions past their TTL.
+- `stream_pipe_producer` / `stream_pipe_consumer`: the write and read ends. The producer owns the session lifetime and finalizes it on destruction; the consumer is read-only and never finalizes, so a reader detaching cannot kill a running generation.
+
+Producer side: `server_res_generator` attaches a producer pipe when the header is present. The HTTP content provider mirrors every chunk into the ring before writing it to the socket. While a pipe is attached, `stream_aware_should_stop` ignores peer disconnect, so a dropped socket does not stop generation: only an explicit `DELETE` does. When the peer leaves early, `on_complete` calls `close()`, which drains the rest of the generation into the ring on the HTTP worker thread.
+
+Lifetime safety: the producer pipe holds a shared `alive` flag that is also captured by the session cancel hook. `~server_res_generator` calls `cleanup()` to clear that hook while the reader is still alive, so a `cancel` arriving during teardown can never call `stop()` on a freed response. This ordering is the most fragile part of the feature: finalizing or destroying the producer before `cleanup()` runs reintroduces a use-after-free.
+
+Consumer side: `GET /v1/stream/<conv_id>?from=N` opens a `text/event-stream` that replays buffered bytes from offset `N` and blocks for live bytes, so the browser reattaches like a fresh EventSource. An offset below the dropped prefix returns 400.
+
+Routes:
+
+- `GET /v1/stream/:conv_id?from=N`: replay or live reattach.
+- `POST /v1/streams/lookup` with `{"conversation_ids": [...]}`: returns session status only for ids the caller already owns. There is no listing route, so live sessions cannot be enumerated (an earlier `GET /v1/streams` was removed for exactly this reason).
+- `DELETE /v1/stream/:conv_id`: explicit Stop, idempotent (`evict_and_cancel`).
+
+Router mode binds the same paths to proxy handlers. A `conv_id -> child` map (`conv_models`), populated when a POST is routed, resolves the owning child in one lookup with no polling. The lookup groups ids per child; GET and DELETE proxy straight to the owner. This loopback REST hop is expected to move to a websocket IPC later, swapping only the transport.
+
+Lifecycle: `g_stream_sessions.start_gc()` runs in main after common init; `stop_gc()` runs first in `clean_up()` and finalizes every live session so no reader hangs. Reader blocking and the post-drop drain both run on httplib worker threads, which block on a condition variable rather than spin.
+
+| Constant | Value | Role |
+| --- | --- | --- |
+| `STREAM_SESSION_TTL_SECONDS` | 300 | retention of a completed session before GC |
+| `STREAM_SESSION_MAX_BYTES` | 4 MiB | ring cap per session |
+| `STREAM_SESSION_GC_INTERVAL_SECONDS` | 60 | GC tick |
+| `STREAM_READ_WAKE_INTERVAL_MS` | 200 | read_from wake to recheck should_stop |
+| `STREAM_LOOKUP_TIMEOUT_MS` | 250 | router to child loopback budget |
+
+```mermaid
+graph TD
+    Client -- "POST + X-Conversation-Id" --> RG[server_res_generator]
+    RG -- attach --> Prod[stream_pipe_producer]
+    Prod -- "write, drain on peer drop" --> Sess
+    subgraph g_stream_sessions
+        Sess[stream_session: ring buffer, 4 MiB]
+        GC[GC thread] -- drop after TTL --> Sess
+    end
+    Sess -- read_from offset --> Cons[stream_pipe_consumer]
+    Cons -- "GET /v1/stream/:id?from=N" --> Client
+    DEL[DELETE /v1/stream/:id] -- evict_and_cancel --> Sess
+```
+
+The diagram shows the buffer touch points. The live wire (chunks streamed to the original client during a normal generation) is the producer's default output, described under "Producer side" above.
+
 ### Testing

 `llama-server` includes an automated test suite based on `pytest`.
@@ -223,6 +276,7 @@ The flow for downloading a new model:
 - Speculative decoding: https://github.com/ggml-org/llama.cpp/pull/17808 and rework in https://github.com/ggml-org/llama.cpp/pull/17808
 - INI presets: https://github.com/ggml-org/llama.cpp/pull/17859 (+ refactoring: https://github.com/ggml-org/llama.cpp/pull/18169)
 - Sleeping mode: https://github.com/ggml-org/llama.cpp/pull/18228
+- Resumable streaming (SSE replay buffer): https://github.com/ggml-org/llama.cpp/pull/23226



@@ -5,6 +5,7 @@
 #include "server-task.h"
 #include "server-queue.h"
 #include "server-schema.h"
+#include "server-stream.h"

 #include "build-info.h"
 #include "common.h"
@@ -105,7 +106,6 @@ struct server_batch {
        if ((int32_t)tokens.size() >= n_tokens_alloc) {
            return false;
        }
-        // LOG_INF("adding token to batch: slot=%d, token=%d, pos=%d, output=%d\n", id_slot, token, pos, output);
        tokens.push_back({ id_slot, token, pos, output });
        return true;
    }
@@ -227,7 +227,7 @@ struct server_slot {

        const size_t cur_size = cur_size_tgt + cur_size_dft;

-        SRV_WRN(" - saving prompt with length %d, total state size = %.3f MiB (draft: %.3f MiB)\n",
+        SRV_TRC(" - saving prompt with length %d, total state size = %.3f MiB (draft: %.3f MiB)\n",
                (int) prompt.tokens.size(), cur_size / (1024.0 * 1024.0), cur_size_dft / (1024.0 * 1024.0));

        auto * cur = prompt_cache.alloc(prompt, cur_size_tgt, cur_size_dft);
@@ -257,7 +257,7 @@ struct server_slot {
            GGML_ASSERT(!is_processing());
        }

-        SLT_INF(*this, "clearing prompt with %zu tokens\n", prompt.tokens.size());
+        SLT_TRC(*this, "clearing prompt with %zu tokens\n", prompt.tokens.size());

        common_context_seq_rm(ctx_tgt, id, -1, -1);
        if (ctx_dft) {
@@ -626,8 +626,10 @@ struct server_slot {
            }

            SLT_INF(*this,
-                    "draft acceptance = %0.5f (%5d accepted / %5d generated), mean acceptance length = %5.2f, acceptance rate per position = (%s)\n",
-                    draft_ratio, n_draft_accepted, n_draft_total, mean_acc_len, acceptance_rates_per_pos.c_str());
+                    "draft acceptance = %0.5f (%5d accepted / %5d generated), mean len = %5.2f\n",
+                    draft_ratio, n_draft_accepted, n_draft_total, mean_acc_len);
+            SLT_TRC(*this,
+                    "     acc per pos = (%s)\n", acceptance_rates_per_pos.c_str());
        }

        common_speculative_print_stats(spec);
@@ -770,7 +772,7 @@ struct server_slot {
        }

        // TODO @ngxson : move this log line to debug when it become more stable
-        SLT_INF(*this, "encoding mtmd batch from idx = %zu, n_chunks = %d\n", idx, n_added);
+        SLT_TRC(*this, "encoding mtmd batch from idx = %zu, n_chunks = %d\n", idx, n_added);

        res = mtmd_batch_encode(mbatch.get());
        if (res != 0) {
@@ -1031,7 +1033,8 @@ private:
        }


-        SRV_INF("loading model '%s'\n", params.model.path.c_str());
+        SRV_INF("loading model '%s'\n", params.model.get_name().c_str());
+        SRV_TRC("local path '%s'\n", params.model.path.c_str());

        std::string & mmproj_path = params_base.mmproj.path;
        mtmd_context_params mparams = mtmd_context_params_default();
@@ -1060,7 +1063,7 @@ private:
                for (auto & [dev, size] : mmproj_mem) {
                    total += size;
                }
-                SRV_INF("[mtmd] estimated worst-case memory usage of mmproj is %.2f MiB (took %.2f ms)\n", total / (1024.0 * 1024.0), t_elapsed / 1000.0);
+                SRV_TRC("[mtmd] estimated worst-case memory usage of mmproj is %.2f MiB (took %.2f ms)\n", total / (1024.0 * 1024.0), t_elapsed / 1000.0);
                GGML_ASSERT(!params_base.fit_params_target.empty());
                for (auto & [dev, size] : mmproj_mem) {
                    for (size_t i = 0; i < ggml_backend_dev_count(); i++) {
@@ -1140,7 +1143,7 @@ private:
                            }
                        }
                    }
-                    SRV_INF("[spec] estimated memory usage of %s is %.2f MiB\n",
+                    SRV_TRC("[spec] estimated memory usage of %s is %.2f MiB\n",
                            has_draft ? "draft model" : "MTP context",
                            total / (1024.0 * 1024.0));
                } catch (const std::exception & e) {
@@ -1176,7 +1179,7 @@ private:
            // TODO speculative: move to common/speculative.cpp?
            const auto & params_spec = params_base.speculative.draft;

-            SRV_INF("loading draft model '%s'\n", params_spec.mparams.path.c_str());
+            SRV_TRC("loading draft model '%s'\n", params_spec.mparams.path.c_str());

            auto params_dft = params_base;

@@ -1228,7 +1231,7 @@ private:
            // no new model load, so we simply report 0.0 and 1.0 progress
            load_progress_callback(0.0f, &load_progress_spec);

-            SRV_INF("creating MTP draft context against the target model '%s'\n",
+            SRV_TRC("creating MTP draft context against the target model '%s'\n",
                    params_base.model.path.c_str());

            auto cparams_mtp = common_context_params_to_llama(params_base);
@@ -1302,9 +1305,6 @@ private:
        // Necessary similarity of prompt for slot selection
        slot_prompt_similarity = params_base.slot_prompt_similarity;

-        // setup slots
-        SRV_INF("initializing slots, n_slots = %d\n", params_base.n_parallel);
-
        const int n_ctx_train = llama_model_n_ctx_train(model_tgt);

        int n_ctx_slot = llama_n_ctx_seq(ctx_tgt);
@@ -1321,9 +1321,13 @@ private:
        }

        if (ctx_tgt_seq_rm_type == COMMON_CONTEXT_SEQ_RM_TYPE_FULL) {
-            SRV_WRN("%s", "speculative decoding will use checkpoints\n");
+            SRV_TRC("%s", "speculative decoding will use checkpoints\n");
        }

+        // setup slots
+        SRV_INF("initializing, n_slots = %d, n_ctx_slot = %d, kv_unified = '%s'\n",
+                params_base.n_parallel, n_ctx_slot, params_base.kv_unified ? "true" : "false");
+
        // initialize slots
        for (int i = 0; i < params_base.n_parallel; i++) {
            slots.emplace_back();
@@ -1343,7 +1347,7 @@ private:
        }

        if (spec) {
-            SRV_INF("%s", "speculative decoding context initialized\n");
+            SRV_TRC("%s", "speculative decoding context initialized\n");
        } else {
            ctx_dft.reset();
        }
@@ -1360,7 +1364,7 @@ private:
            slot.mctx                   = mctx;
            slot.prompt.tokens.has_mtmd = mctx != nullptr;

-            SLT_INF(slot, "new slot, n_ctx = %d\n", slot.n_ctx);
+            SLT_TRC(slot, "new slot, n_ctx = %d\n", slot.n_ctx);

            slot.callback_on_release = [this](int id_slot) {
                queue_tasks.pop_deferred_task(id_slot);
@@ -1396,23 +1400,23 @@ private:

        if (params_base.cache_ram_mib != 0) {
            if (params_base.cache_ram_mib < 0) {
-                SRV_INF("prompt cache is enabled, size limit: %s\n", "no limit");
+                SRV_TRC("prompt cache is enabled, size limit: %s\n", "no limit");
            } else {
-                SRV_INF("prompt cache is enabled, size limit: %d MiB\n", params_base.cache_ram_mib);
+                SRV_TRC("prompt cache is enabled, size limit: %d MiB\n", params_base.cache_ram_mib);
            }
-            SRV_INF("%s", "use `--cache-ram 0` to disable the prompt cache\n");
+            SRV_TRC("%s", "use `--cache-ram 0` to disable the prompt cache\n");

            prompt_cache = std::make_unique<server_prompt_cache>(params_base.cache_ram_mib, n_ctx);
        } else {
-            SRV_INF("%s", "prompt cache is disabled - use `--cache-ram N` to enable it\n");
+            SRV_TRC("%s", "prompt cache is disabled - use `--cache-ram N` to enable it\n");
        }
-        SRV_INF("%s", "for more info see https://github.com/ggml-org/llama.cpp/pull/16391\n");
+        SRV_TRC("%s", "for more info see https://github.com/ggml-org/llama.cpp/pull/16391\n");

        if (params_base.n_ctx_checkpoints > 0) {
-            SRV_INF("context checkpoints enabled, max = %d, min spacing = %d\n",
+            SRV_TRC("context checkpoints enabled, max = %d, min spacing = %d\n",
                    params_base.n_ctx_checkpoints, params_base.checkpoint_min_step);
        } else {
-            SRV_INF("%s", "context checkpoints disabled\n");
+            SRV_TRC("%s", "context checkpoints disabled\n");
        }

        if (!params_base.model_alias.empty()) {
@@ -1469,11 +1473,11 @@ private:
                params_base.cache_idle_slots = false;
            } else {
                if (params_base.kv_unified) {
-                    SRV_INF("%s", "idle slots will be saved to prompt cache and cleared upon starting a new task\n");
+                    SRV_TRC("%s", "idle slots will be saved to prompt cache and cleared upon starting a new task\n");
                } else {
                    // without a unified KV cache, clearing a slot frees no reusable room, so we only
                    // publish a RAM-cache copy of idle slots (their KV stays in VRAM) [TAG_IDLE_SLOT_CLEAR]
-                    SRV_INF("%s", "idle slots will be saved to prompt cache upon starting a new task\n");
+                    SRV_TRC("%s", "idle slots will be saved to prompt cache upon starting a new task\n");
                }
                SRV_DBG("%s", "__TEST_TAG_CACHE_IDLE_SLOTS_ENABLED__\n");
            }
@@ -1499,7 +1503,7 @@ private:
            try {
                chat_templates = common_chat_templates_init(model_tgt, params_base.chat_template);

-                LOG_INF("%s: chat template, example_format: '%s'\n", __func__,
+                SRV_TRC("%s: chat template, example_format: '%s'\n", __func__,
                    common_chat_format_example(chat_templates.get(), params_base.use_jinja, params_base.default_template_kwargs).c_str());

            } catch (const std::exception & e) {
@@ -1514,7 +1518,7 @@ private:
            // 2. The chat template supports it
            const bool template_supports_thinking = params_base.use_jinja && common_chat_templates_support_enable_thinking(chat_templates.get());
            const bool enable_thinking = params_base.enable_reasoning != 0 && template_supports_thinking;
-            SRV_INF("%s: chat template, thinking = %d\n", __func__, enable_thinking);
+            SRV_TRC("%s: chat template, thinking = %d\n", __func__, enable_thinking);

            // IMPORTANT: chat_params is reused across sleeping / resuming states,
            //            never store llama_context/llama_model pointers in chat_params,
@@ -1657,7 +1661,7 @@ private:
            update_cache = update_cache && task.type == SERVER_TASK_TYPE_COMPLETION;

            if (update_cache) {
-                SRV_INF("%s", "updating prompt cache\n");
+                SRV_TRC("%s", "updating prompt cache\n");

                const int64_t t_start = ggml_time_us();

@@ -1669,7 +1673,7 @@ private:

                prompt_cache->update();

-                SRV_INF("prompt cache update took %.2f ms\n", (ggml_time_us() - t_start) / 1000.0);
+                SRV_TRC("prompt cache update took %.2f ms\n", (ggml_time_us() - t_start) / 1000.0);
            }
        }

@@ -2289,7 +2293,7 @@ private:

        int id_parent = parent_task.id;

-        SRV_INF("launching slots for parent task id_task = %d with %zu child tasks\n", id_parent, parent_task.child_tasks.size());
+        SRV_TRC("launching slots for parent task id_task = %d with %zu child tasks\n", id_parent, parent_task.child_tasks.size());

        // to be called in case of failure to release all launched slots
        auto release_slots = [this, id_parent]() {
@@ -2350,7 +2354,7 @@ private:
        // stash the draft's speculative state with the checkpoint
        common_speculative_get_state(spec.get(), slot.id, cur.data_spec);

-        SLT_INF(slot,
+        SLT_TRC(slot,
                "created context checkpoint %d of %d (pos_min = %d, pos_max = %d, n_tokens = %" PRId64 ", size = %.3f MiB)\n",
                (int) slot.prompt.checkpoints.size(), params_base.n_ctx_checkpoints, cur.pos_min,
                cur.pos_max, cur.n_tokens, (float) cur.size() / 1024 / 1024);
@@ -2414,7 +2418,7 @@ private:
                    if (params_base.cache_idle_slots) {
                        for (auto & slot : slots) {
                            if (!slot.is_processing()) {
-                                SLT_INF(slot, "%s", "saving idle slot to prompt cache\n");
+                                SLT_TRC(slot, "%s", "saving idle slot to prompt cache\n");

                                if (slot.prompt_save(*prompt_cache)) {
                                    SLT_DBG(slot, "%s", "__TEST_TAG_CACHE_IDLE_SLOT__\n");
@@ -2670,7 +2674,7 @@ private:
                    auto new_loras = construct_lora_list(task.set_lora);
                    // logging
                    for (size_t i = 0; i < new_loras.size(); ++i) {
-                        SRV_INF("set lora adapter idx=%zu scale=%f\n", i, new_loras[i].scale);
+                        SRV_TRC("set lora adapter idx=%zu scale=%f\n", i, new_loras[i].scale);
                    }
                    // TODO @ngxson : make lora_adapters a dedicated member of server_context
                    params_base.lora_adapters = new_loras;
@@ -2770,7 +2774,7 @@ private:
            }

            if (all_idle) {
-                SRV_INF("%s", "all slots are idle\n");
+                SRV_TRC("%s", "all slots are idle\n");
                return; // skip further processing

            } else {
@@ -3286,10 +3290,9 @@ private:
                                    const auto it = std::find_if(
                                        slot.prompt.checkpoints.rbegin(),
                                        slot.prompt.checkpoints.rend(),
-                                        [&, func_name = __func__](const auto & cur) {
+                                        [&](const auto & cur) {
                                            // guarantee that a checkpoint will result in at least one token being processed [TAG_PROMPT_LOGITS]
-                                            LOG_INF("slot %12.*s: id %2d | task %d | Checking checkpoint with [%d, %d] against %d...\n", 12,
-                                                func_name, (slot).id, ((slot).task ? (slot).task->id : -1), cur.pos_min, cur.pos_max, pos_min_thold);
+                                            SLT_TRC(slot, "checking checkpoint with [%d, %d] against %d...\n", cur.pos_min, cur.pos_max, pos_min_thold);
                                            // workaround for [TAG_CHECKPOINTS_FIX_POS_MIN]
                                            if (cur.pos_max > pos_next) {
                                                return false;
@@ -3309,11 +3312,11 @@ private:

                                        pos_next = std::min(pos_next, std::max(it->pos_min + 1, it->pos_max));
                                        n_past   = std::min(slot.prompt.tokens.size_up_to_pos(pos_next), (size_t) it->n_tokens);
-                                        SLT_WRN(slot, "restored context checkpoint (pos_min = %d, pos_max = %d, n_tokens = %" PRId64 ", n_past = %d, size = %.3f MiB)\n", it->pos_min, it->pos_max, it->n_tokens, n_past, (float) it->size() / 1024 / 1024);
+                                        SLT_TRC(slot, "restored context checkpoint (pos_min = %d, pos_max = %d, n_tokens = %" PRId64 ", n_past = %d, size = %.3f MiB)\n", it->pos_min, it->pos_max, it->n_tokens, n_past, (float) it->size() / 1024 / 1024);
                                    }

                                    if (do_reset) {
-                                        SLT_WRN(slot, "forcing full prompt re-processing due to lack of cache data (likely due to SWA or hybrid/recurrent memory, see %s)\n",
+                                        SLT_TRC(slot, "forcing full prompt re-processing due to lack of cache data (likely due to SWA or hybrid/recurrent memory, see %s)\n",
                                                "https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055");
                                        pos_next = 0;
                                        n_past = 0;
@@ -3326,7 +3329,7 @@ private:
                                for (auto it = slot.prompt.checkpoints.begin(); it != slot.prompt.checkpoints.end();) {
                                    const auto & cur = *it;
                                    if (cur.pos_max > pos_next) {
-                                        SLT_WRN(slot, "erased invalidated context checkpoint (pos_min = %d, pos_max = %d, n_tokens = %" PRId64 ", n_swa = %d, pos_next = %d, size = %.3f MiB)\n", cur.pos_min, cur.pos_max, cur.n_tokens, n_swa, pos_next, (float) cur.size() / 1024 / 1024);
+                                        SLT_TRC(slot, "erased invalidated context checkpoint (pos_min = %d, pos_max = %d, n_tokens = %" PRId64 ", n_swa = %d, pos_next = %d, size = %.3f MiB)\n", cur.pos_min, cur.pos_max, cur.n_tokens, n_swa, pos_next, (float) cur.size() / 1024 / 1024);
                                        it = slot.prompt.checkpoints.erase(it);
                                    } else {
                                        ++it;
@@ -3673,7 +3676,7 @@ private:
                // all children slots should already launched by launch_slots_with_parent_task()
                // copy state to the child slots
                for (auto & child : children) {
-                    SLT_INF(slot, " - copying state to child %d\n", child->id);
+                    SLT_TRC(slot, " - copying state to child %d\n", child->id);

                    GGML_ASSERT(child->state == SLOT_STATE_WAIT_OTHER);

@@ -4022,6 +4025,15 @@ struct server_res_generator : server_http_res {
            queue_tasks.wait_until_no_sleep();
        }
    }
+    ~server_res_generator() override {
+        // cleanup() clears the stream session cancel hook; it must run while rd is still alive (rd is destroyed after this body returns)
+        if (spipe) {
+            spipe->cleanup();
+        }
+    }
+    void stop() override {
+        rd.stop(); // forward the stop request to the reader (rd); NOTE(review): presumably invoked by the stream session cancel hook — confirm in server-stream.cpp
+    }
    void ok(const json & response_data) {
        status = 200;
        data = safe_json_to_str(response_data);
@@ -4210,8 +4222,10 @@ std::unique_ptr<server_res_generator> server_routes::handle_completions_impl(
                }
            };

+            auto effective_should_stop = stream_aware_should_stop(res_this, req.should_stop);
+
            try {
-                if (req.should_stop()) {
+                if (effective_should_stop()) {
                    SRV_DBG("%s", "stopping streaming due to should_stop condition\n");
                    return false; // should_stop condition met
                }
@@ -4245,8 +4259,8 @@ std::unique_ptr<server_res_generator> server_routes::handle_completions_impl(
                // receive subsequent results
                bool timeout = false;
                int64_t start_time = ggml_time_ms();
-                auto result = rd.next([&timeout, &req, &start_time, &params]() {
-                    if (req.should_stop()) {
+                auto result = rd.next([&timeout, &start_time, &params, &effective_should_stop]() {
+                    if (effective_should_stop()) {
                        return true; // should_stop condition met
                    } else if (params.sse_ping_interval > 0 && ggml_time_ms() - start_time > (int64_t)params.sse_ping_interval * 1000) {
                        timeout = true;
@@ -4264,7 +4278,7 @@ std::unique_ptr<server_res_generator> server_routes::handle_completions_impl(

                if (result == nullptr) {
                    SRV_DBG("%s", "stopping streaming due to should_stop condition\n");
-                    GGML_ASSERT(req.should_stop());
+                    GGML_ASSERT(effective_should_stop());
                    return false; // should_stop condition met
                }

@@ -4302,6 +4316,10 @@ std::unique_ptr<server_res_generator> server_routes::handle_completions_impl(
        };
    }

+    // attach a producer pipe to the response when X-Conversation-Id is present.
+    // the pipe mirrors SSE chunks into the ring buffer and wires up the cancel hook.
+    stream_session_attach_pipe(*res, req.headers);
+
    return res;
 }

@@ -1,5 +1,6 @@
 #include "common.h"
 #include "server-http.h"
+#include "server-stream.h"
 #include "server-common.h"
 #include "ui.h"

@@ -82,7 +83,7 @@ bool server_http_context::init(const common_params & params) {
    hostname = params.hostname;

    if (gcp.enabled) {
-        SRV_INF("Google Cloud Platform compat: health route = %s, predict route = %s, port = %d\n", gcp.path_health.c_str(), gcp.path_predict.c_str(), gcp.port);
+        SRV_TRC("Google Cloud Platform compat: health route = %s, predict route = %s, port = %d\n", gcp.path_health.c_str(), gcp.path_predict.c_str(), gcp.port);

        if (port != gcp.port) {
            SRV_WRN("Google Cloud Platform compat: overriding server port %d with AIP_HTTP_PORT %d\n", port, gcp.port);
@@ -95,13 +96,13 @@ bool server_http_context::init(const common_params & params) {

 #ifdef CPPHTTPLIB_OPENSSL_SUPPORT
    if (!params.ssl_file_key.empty() && !params.ssl_file_cert.empty()) {
-        SRV_INF("running with SSL: key = %s, cert = %s\n", params.ssl_file_key.c_str(), params.ssl_file_cert.c_str());
+        SRV_TRC("running with SSL: key = %s, cert = %s\n", params.ssl_file_key.c_str(), params.ssl_file_cert.c_str());
        srv = std::make_unique<httplib::SSLServer>(
            params.ssl_file_cert.c_str(), params.ssl_file_key.c_str()
        );
        is_ssl = true;
    } else {
-        SRV_INF("%s", "running without SSL\n");
+        SRV_TRC("%s", "running without SSL\n");
        srv = std::make_unique<httplib::Server>();
    }
 #else
@@ -164,9 +165,9 @@ bool server_http_context::init(const common_params & params) {
    if (params.api_keys.size() == 1) {
        const auto key = params.api_keys[0];
        const std::string substr = key.substr(std::max(static_cast<int>(key.length() - 4), 0));
-        SRV_INF("api_keys: ****%s\n", substr.c_str());
+        SRV_TRC("api_keys: ****%s\n", substr.c_str());
    } else if (params.api_keys.size() > 1) {
-        SRV_INF("api_keys: %zu keys loaded\n", params.api_keys.size());
+        SRV_TRC("api_keys: %zu keys loaded\n", params.api_keys.size());
    }

    //
@@ -292,7 +293,7 @@ bool server_http_context::init(const common_params & params) {
        // +4 threads for monitoring, health and some threads reserved for MCP and other tasks in the future
        n_threads_http = std::max(params.n_parallel + 4, static_cast<int32_t>(std::thread::hardware_concurrency() - 1));
    }
-    SRV_INF("using %d threads for HTTP server\n", n_threads_http);
+    SRV_TRC("using %d threads for HTTP server\n", n_threads_http);
    srv->new_task_queue = [n_threads_http] {
        // spawn n_threads_http fixed thread (always alive), while allow up to 1024 max possible additional threads
        // when n_threads_http is used, server will create new "dynamic" threads that will be destroyed after processing each request
@@ -411,13 +412,13 @@ bool server_http_context::start() {
    auto is_sock = false;
    if (string_ends_with(std::string(hostname), ".sock")) {
        is_sock = true;
-        SRV_INF("%s", "setting address family to AF_UNIX\n");
+        SRV_TRC("%s", "setting address family to AF_UNIX\n");
        srv->set_address_family(AF_UNIX);
        // bind_to_port requires a second arg, any value other than 0 should
        // simply get ignored
        was_bound = srv->bind_to_port(hostname, 8080);
    } else {
-        SRV_INF("%s", "binding port with default address family\n");
+        SRV_TRC("%s", "binding port with default address family\n");
        // bind HTTP listen port
        if (port == 0) {
            const auto bound_port = srv->bind_to_any_port(hostname);
@@ -456,13 +457,40 @@ static void set_headers(httplib::Response & res, const std::map<std::string, std
    }
 }

+// undo %XX escapes in a single path segment. httplib hands path params over undecoded (query
+// params are already decoded), so an id sent as "conv%3A%3Amodel" has to become "conv::model"
+// here in order to equal the X-Conversation-Id header value. malformed escapes are kept as is
+static std::string decode_path_component(const std::string & in) {
+    const auto nibble = [](char c) -> int {
+        if (c >= '0' && c <= '9') { return c - '0'; }
+        if (c >= 'a' && c <= 'f') { return 10 + (c - 'a'); }
+        if (c >= 'A' && c <= 'F') { return 10 + (c - 'A'); }
+        return -1;
+    };
+    std::string out;
+    out.reserve(in.size());
+    for (size_t pos = 0; pos < in.size();) {
+        const char cur = in[pos];
+        const int  hi  = (cur == '%' && pos + 2 < in.size()) ? nibble(in[pos + 1]) : -1;
+        const int  lo  = hi >= 0 ? nibble(in[pos + 2]) : -1;
+        if (lo >= 0) {
+            out.push_back(char((hi << 4) | lo));
+            pos += 3;
+        } else {
+            out.push_back(cur);
+            pos += 1;
+        }
+    }
+    return out;
+}
+
 static std::map<std::string, std::string> get_params(const httplib::Request & req) {
    std::map<std::string, std::string> params;
    for (const auto & [key, value] : req.params) {
        params[key] = value;
    }
    for (const auto & [key, value] : req.path_params) {
-        params[key] = value;
+        params[key] = decode_path_component(value);
    }
    return params;
 }
@@ -497,26 +525,41 @@ static void process_handler_response(server_http_req_ptr && request, server_http
        set_headers(res, response->headers);
        const std::string content_type = response->content_type;
        // convert to shared_ptr as both chunked_content_provider() and on_complete() need to use it
-        std::shared_ptr q_ptr = std::move(request);
-        std::shared_ptr r_ptr = std::move(response);
-        const auto chunked_content_provider = [response = r_ptr](size_t, const httplib::DataSink & sink) -> bool {
+        std::shared_ptr<server_http_req> q_ptr = std::move(request);
+        std::shared_ptr<server_http_res> r_ptr = std::move(response);
+
+        const auto chunked_content_provider = [response = r_ptr](size_t, httplib::DataSink & sink) -> bool {
            std::string chunk;
            const bool has_next = response->next(chunk);
            if (!chunk.empty()) {
+                // mirror into the ring buffer first, the session must reflect every SSE chunk
+                // whether or not the wire write below succeeds
+                if (response->spipe) {
+                    response->spipe->write(chunk.data(), chunk.size());
+                }
                if (!sink.write(chunk.data(), chunk.size())) {
+                    // peer is gone, stop the wire path here
                    return false;
                }
                SRV_DBG("http: streamed chunk: %s\n", chunk.c_str());
            }
            if (!has_next) {
+                // producer reached its natural end on the wire, a later close() skips the drain
+                if (response->spipe) {
+                    response->spipe->done();
+                }
                sink.done();
                SRV_DBG("%s", "http: stream ended\n");
            }
            return has_next;
        };
        const auto on_complete = [request = q_ptr, response = r_ptr](bool) mutable {
-            response.reset(); // trigger the destruction of the response object
-            request.reset();  // trigger the destruction of the request object
+            // on a dropped peer, close() drains the rest of the generation into the ring buffer
+            if (response->spipe) {
+                response->spipe->close();
+            }
+            response.reset(); // spipe destructor finalizes the session if attached
+            request.reset();
        };
        res.set_chunked_content_provider(content_type, chunked_content_provider, on_complete);
    } else {
@@ -3,6 +3,7 @@
 #include <atomic>
 #include <functional>
 #include <map>
+#include <memory>
 #include <string>
 #include <thread>
 #include <vector>
@@ -10,6 +11,7 @@
 #include <unordered_map>

 struct common_params;
+struct stream_pipe_producer; // defined in server-stream.h

 // generator-like API for HTTP response generation
 // this object response with one of the 2 modes:
@@ -23,12 +25,20 @@ struct server_http_res {
    std::string data;
    std::map<std::string, std::string> headers;

-    // TODO: move this to a virtual function once we have proper polymorphism support
+    // if set, the stream survives a client disconnect: the producer pipe keeps draining into the
+    // ring buffer and finalizes the session on destruction, so no explicit on_stream_end is needed.
+    // shared_ptr (not unique_ptr) so the forward-declared type is safe to delete here.
+    std::shared_ptr<stream_pipe_producer> spipe;
+
    std::function<bool(std::string &)> next = nullptr;
    bool is_stream() const {
        return next != nullptr;
    }

+    // called when the session is cancelled (e.g. DELETE /v1/stream/<conv_id>).
+    // server_res_generator overrides this to stop its reader; the default is a no-op.
+    virtual void stop() {}
+
    virtual ~server_http_res() = default;
 };

@@ -1,12 +1,14 @@
 #include "server-common.h"
 #include "server-models.h"
 #include "server-context.h"
+#include "server-stream.h"

 #include "build-info.h"
 #include "preset.h"
 #include "download.h"

 #include <cpp-httplib/httplib.h> // TODO: remove this once we use HTTP client from download.h
+#include <optional>
 #include <sheredom/subprocess.h>

 #include <functional>
@@ -92,6 +94,9 @@ struct server_subproc {
    }
 };

+// short loopback budget for the resumable stream router to child JSON calls (probe, lookup,
+// delete). distinct from params.timeout_read/write which only applies to the generation proxy
+static constexpr int STREAM_LOOKUP_TIMEOUT_MS = 250; // applied separately to connect, read and write

 static std::filesystem::path get_server_exec_path() {
 #if defined(_WIN32)
@@ -1580,6 +1585,45 @@ static bool is_autoload(const common_params & params, const server_http_req & re
    }
 }

+// percent-encode one query or path component: RFC 3986 unreserved characters pass through,
+// every other byte becomes %XX in upper-case hex. avoids reaching into httplib::detail
+static std::string encode_qs(const std::string & in) {
+    static const char HEX[] = "0123456789ABCDEF";
+    std::string out;
+    out.reserve(in.size() * 3);
+    for (const unsigned char ch : in) {
+        const bool is_alnum = (ch >= '0' && ch <= '9') || (ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z');
+        if (is_alnum || ch == '-' || ch == '_' || ch == '.' || ch == '~') {
+            out.push_back(char(ch));
+            continue;
+        }
+        out.push_back('%');
+        out.push_back(HEX[ch >> 4]);
+        out.push_back(HEX[ch & 0x0F]);
+    }
+    return out;
+}
+
+// find the ready child that owns a conversation's stream session. the conv_id -> model map is
+// filled when the POST is routed, so this is one tracker lookup followed by one meta lookup:
+// nothing is polled and the conv id itself is never parsed. nullopt means "no owner", callers
+// report not found and leave recovery to the client
+static std::optional<server_model_meta> resolve_child_for_conv(
+        server_models & models, const std::string & conversation_id) {
+    if (conversation_id.empty()) {
+        return std::nullopt;
+    }
+    const auto model_name = models.conv_models.lookup(conversation_id);
+    if (!model_name) {
+        return std::nullopt;
+    }
+    auto meta = models.get_meta(*model_name);
+    if (!meta.has_value() || !meta->is_ready()) {
+        return std::nullopt;
+    }
+    return meta;
+}
+
 void server_models_routes::init_routes() {
    this->get_router_props = [this](const server_http_req & req) {
        std::string name = req.get_param("model");
@@ -1628,6 +1672,12 @@ void server_models_routes::init_routes() {
        if (!router_validate_model(name, models, autoload, error_res)) {
            return error_res;
        }
+        // remember which child serves this conversation so the stream routes can route straight
+        // to it without polling, keyed on the exact conv id from the header
+        std::string conv_id = stream_conv_id_from_headers(req.headers);
+        if (!conv_id.empty()) {
+            models.conv_models.remember(conv_id, name);
+        }
        return models.proxy_request(req, method, name, true); // update last usage for POST request only
    };

@@ -1819,6 +1869,128 @@ void server_models_routes::init_routes() {
        res_ok(res, {{"success", true}});
        return res;
    };
+
+    this->router_stream_get = [this](const server_http_req & req) {
+        // GET /v1/stream/<conv_id>?from=N. look up the owning child in the conv_id -> model map
+        // and proxy the resume request to it, answer not found when nothing maps
+        auto res = std::make_unique<server_http_res>();
+        std::string conv_id = req.get_param("conv_id");
+        if (conv_id.empty()) {
+            res_err(res, format_error_response("Missing conversation id in path", ERROR_TYPE_INVALID_REQUEST));
+            return res;
+        }
+        std::optional<server_model_meta> owner = resolve_child_for_conv(models, conv_id);
+        if (!owner.has_value()) {
+            res_err(res, format_error_response("Stream not found or expired", ERROR_TYPE_NOT_FOUND));
+            return res;
+        }
+        std::string from = req.get_param("from");
+        std::string child_path = "/v1/stream/" + encode_qs(conv_id);
+        if (!from.empty()) {
+            child_path += "?from=" + encode_qs(from); // client supplied: escape it so it cannot add params or break the path
+        }
+        SRV_INF("proxying stream resume to model %s on port %d, path=%s\n",
+                owner->name.c_str(), owner->port, child_path.c_str());
+        auto proxy = std::make_unique<server_http_proxy>(
+                "GET",
+                "http",
+                CHILD_ADDR,
+                owner->port,
+                child_path,
+                req.headers,
+                req.body,
+                req.files,
+                req.should_stop,
+                params.timeout_read, // generation proxy timeouts, not STREAM_LOOKUP_TIMEOUT_MS
+                params.timeout_write);
+        return std::unique_ptr<server_http_res>(std::move(proxy));
+    };
+
+    this->router_streams_lookup = [this](const server_http_req & req) {
+        // POST /v1/streams/lookup. resolve each requested conv id to its owning child via the
+        // map, group the ids per child, and query only the children that actually own some of
+        // them instead of fanning out to every ready child. a child only answers for the ids
+        // it owns, never lists anything else
+        auto res = std::make_unique<server_http_res>();
+        std::vector<std::string> requested;
+        try {
+            json body = json::parse(req.body);
+            if (body.contains("conversation_ids") && body["conversation_ids"].is_array()) {
+                for (const auto & v : body["conversation_ids"]) {
+                    if (v.is_string() && !v.get<std::string>().empty()) { // non-string and empty entries are skipped
+                        requested.push_back(v.get<std::string>());
+                    }
+                }
+            }
+        } catch (const std::exception &) { // unparseable body: answer with an empty list rather than an error
+            res_ok(res, json::array());
+            return res;
+        }
+
+        // group requested ids by the child port that owns them, drop ids that map to nothing
+        std::unordered_map<int, json> per_child; // child port -> json array of conv ids
+        for (const auto & cid : requested) {
+            auto owner = resolve_child_for_conv(models, cid);
+            if (!owner.has_value()) {
+                continue;
+            }
+            per_child[owner->port].push_back(cid);
+        }
+
+        json aggregated = json::array();
+        for (auto & [port, ids] : per_child) { // children are queried one after another
+            json child_body = {{"conversation_ids", ids}};
+            httplib::Client cli(CHILD_ADDR, port);
+            cli.set_connection_timeout(0, STREAM_LOOKUP_TIMEOUT_MS * 1000); // (seconds, microseconds), hence ms * 1000
+            cli.set_read_timeout(0, STREAM_LOOKUP_TIMEOUT_MS * 1000);
+            cli.set_write_timeout(0, STREAM_LOOKUP_TIMEOUT_MS * 1000);
+            auto resp = cli.Post("/v1/streams/lookup", child_body.dump(), "application/json");
+            if (!resp || resp->status != 200) { // a failing child contributes nothing, the others still count
+                continue;
+            }
+            try {
+                json child_arr = json::parse(resp->body);
+                if (!child_arr.is_array()) {
+                    continue;
+                }
+                for (auto & entry : child_arr) {
+                    if (entry.is_object()) { // only object entries are forwarded to the client
+                        aggregated.push_back(entry);
+                    }
+                }
+            } catch (const std::exception &) { // malformed child response: skip this child
+                continue;
+            }
+        }
+        res_ok(res, aggregated);
+        return res;
+    };
+
+    this->router_stream_delete = [this](const server_http_req & req) {
+        // DELETE /v1/stream/<conv_id>. only the child recorded as owner in the map is contacted;
+        // repeating the call is harmless because evict_and_cancel is idempotent on the child
+        auto res = std::make_unique<server_http_res>();
+        const std::string conv_id = req.get_param("conv_id");
+        if (conv_id.empty()) {
+            res_err(res, format_error_response("Missing conversation id in path", ERROR_TYPE_INVALID_REQUEST));
+            return res;
+        }
+        if (const auto owner = resolve_child_for_conv(models, conv_id)) {
+            const std::string child_path = "/v1/stream/" + encode_qs(conv_id);
+            const auto        budget_us  = STREAM_LOOKUP_TIMEOUT_MS * 1000;
+            httplib::Client cli(CHILD_ADDR, owner->port);
+            cli.set_connection_timeout(0, budget_us);
+            cli.set_read_timeout(0, budget_us);
+            cli.set_write_timeout(0, budget_us);
+            auto resp = cli.Delete(child_path.c_str());
+            (void) resp; // best effort, 404 and network errors are equivalent to no op
+        }
+        // the session is going away either way, so the router stops tracking the conversation
+        models.conv_models.forget(conv_id);
+        res->status = 204;
+        res->content_type = "application/json";
+        return res;
+    };
 }


@@ -11,7 +11,10 @@
 #include <condition_variable>
 #include <functional>
 #include <memory>
+#include <optional>
 #include <set>
+#include <string>
+#include <unordered_map>

 /**
 * state diagram:
@@ -126,6 +129,44 @@ private:
    // if true, the next get_meta() will trigger a reload of model list
    bool need_reload = false;

+    // best-effort index from conv_id to the name of the model serving that conversation's
+    // stream session, so the resumable stream routes can address the owning child directly
+    // instead of polling every one. filled when a POST carrying an X-Conversation-Id is routed.
+    // a stale entry only makes the child answer not found. every method takes the tracker's own mutex
+    struct conv_model_tracker {
+        void remember(const std::string & conv_id, const std::string & model) {
+            // pairs with an empty id or an empty model name are not recorded
+            if (!conv_id.empty() && !model.empty()) {
+                std::lock_guard<std::mutex> guard(mu);
+                map.insert_or_assign(conv_id, model);
+            }
+        }
+
+        std::optional<std::string> lookup(const std::string & conv_id) {
+            if (conv_id.empty()) {
+                return std::nullopt;
+            }
+            std::lock_guard<std::mutex> guard(mu);
+            const auto hit = map.find(conv_id);
+            if (hit != map.end()) {
+                return hit->second;
+            }
+            return std::nullopt;
+        }
+
+        void forget(const std::string & conv_id) {
+            // erasing an unknown id is a no-op, so no existence check is needed
+            if (!conv_id.empty()) {
+                std::lock_guard<std::mutex> guard(mu);
+                map.erase(conv_id);
+            }
+        }
+
+      private:
+        std::mutex                                   mu;
+        std::unordered_map<std::string, std::string> map;
+    };
+
    common_preset_context ctx_preset;

    common_params base_params;
@@ -145,6 +186,9 @@ private:
    void notify_sse(const std::string & event, const std::string & model_id, const json & data = nullptr);

 public:
+    // conv_id -> model tracker for the resumable stream routes, owns its lock
+    conv_model_tracker conv_models;
+
    server_models(const common_params & params, int argc, char ** argv);

    server_response sse; // for real-time updates via SSE endpoint
@@ -268,6 +312,12 @@ struct server_models_routes {
    server_http_context::handler_t get_router_models_sse;
    server_http_context::handler_t post_router_models;
    server_http_context::handler_t del_router_models;
+
+    // router side handlers for the resumable streaming routes. each resolves the child that owns
+    // a conversation through the conv_id -> model map, no probing or fan out
+    server_http_context::handler_t router_stream_get;
+    server_http_context::handler_t router_streams_lookup;
+    server_http_context::handler_t router_stream_delete;
 };

 /**
@@ -287,7 +287,7 @@ std::vector<std::unique_ptr<field>> make_llama_cmpl_schema(const common_params &
        ->set_desc("Chat format used internally by the server")
        ->set_handler([&](field_eval_context & ctx, const json & data) {
            ctx.params.chat_parser_params.format = static_cast<common_chat_format>(data.at("chat_format").get<int>());
-            SRV_INF("Chat format: %s\n", common_chat_format_name(ctx.params.chat_parser_params.format));
+            SRV_TRC("chat format: %s\n", common_chat_format_name(ctx.params.chat_parser_params.format));
        }));

    add((new field_str("reasoning_format"))
@@ -0,0 +1,568 @@
+#include "server-stream.h"
+#include "server-common.h"
+#include "server-http.h"
+#include "server-queue.h"
+
+#include <chrono>
+#include <memory>
+#include <utility>
+
+namespace {
+constexpr int64_t STREAM_SESSION_TTL_SECONDS         = 300;             // finished sessions are kept this long
+constexpr size_t  STREAM_SESSION_MAX_BYTES           = 4 * 1024 * 1024; // per-session buffer cap
+constexpr int64_t STREAM_SESSION_GC_INTERVAL_SECONDS = 60;              // pause between gc sweeps
+constexpr int64_t STREAM_READ_WAKE_INTERVAL_MS       = 200;             // reader re-check period
+
+// wall-clock time as whole seconds since the unix epoch
+int64_t now_seconds() {
+    using namespace std::chrono;
+    const auto since_epoch = system_clock::now().time_since_epoch();
+    return duration_cast<seconds>(since_epoch).count();
+}
+}
+
+stream_session::stream_session(std::string conversation_id_, size_t max_bytes_)
+    : conversation_id(std::move(conversation_id_))
+    , started_ts(now_seconds()) // creation time, unix seconds
+    , prefix_dropped(0)         // no bytes discarded from the front yet
+    , cap_bytes(max_bytes_)     // upper bound on the buffered bytes, enforced in append()
+    , done(false)
+    , cancelled(false)
+    , completed_ts(0) {         // 0 means "not finalized", finalize() stores the real time
+    buffer.reserve(64 * 1024);  // pre-size the buffer before the first append
+}
+
+bool stream_session::append(const char * data, size_t len) {
+    // store a chunk, discarding the oldest bytes when the cap would be exceeded.
+    // returns false only when the session is already finalized
+    if (len == 0) { return true; }
+    {
+        std::lock_guard<std::mutex> guard(mu);
+        if (done.load(std::memory_order_relaxed)) {
+            return false; // a finalized session takes no more bytes
+        }
+        const size_t held = buffer.size();
+        if (len < cap_bytes) {
+            if (held + len > cap_bytes) {
+                const size_t overflow = held + len - cap_bytes;
+                prefix_dropped += overflow;
+                buffer.erase(buffer.begin(), buffer.begin() + overflow);
+            }
+            buffer.insert(buffer.end(), data, data + len);
+        } else {
+            // the chunk alone reaches the cap: everything held goes, plus the chunk's own head
+            const size_t head = len - cap_bytes;
+            prefix_dropped += held + head;
+            buffer.clear();
+            buffer.insert(buffer.end(), data + head, data + len);
+        }
+    }
+    cv.notify_all(); // signalled after the lock is released
+    return true;
+}
+
+void stream_session::finalize() {
+    // only the first caller gets past the exchange, repeated calls are no-ops
+    if (done.exchange(true, std::memory_order_acq_rel)) {
+        return;
+    }
+    completed_ts.store(now_seconds(), std::memory_order_release);
+    cv.notify_all(); // sent without mu; a reader that misses it still wakes on its wait_for timeout
+}
+
+stream_read_status stream_session::read_from(size_t offset,
+        const std::function<bool(const char *, size_t)> & sink,
+        const std::function<bool()> & should_stop) { // feeds sink with the bytes from logical offset onward
+    std::unique_lock<std::mutex> lock(mu);
+    while (true) {
+        if (should_stop && should_stop()) { // note: the predicate runs with mu held
+            return stream_read_status::OK;
+        }
+        if (offset < prefix_dropped) { // the requested bytes were already discarded by append()
+            return stream_read_status::OFFSET_LOST;
+        }
+        size_t logical_end = prefix_dropped + buffer.size();
+        if (offset < logical_end) {
+            size_t local_off = offset - prefix_dropped; // buffer index of the logical offset
+            size_t n         = buffer.size() - local_off;
+            // copy the available chunk under the lock, release before calling the sink
+            std::vector<char> chunk(buffer.begin() + local_off, buffer.begin() + local_off + n);
+            offset += n;
+            lock.unlock();
+            bool keep_going = sink(chunk.data(), chunk.size());
+            if (!keep_going) { // sink asked to stop, reported as OK just like a should_stop exit
+                return stream_read_status::OK;
+            }
+            lock.lock();
+            continue; // re-check should_stop and look for more bytes before waiting
+        }
+        if (done.load(std::memory_order_acquire)) { // caught up and the session is finalized
+            return stream_read_status::OK;
+        }
+        // wait for new bytes, finalize, or a periodic wake to re check should_stop
+        cv.wait_for(lock, std::chrono::milliseconds(STREAM_READ_WAKE_INTERVAL_MS));
+    }
+}
+
+bool stream_session::is_done() const {
+    return done.load(std::memory_order_acquire); // true once finalize() has run
+}
+
+size_t stream_session::total_size() const {
+    std::lock_guard<std::mutex> lock(mu);
+    return prefix_dropped + buffer.size(); // logical length: bytes discarded from the front plus bytes still buffered
+}
+
+size_t stream_session::dropped_prefix() const {
+    std::lock_guard<std::mutex> lock(mu);
+    return prefix_dropped; // offsets below this value make read_from() return OFFSET_LOST
+}
+
+int64_t stream_session::completed_at() const {
+    return completed_ts.load(std::memory_order_acquire); // unix seconds stored by finalize(), 0 while still open
+}
+
+void stream_session::set_stop_producer(std::function<void()> fn) {
+    std::lock_guard<std::mutex> lock(mu); // same mutex cancel() takes when it copies the hook
+    stop_producer = std::move(fn);        // replaces any previous hook, an empty function clears it
+}
+
+void stream_session::cancel() {
+    // flip cancelled first so the producer-side stream_aware_should_stop can break out of the
+    // recv() wait even if remove_waiting_task_ids does not notify the condvar (the cancel task
+    // posted by rd.stop() will eventually notify, but we do not want to depend on that timing)
+    cancelled.store(true, std::memory_order_release);
+    // copy the hook under the lock then invoke outside, the producer side may grab queue locks
+    // and we do not want to hold our mu across that path
+    std::function<void()> fn;
+    {
+        std::lock_guard<std::mutex> lock(mu);
+        fn = stop_producer;
+    }
+    if (fn) { // empty when no hook is installed, e.g. after set_stop_producer(nullptr)
+        fn();
+    }
+}
+
+bool stream_session::is_cancelled() const {
+    return cancelled.load(std::memory_order_acquire); // flag is set by cancel()
+}
+
+stream_session_manager::stream_session_manager()
+    : running(false) { // the gc thread is only spawned by start_gc()
+}
+
+stream_session_manager::~stream_session_manager() {
+    stop_gc(); // joins the gc thread if it is running and finalizes every remaining session
+}
+
+stream_session_ptr stream_session_manager::create_or_replace(const std::string & conversation_id) {
+    // invariant: a conversation has at most one live session. installing a new one therefore
+    // displaces whatever was registered before, and the displaced session's producer gets
+    // cancelled so it stops generating for a stream nobody can reach any more
+    auto fresh = std::make_shared<stream_session>(conversation_id, STREAM_SESSION_MAX_BYTES);
+    stream_session_ptr displaced;
+    {
+        std::unique_lock<std::shared_mutex> guard(map_mu);
+        // try_emplace leaves an existing entry untouched, in which case it is swapped by hand
+        const auto [slot, inserted] = sessions.try_emplace(conversation_id, fresh);
+        if (!inserted) {
+            displaced = std::exchange(slot->second, fresh);
+        }
+    }
+    // cancel and finalize the displaced session without holding the map lock
+    if (displaced) {
+        displaced->cancel();
+        displaced->finalize();
+    }
+    return fresh;
+}
+
+stream_session_ptr stream_session_manager::get(const std::string & conversation_id) {
+    // lookups only take the shared lock, so they can run concurrently with each other.
+    // an unknown conversation yields a null pointer
+    std::shared_lock<std::shared_mutex> guard(map_mu);
+    const auto found   = sessions.find(conversation_id);
+    const bool missing = found == sessions.end();
+    return missing ? stream_session_ptr() : found->second;
+}
+
+std::vector<stream_session_ptr> stream_session_manager::list_all() const {
+    std::shared_lock<std::shared_mutex> guard(map_mu); // held while every tracked pointer is copied
+    std::vector<stream_session_ptr> snapshot;
+    snapshot.reserve(sessions.size());
+    for (const auto & entry : sessions) {
+        snapshot.push_back(entry.second);
+    }
+    return snapshot;
+}
+
+void stream_session_manager::evict(const std::string & conversation_id) {
+    // pull the entry out of the map under the exclusive lock; the extracted node keeps the
+    // session alive until this function returns
+    auto node = [&] {
+        std::unique_lock<std::shared_mutex> guard(map_mu);
+        return sessions.extract(conversation_id);
+    }();
+    if (node.empty()) {
+        return; // nothing tracked under this id
+    }
+    // finalize outside the map lock so any pending readers wake up and exit
+    // (this variant does not cancel the producer)
+    node.mapped()->finalize();
+}
+
+void stream_session_manager::evict_and_cancel(const std::string & conversation_id) {
+    // pull the entry out of the map under the exclusive lock, act on it after the lock is gone
+    auto node = [&] {
+        std::unique_lock<std::shared_mutex> guard(map_mu);
+        return sessions.extract(conversation_id);
+    }();
+    // unknown conversation: nothing to cancel
+    if (node.empty()) {
+        return;
+    }
+    // order matters: cancel() first so the inference is stopped at the queue level, then
+    // finalize(), which wakes any pending HTTP reader and lets the drain exit naturally
+    const stream_session_ptr & session = node.mapped();
+    session->cancel();
+    session->finalize();
+}
+
+void stream_session_manager::start_gc() {
+    // idempotent: only the caller that flips running from false to true spawns the thread
+    if (!running.exchange(true)) {
+        gc_thread = std::thread(&stream_session_manager::gc_loop, this);
+    }
+}
+
+void stream_session_manager::stop_gc() {
+    bool was_running = running.exchange(false);
+    if (was_running) {
+        {
+            std::lock_guard<std::mutex> lock(gc_wake_mu); // lock/unlock so gc_loop is either before its predicate check or already waiting
+        }
+        gc_wake_cv.notify_all();
+        if (gc_thread.joinable()) {
+            gc_thread.join();
+        }
+    }
+    // finalize all live sessions so no reader ever hangs
+    std::vector<stream_session_ptr> snapshot; // this part runs even when the gc thread was never started
+    {
+        std::unique_lock<std::shared_mutex> lock(map_mu);
+        snapshot.reserve(sessions.size());
+        for (auto & kv : sessions) {
+            snapshot.push_back(kv.second);
+        }
+        sessions.clear();
+    }
+    for (auto & s : snapshot) { // finalize after the map lock is released
+        s->finalize();
+    }
+}
+
+void stream_session_manager::gc_loop() { // body of the thread spawned by start_gc()
+    while (running.load(std::memory_order_acquire)) {
+        {
+            std::unique_lock<std::mutex> lock(gc_wake_mu);
+            gc_wake_cv.wait_for(lock,
+                std::chrono::seconds(STREAM_SESSION_GC_INTERVAL_SECONDS),
+                [this] { return !running.load(std::memory_order_acquire); }); // ends the wait early once stop_gc() clears running
+        }
+        if (!running.load(std::memory_order_acquire)) {
+            return;
+        }
+        int64_t cutoff = now_seconds() - STREAM_SESSION_TTL_SECONDS; // sessions completed at or before this are expired
+        std::vector<stream_session_ptr> to_drop;
+        {
+            std::unique_lock<std::shared_mutex> lock(map_mu);
+            for (auto it = sessions.begin(); it != sessions.end(); ) {
+                int64_t completed = it->second->completed_at();
+                if (completed != 0 && completed <= cutoff) { // 0 = not finalized yet, such sessions are never collected here
+                    to_drop.push_back(it->second);
+                    it = sessions.erase(it);
+                } else {
+                    ++it;
+                }
+            }
+        }
+        // finalize outside the map lock, idempotent if the session was already done
+        for (auto & s : to_drop) {
+            s->finalize();
+        }
+    }
+}
+
+// the single process wide manager instance (declared extern in server-stream.h). its lifecycle
+// is controlled by llama-server main() via start_gc/stop_gc
+stream_session_manager g_stream_sessions;
+
+// stream_pipe ---------------------------------------------------------------------------------
+
+// base constructor, takes ownership of one reference to the session
+stream_pipe::stream_pipe(stream_session_ptr session)
+    : session_{std::move(session)} {}
+
+// forwards to the session's cancelled flag
+bool stream_pipe::is_cancelled() const {
+    const bool cancelled = session_->is_cancelled();
+    return cancelled;
+}
+
+// stream_pipe_producer
+
+// private constructor, instances are built through create() which also wires alive_ and res_
+stream_pipe_producer::stream_pipe_producer(stream_session_ptr session)
+    : stream_pipe{std::move(session)} {}
+
+// producer teardown: disarm the stop hook first, then mark the session complete
+stream_pipe_producer::~stream_pipe_producer() {
+    this->cleanup();
+    this->session_->finalize();
+}
+
+// disarm the stop hook. alive_ doubles as the "already cleaned up" marker, so calling this
+// again after the first run does nothing
+void stream_pipe_producer::cleanup() {
+    if (alive_) {
+        // flip the guard before detaching so a hook copy that still runs sees alive == false
+        alive_->store(false, std::memory_order_release);
+        session_->set_stop_producer(nullptr);
+        alive_.reset();
+    }
+}
+
+// append len bytes to the session buffer, returns whatever stream_session::append() reports
+bool stream_pipe_producer::write(const char * data, size_t len) {
+    const bool accepted = session_->append(data, len);
+    return accepted;
+}
+
+// record that the stream ended naturally, close() checks this flag and skips its drain
+void stream_pipe_producer::done() {
+    this->done_ = true;
+}
+
+// pump whatever the response generator still produces into the session buffer.
+// skipped entirely when done() was called or the session is cancelled
+void stream_pipe_producer::close() {
+    // httplib bails its content provider the moment is_peer_alive() goes false, so pump the rest
+    // of the generation into the ring buffer here. a DELETE flips is_cancelled and cuts it short
+    if (done_ || session_->is_cancelled()) {
+        SRV_TRC("stream_pipe close: skip drain (done=%d cancelled=%d) conv=%s\n",
+                done_ ? 1 : 0, session_->is_cancelled() ? 1 : 0, session_->conversation_id.c_str());
+        return;
+    }
+    SRV_TRC("stream_pipe close: draining conv=%s\n", session_->conversation_id.c_str());
+
+    size_t      total_bytes = 0;
+    std::string piece;
+    bool        more = true;
+    while (more) {
+        piece.clear();
+        more = res_->next(piece);
+        if (piece.empty()) {
+            continue;
+        }
+        // the final call may still carry data, it is forwarded before the loop ends
+        write(piece.data(), piece.size());
+        total_bytes += piece.size();
+    }
+    SRV_TRC("stream_pipe close: drain ended conv=%s bytes=%zu\n", session_->conversation_id.c_str(), total_bytes);
+}
+
+// factory: installs the session's stop hook, then builds the producer around the session.
+// the hook calls res.stop() only while the shared alive flag is still true, cleanup() clears it
+std::shared_ptr<stream_pipe_producer> stream_pipe_producer::create(stream_session_ptr session,
+                                                                   server_http_res & res) {
+    server_http_res * const target = &res;
+    auto guard = std::make_shared<std::atomic<bool>>(true);
+
+    // the hook keeps its own copy of the guard, so it stays valid after the pipe is gone
+    session->set_stop_producer([guard, target]() {
+        if (!guard->load(std::memory_order_acquire)) {
+            return;
+        }
+        target->stop();
+    });
+
+    std::shared_ptr<stream_pipe_producer> producer(new stream_pipe_producer(std::move(session)));
+    producer->alive_ = std::move(guard);
+    producer->res_   = target;
+    return producer;
+}
+
+// stream_pipe_consumer
+
+// private constructor, instances are built through create()
+stream_pipe_consumer::stream_pipe_consumer(stream_session_ptr session)
+    : stream_pipe{std::move(session)} {}
+
+// thin forwarder to stream_session::read_from. offset is handed over by value, so this call
+// does not advance it, the caller's sink is expected to do that
+stream_read_status stream_pipe_consumer::read(size_t & offset,
+        const std::function<bool(const char *, size_t)> & sink,
+        const std::function<bool()> & should_stop) {
+    const stream_read_status status = session_->read_from(offset, sink, should_stop);
+    return status;
+}
+
+// factory, needed because the constructor is private
+std::shared_ptr<stream_pipe_consumer> stream_pipe_consumer::create(stream_session_ptr session) {
+    std::shared_ptr<stream_pipe_consumer> consumer(new stream_pipe_consumer(std::move(session)));
+    return consumer;
+}
+
+// helper, wraps format_error_response(message, type) into a fresh JSON http response.
+// the HTTP status is taken from the "code" field of the formatted error, `status` is only the
+// fallback used when that field is absent
+static server_http_res_ptr make_error_response(int status, const std::string & message, error_type type) {
+    const json payload = format_error_response(message, type);
+
+    auto response = std::make_unique<server_http_res>();
+    response->status       = json_value(payload, "code", status);
+    response->content_type = "application/json; charset=utf-8";
+    response->data         = safe_json_to_str({{"error", payload}});
+    return response;
+}
+
+// builds the handler for GET /v1/stream/<conv_id>?from=N.
+// responses: 400 on a missing conv_id, on a malformed `from` or when `from` is below the dropped
+// prefix, 404 when no session is registered under conv_id, otherwise 200 text/event-stream
+server_http_context::handler_t make_stream_get_handler() {
+    return [](const server_http_req & req) -> server_http_res_ptr {
+        // GET /v1/stream/<conv_id>?from=N replays the SSE bytes already buffered for the
+        // session, blocks for more bytes when the session is still running, returns when
+        // the session is finalized. the body is streamed back as text/event-stream so the
+        // browser EventSource can attach to it like a fresh request
+        std::string conv_id = req.get_param("conv_id");
+        if (conv_id.empty()) {
+            return make_error_response(400, "Missing conversation id in path", ERROR_TYPE_INVALID_REQUEST);
+        }
+        auto session = g_stream_sessions.get(conv_id);
+        if (!session) {
+            return make_error_response(404, "Stream not found or expired", ERROR_TYPE_NOT_FOUND);
+        }
+        size_t from = 0;
+        const std::string from_str = req.get_param("from");
+        if (!from_str.empty()) {
+            // std::stoull on its own is too lenient for a byte offset: it skips leading
+            // whitespace, accepts a sign ("-1" silently wraps to ULLONG_MAX) and ignores
+            // trailing garbage ("12abc" parses as 12). only plain decimal digits are valid here
+            if (from_str.find_first_not_of("0123456789") != std::string::npos) {
+                return make_error_response(400, "Invalid 'from' offset", ERROR_TYPE_INVALID_REQUEST);
+            }
+            try {
+                const unsigned long long parsed = std::stoull(from_str);
+                from = static_cast<size_t>(parsed);
+                // reject values that do not survive the narrowing to size_t
+                if (static_cast<unsigned long long>(from) != parsed) {
+                    return make_error_response(400, "Invalid 'from' offset", ERROR_TYPE_INVALID_REQUEST);
+                }
+            } catch (const std::exception &) {
+                // std::out_of_range for digit strings larger than ULLONG_MAX
+                return make_error_response(400, "Invalid 'from' offset", ERROR_TYPE_INVALID_REQUEST);
+            }
+        }
+        if (from < session->dropped_prefix()) {
+            return make_error_response(400, "Stream offset lost, please restart", ERROR_TYPE_INVALID_REQUEST);
+        }
+        auto res = std::make_unique<server_http_res>();
+        res->status = 200;
+        res->content_type = "text/event-stream";
+        // the next closure reads from the ring buffer at the requested offset, blocks until
+        // bytes arrive or the session finalizes. exit each call after draining the available
+        // chunk so set_chunked_content_provider gets a chance to flush to the socket
+        auto offset_ptr = std::make_shared<size_t>(from);
+        // consumer pipe: read-only, does not finalize the session on destruction
+        auto pipe = stream_pipe_consumer::create(session);
+        // NOTE(review): req is captured by reference, this assumes the request object outlives
+        // every call of the closure, confirm against the http layer
+        res->next = [pipe, offset_ptr, &req](std::string & output) -> bool {
+            bool got_any = false;
+            // the read status is intentionally not inspected: when nothing was delivered
+            // got_any stays false and the closure reports the end of the body
+            pipe->read(*offset_ptr,
+                [&](const char * d, size_t n) {
+                    output.append(d, n);
+                    // read() does not advance the offset itself, keep it in sync here
+                    *offset_ptr += n;
+                    got_any = true;
+                    return false;
+                },
+                req.should_stop);
+            return got_any;
+        };
+        return res;
+    };
+}
+
+// builds the handler for POST /v1/streams/lookup.
+// responses: 400 when the body is not valid JSON, otherwise 200 with a JSON array holding one
+// entry per matching session. a body without a "conversation_ids" array yields an empty array
+server_http_context::handler_t make_streams_lookup_handler() {
+    return [](const server_http_req & req) -> server_http_res_ptr {
+        // POST /v1/streams/lookup with body {"conversation_ids": ["X", "Y", ...]} returns the
+        // matching sessions, only for ids the caller already knows. each id matches the exact key
+        // and any "<id>::<model>" variant, so one lookup covers every per model session for a conv
+        std::vector<std::string> requested;
+        try {
+            json body = json::parse(req.body);
+            if (body.contains("conversation_ids") && body["conversation_ids"].is_array()) {
+                for (const auto & v : body["conversation_ids"]) {
+                    // non string and empty entries are silently ignored
+                    if (v.is_string()) {
+                        std::string id = v.get<std::string>();
+                        if (!id.empty()) {
+                            requested.push_back(std::move(id));
+                        }
+                    }
+                }
+            }
+        } catch (const std::exception & e) {
+            auto res = std::make_unique<server_http_res>();
+            res->status = 400;
+            res->content_type = "application/json; charset=utf-8";
+            res->data = safe_json_to_str({{"error", {{"message", std::string("invalid body: ") + e.what()},
+                                                     {"type", "invalid_request_error"}}}});
+            return res;
+        }
+
+        std::vector<stream_session_ptr> sessions;
+        if (!requested.empty()) {
+            // walk the sessions on the outside and stop at the first matching id, so a session
+            // is reported at most once even when the request repeats an id or lists both "X"
+            // and "X::model". results follow the list_all() order
+            for (auto & s : g_stream_sessions.list_all()) {
+                const std::string & cid = s->conversation_id;
+                for (const auto & rid : requested) {
+                    const bool exact   = cid == rid;
+                    // "<rid>::" prefix test without building a temporary string per pair
+                    const bool variant = cid.size() >= rid.size() + 2 &&
+                                         cid.compare(0, rid.size(), rid) == 0 &&
+                                         cid.compare(rid.size(), 2, "::") == 0;
+                    if (exact || variant) {
+                        sessions.push_back(s);
+                        break;
+                    }
+                }
+            }
+        }
+
+        json arr = json::array();
+        for (auto & s : sessions) {
+            arr.push_back({
+                {"conversation_id", s->conversation_id},
+                {"is_done",         s->is_done()},
+                {"total_bytes",     s->total_size()},
+                {"started_at",      s->started_ts},
+                {"completed_at",    s->completed_at()},
+            });
+        }
+        auto res = std::make_unique<server_http_res>();
+        res->status = 200;
+        res->content_type = "application/json; charset=utf-8";
+        res->data = safe_json_to_str(arr);
+        return res;
+    };
+}
+
+// builds the handler for DELETE /v1/stream/<conv_id>.
+// responses: 400 on a missing conv_id, 204 in every other case
+server_http_context::handler_t make_stream_delete_handler() {
+    return [](const server_http_req & req) -> server_http_res_ptr {
+        // DELETE /v1/stream/<conv_id> is the explicit user Stop, cancels the producer hook
+        // wired by handle_completions_impl and evicts the buffer. idempotent, a session that
+        // already finalized or was never created returns 204 either way
+        const std::string conv_id = req.get_param("conv_id");
+        if (conv_id.empty()) {
+            return make_error_response(400, "Missing conversation id in path", ERROR_TYPE_INVALID_REQUEST);
+        }
+
+        SRV_TRC("DELETE /v1/stream/%s -> evict_and_cancel\n", conv_id.c_str());
+        g_stream_sessions.evict_and_cancel(conv_id);
+
+        auto reply = std::make_unique<server_http_res>();
+        reply->content_type = "application/json";
+        reply->status       = 204;
+        return reply;
+    };
+}
+
+// return the value of the first header whose name equals "x-conversation-id" when compared
+// case-insensitively (ASCII only), or an empty string when no header matches
+std::string stream_conv_id_from_headers(const std::map<std::string, std::string> & headers) {
+    static constexpr char   wanted[]   = "x-conversation-id";
+    static constexpr size_t wanted_len = sizeof(wanted) - 1;
+
+    // fold A-Z to a-z, every other byte is left untouched
+    const auto to_lower = [](char ch) -> char {
+        return (ch >= 'A' && ch <= 'Z') ? char(ch + 32) : ch;
+    };
+
+    for (const auto & entry : headers) {
+        const std::string & name = entry.first;
+        if (name.size() != wanted_len) {
+            continue;
+        }
+        size_t matched = 0;
+        while (matched < wanted_len && to_lower(name[matched]) == wanted[matched]) {
+            ++matched;
+        }
+        if (matched == wanted_len) {
+            return entry.second;
+        }
+    }
+    return std::string();
+}
+
+// when the request carries a conversation id header, install a fresh session for it and hang a
+// producer pipe on res. without the header res is left untouched
+void stream_session_attach_pipe(server_http_res & res, const std::map<std::string, std::string> & headers) {
+    const std::string conversation_id = stream_conv_id_from_headers(headers);
+    const bool        has_id          = !conversation_id.empty();
+    SRV_TRC("conv_id=%s (empty=%d)\n", conversation_id.c_str(), conversation_id.empty() ? 1 : 0);
+    if (has_id) {
+        res.spipe = stream_pipe_producer::create(g_stream_sessions.create_or_replace(conversation_id), res);
+    }
+}
+
+// build a should_stop predicate: with a pipe attached to res only the session's cancelled flag
+// counts, without one the decision is delegated to fallback. res is held as a raw pointer and
+// is dereferenced on every call
+std::function<bool()> stream_aware_should_stop(server_http_res * res, std::function<bool()> fallback) {
+    return [res, fallback = std::move(fallback)]() -> bool {
+        return res->spipe ? res->spipe->is_cancelled() : fallback();
+    };
+}
@@ -0,0 +1,203 @@
+#pragma once
+
+#include "server-http.h"
+
+#include <atomic>
+#include <condition_variable>
+#include <cstddef>
+#include <cstdint>
+#include <functional>
+#include <memory>
+#include <mutex>
+#include <shared_mutex>
+#include <string>
+#include <thread>
+#include <unordered_map>
+#include <vector>
+
+// result of stream_session::read_from / stream_pipe_consumer::read
+enum class stream_read_status {
+    OK,          // clean exit
+    OFFSET_LOST, // the requested offset fell below the dropped prefix
+};
+
+// streaming buffer for one generation, survives HTTP disconnect. the producer appends raw SSE
+// bytes, readers drain from any offset via read_from and block until more bytes or finalize.
+// keyed by conversation_id: one conv = at most one live session.
+// non copyable, shared through stream_session_ptr
+struct stream_session {
+    std::string conversation_id;
+    int64_t     started_ts; // unix seconds at construction, reported as "started_at" by POST /v1/streams/lookup
+
+    stream_session(std::string conversation_id_, size_t max_bytes_);
+    stream_session(const stream_session &)             = delete;
+    stream_session & operator=(const stream_session &) = delete;
+
+    // append raw bytes, drops from the front if the cap is reached.
+    // returns false if the session is already finalized
+    bool append(const char * data, size_t len);
+
+    // mark the session as complete, wakes all pending readers
+    void finalize();
+
+    // drain bytes from offset, calling sink for each chunk. blocks until more
+    // bytes arrive or finalize is called. returns OK on clean exit, OFFSET_LOST
+    // if offset falls below the dropped prefix.
+    // offset is taken by value, the caller tracks how far it has read
+    stream_read_status read_from(size_t offset,
+        const std::function<bool(const char *, size_t)> & sink,
+        const std::function<bool()> & should_stop);
+
+    bool    is_done() const;
+    bool    is_cancelled() const;
+    size_t  total_size() const;     // bytes that ever entered the session
+    size_t  dropped_prefix() const; // bytes evicted from the front due to cap
+    int64_t completed_at() const;   // 0 while alive, unix seconds after finalize
+
+    // attach the producer stop hook used to cancel its reader, pass an empty function to detach
+    void set_stop_producer(std::function<void()> fn);
+
+    // signal the producer to abort its inference asap via the stop hook, idempotent
+    void cancel();
+
+private:
+    mutable std::mutex      mu;
+    std::condition_variable cv;
+    std::vector<char>       buffer;
+    size_t                  prefix_dropped;
+    size_t                  cap_bytes;
+    std::atomic<bool>       done;
+    std::atomic<bool>       cancelled;
+    std::atomic<int64_t>    completed_ts;
+    std::function<void()>   stop_producer; // protected by mu
+};
+
+using stream_session_ptr = std::shared_ptr<stream_session>;
+
+// one end of a stream_session pipe. the base holds the session and the shared query, the
+// producer and consumer ends derive from it. virtual dtor so each end runs its own teardown:
+// the producer finalizes the session, the consumer leaves it untouched
+struct stream_pipe {
+    virtual ~stream_pipe() = default;
+
+    // true if the session was cancelled (e.g. via DELETE /v1/stream/<conv_id>)
+    bool is_cancelled() const;
+
+protected:
+    // protected: only the derived ends construct a pipe
+    explicit stream_pipe(stream_session_ptr session);
+
+    // shared ownership, keeps the session alive for as long as this end exists
+    stream_session_ptr session_;
+};
+
+// producer end: writes chunks into the ring buffer and owns the session lifetime, finalizing it
+// on destruction.
+//
+// lifetime safety: holds a shared_ptr<atomic<bool>> alive also captured by the session's
+// stop_producer hook. cleanup() sets alive=false and clears the hook; it must run while the
+// response the hook calls stop() on is still alive. ~server_res_generator() does this explicitly.
+struct stream_pipe_producer : stream_pipe {
+    // runs cleanup() and then finalizes the session
+    ~stream_pipe_producer() override;
+
+    // append raw bytes to the session's ring buffer, returns false if already finalized
+    bool write(const char * data, size_t len);
+
+    // mark the natural end on the wire so a later close() is a no-op
+    void done();
+
+    // on a peer drop, pump the response next() into the ring buffer until done. runs on the http
+    // worker from on_complete, no-op after done() or cancel
+    void close();
+
+    // disarm the stop hook and drop the alive guard, must run while the response the hook
+    // references is still alive. idempotent, the destructor calls it too
+    void cleanup();
+
+    // res.stop() is invoked when the session is cancelled, the alive guard ensures stop() is not
+    // called after cleanup() has run.
+    // res is stored as a raw pointer, it has to outlive every close() call on the returned pipe
+    static std::shared_ptr<stream_pipe_producer> create(stream_session_ptr session, server_http_res & res);
+
+private:
+    explicit stream_pipe_producer(stream_session_ptr session);
+
+    bool                                done_ = false;  // set by done(), checked by close()
+    std::shared_ptr<std::atomic<bool>>  alive_;         // shared with the stop hook, null after cleanup()
+    server_http_res *                   res_ = nullptr; // not owned, set by create()
+};
+
+// consumer end: read-only replay of the ring buffer, the destructor does not finalize the session
+struct stream_pipe_consumer : stream_pipe {
+    // drain bytes from offset, calling sink for each available chunk. blocks until more data
+    // arrives or the session finalizes. should_stop is polled, returns OFFSET_LOST if offset
+    // fell below the dropped prefix.
+    // offset is forwarded to stream_session::read_from by value, read() does not advance it,
+    // the caller has to bump it (typically from inside sink)
+    stream_read_status read(size_t & offset,
+        const std::function<bool(const char *, size_t)> & sink,
+        const std::function<bool()> & should_stop);
+
+    // the only way to build a consumer, the constructor is private
+    static std::shared_ptr<stream_pipe_consumer> create(stream_session_ptr session);
+
+private:
+    explicit stream_pipe_consumer(stream_session_ptr session);
+};
+
+// owns all live sessions, runs a periodic GC to evict expired ones.
+// the map is keyed by conversation_id, so the invariant "one conv = at most one
+// live session" is enforced at the type level
+class stream_session_manager {
+public:
+    stream_session_manager();
+    ~stream_session_manager();
+
+    stream_session_manager(const stream_session_manager &)             = delete;
+    stream_session_manager & operator=(const stream_session_manager &) = delete;
+
+    // install a new session for this conversation, evicting and cancelling any previous one.
+    // the conversation_id must be non empty, the caller is responsible for that check.
+    // returns the new session
+    stream_session_ptr create_or_replace(const std::string & conversation_id);
+
+    // lookup, returns null if unknown or already evicted
+    stream_session_ptr get(const std::string & conversation_id);
+
+    // list every live or recently completed session, used by the POST /v1/streams/lookup
+    // handler which filters the result against the ids in the request
+    std::vector<stream_session_ptr> list_all() const;
+
+    // remove from the map and finalize, wakes any pending readers. no-op for an unknown id
+    void evict(const std::string & conversation_id);
+
+    // signal the producer to cancel asap then evict, used by the explicit user Stop path.
+    // no-op for an unknown id
+    void evict_and_cancel(const std::string & conversation_id);
+
+    // start the GC thread, does nothing when it is already running
+    void start_gc();
+    // stop and join the GC thread if it runs, then finalize and drop every remaining session
+    void stop_gc();
+
+private:
+    // GC thread body, wakes up periodically and evicts sessions completed longer than the TTL ago
+    void gc_loop();
+
+    mutable std::shared_mutex                           map_mu;     // guards sessions
+    std::unordered_map<std::string, stream_session_ptr> sessions; // key: conversation_id
+    std::thread                                         gc_thread;
+    std::atomic<bool>                                   running;    // true while the GC thread should keep going
+    std::mutex                                          gc_wake_mu; // paired with gc_wake_cv
+    std::condition_variable                             gc_wake_cv; // lets stop_gc interrupt the GC sleep
+};
+
+// process wide manager, linked by both llama-server and llama-cli. llama-server main() drives
+// start_gc/stop_gc, llama-cli leaves it idle. the dtor calls stop_gc() unconditionally so exit
+// is safe whether or not the GC thread ran
+extern stream_session_manager g_stream_sessions;
+
+// route handler factories operating on g_stream_sessions, wired by server.cpp as
+// GET and DELETE /v1/stream/:conv_id and POST /v1/streams/lookup.
+// keeps the resumable stream surface confined to server-stream
+server_http_context::handler_t make_stream_get_handler();
+server_http_context::handler_t make_streams_lookup_handler();
+server_http_context::handler_t make_stream_delete_handler();
+
+// extract the X-Conversation-Id header value (case-insensitive), empty when absent. exposed so
+// the router can track which child serves a forwarded POST
+std::string stream_conv_id_from_headers(const std::map<std::string, std::string> & headers);
+
+// on an X-Conversation-Id header, create or replace the session and attach a producer pipe to
+// res. no-op when absent, called from the server_res_generator constructor
+void stream_session_attach_pipe(server_http_res & res, const std::map<std::string, std::string> & headers);
+
+// should_stop closure that ignores peer disconnect when a pipe is attached, so only an explicit
+// DELETE stops the producer and generation keeps flowing into the ring buffer. without a pipe it
+// delegates to fallback, the legacy non-resumable flow.
+// res is captured as a raw pointer and must stay valid for as long as the closure is called
+std::function<bool()> stream_aware_should_stop(server_http_res * res, std::function<bool()> fallback);
@@ -1626,7 +1626,7 @@ server_prompt * server_prompt_cache::alloc(const server_prompt & prompt, size_t
        const int cur_lcp_len = it->tokens.get_common_prefix(prompt.tokens);

        if (cur_lcp_len == (int) prompt.tokens.size()) {
-            SRV_INF("%s", " - prompt is already in the cache, skipping\n");
+            SRV_TRC("%s", " - prompt is already in the cache, skipping\n");
            return nullptr;
        }
    }
@@ -1636,7 +1636,7 @@ server_prompt * server_prompt_cache::alloc(const server_prompt & prompt, size_t
        const int len = it->tokens.get_common_prefix(prompt.tokens);

        if (len == (int) it->tokens.size()) {
-            SRV_WRN(" - removing obsolete cached prompt with length %d\n", len);
+            SRV_TRC(" - removing obsolete cached prompt with length %d\n", len);

            it = states.erase(it);
        } else {
@@ -1681,7 +1681,7 @@ bool server_prompt_cache::load(server_prompt & prompt, const server_tokens & tok
    float f_keep_best = prompt.tokens.size() > 0 ? float(lcp_best) / prompt.tokens.size() : -1.0f; // empty slot: any cache entry wins
    float sim_best    = float(lcp_best) / tokens_new.size();

-    SRV_INF(" - looking for better prompt, base f_keep = %.3f, sim = %.3f\n", f_keep_best, sim_best);
+    SRV_TRC(" - looking for better prompt, base f_keep = %.3f, sim = %.3f\n", f_keep_best, sim_best);

    auto it_best = states.end();

@@ -1706,7 +1706,7 @@ bool server_prompt_cache::load(server_prompt & prompt, const server_tokens & tok
    }

    if (it_best != states.end()) {
-        SRV_INF(" - found better prompt with f_keep = %.3f, sim = %.3f\n", f_keep_best, sim_best);
+        SRV_TRC(" - found better prompt with f_keep = %.3f, sim = %.3f\n", f_keep_best, sim_best);

        {
            auto & data = it_best->data.main;
@@ -1783,11 +1783,11 @@ void server_prompt_cache::update() {
        }
    }

-    SRV_INF(" - cache state: %zu prompts, %.3f MiB (limits: %.3f MiB, %zu tokens, %zu est)\n",
+    SRV_TRC(" - cache state: %zu prompts, %.3f MiB (limits: %.3f MiB, %zu tokens, %zu est)\n",
            states.size(), size() / (1024.0 * 1024.0), limit_size / (1024.0 * 1024.0), limit_tokens, limit_tokens_cur);

    for (const auto & state : states) {
-        SRV_INF("   - prompt %p: %7d tokens, checkpoints: %2zu, %9.3f MiB\n",
+        SRV_TRC("   - prompt %p: %7d tokens, checkpoints: %2zu, %9.3f MiB\n",
                (const void *)&state, state.n_tokens(), state.checkpoints.size(), state.size() / (1024.0 * 1024.0));
    }
 }
@@ -2,6 +2,7 @@
 #include "server-http.h"
 #include "server-models.h"
 #include "server-cors-proxy.h"
+#include "server-stream.h"
 #include "server-tools.h"

 #include "arg.h"
@@ -82,6 +83,10 @@ int llama_server(int argc, char ** argv) {

    common_init();

+    // start the stream session manager GC right after common init, before any HTTP route can
+    // touch it. lifecycle is symmetric, stop_gc() runs in clean_up() before backend free
+    g_stream_sessions.start_gc();
+
    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_SERVER)) {
        return 1;
    }
@@ -119,7 +124,7 @@ int llama_server(int argc, char ** argv) {
        }

        if (params.n_parallel < 0) {
-            SRV_INF("%s", "n_parallel is set to auto, using n_parallel = 4 and kv_unified = true\n");
+            SRV_TRC("%s", "n_parallel is set to auto, using n_parallel = 4 and kv_unified = true\n");

            params.n_parallel = 4;
            params.kv_unified = true;
@@ -239,6 +244,29 @@ int llama_server(int argc, char ** argv) {
    ctx_http.get ("/slots",                    ex_wrapper(routes.get_slots));
    ctx_http.post("/slots/:id_slot",           ex_wrapper(routes.post_slots));

+    // resumable streaming, the conversation_id is the session identity end to end. router and
+    // child wire different handlers under the same paths: a child binds the local g_stream_sessions
+    // backed factories, the router binds proxies that resolve the owning child through the
+    // conv_id -> model map
+    server_http_context::handler_t stream_get_h;
+    server_http_context::handler_t streams_lookup_h;
+    server_http_context::handler_t stream_delete_h;
+    if (is_router_server) {
+        stream_get_h     = models_routes->router_stream_get;
+        streams_lookup_h = models_routes->router_streams_lookup;
+        stream_delete_h  = models_routes->router_stream_delete;
+    } else {
+        stream_get_h     = make_stream_get_handler();
+        streams_lookup_h = make_streams_lookup_handler();
+        stream_delete_h  = make_stream_delete_handler();
+    }
+    ctx_http.get ("/v1/stream/:conv_id",       ex_wrapper(stream_get_h));
+    // POST /v1/streams/lookup with body {"conversation_ids": [...]}. you can only ask for ids
+    // you already own (the WebUI passes the convs visible in its sidebar). the server never
+    // lists ids it has not been asked about, so a random caller cannot enumerate live sessions
+    ctx_http.post("/v1/streams/lookup",        ex_wrapper(streams_lookup_h));
+    ctx_http.del ("/v1/stream/:conv_id",       ex_wrapper(stream_delete_h));
+
    // Google Cloud Platform (Vertex AI) compat
    ctx_http.register_gcp_compat();

@@ -310,10 +338,12 @@ int llama_server(int argc, char ** argv) {
    std::function<void()> clean_up;

    if (is_router_server) {
-        SRV_INF("%s", "starting router server, no model will be loaded in this process\n");
+        SRV_INF("%s", "starting server in router mode. models will be automatically loaded on-demand\n");

        clean_up = [&models_routes]() {
            SRV_INF("%s: cleaning up before exit...\n", __func__);
+            // stop the session GC first, it finalizes live sessions and wakes pending readers
+            g_stream_sessions.stop_gc();
            if (models_routes.has_value()) {
                models_routes->stopping.store(true); // maybe redundant, but just to be safe
                models_routes->models.unload_all();
@@ -340,6 +370,8 @@ int llama_server(int argc, char ** argv) {
        // setup clean up function, to be called before exit
        clean_up = [&ctx_http, &ctx_server]() {
            SRV_INF("%s: cleaning up before exit...\n", __func__);
+            // stop the session GC first, it finalizes live sessions and wakes pending readers
+            g_stream_sessions.stop_gc();
            ctx_http.stop();
            ctx_server.terminate();
            llama_backend_free();
@@ -359,9 +391,6 @@ int llama_server(int argc, char ** argv) {
            });
        }

-        // load the model
-        SRV_INF("%s", "loading model\n");
-
        if (!ctx_server.load_model(params)) {
            clean_up();
            if (ctx_http.thread.joinable()) {
@@ -397,8 +426,9 @@ int llama_server(int argc, char ** argv) {
    SetConsoleCtrlHandler(reinterpret_cast<PHANDLER_ROUTINE>(console_ctrl_handler), true);
 #endif

+    SRV_INF("listening on %s\n", ctx_http.listening_address.c_str());
+
    if (is_router_server) {
-        SRV_INF("router server is listening on %s\n", ctx_http.listening_address.c_str());
        SRV_WRN("%s", "NOTE: router mode is experimental\n");
        SRV_WRN("%s", "      it is not recommended to use this mode in untrusted environments\n");

@@ -414,8 +444,6 @@ int llama_server(int argc, char ** argv) {
        // when the HTTP server stops, clean up and exit
        clean_up();
    } else {
-        SRV_INF("server is listening on %s\n", ctx_http.listening_address.c_str());
-
        // optionally, notify router server that this instance is ready
        std::thread monitor_thread;
        if (child.is_child()) {
@@ -33,7 +33,7 @@

 	{#if !readonly && onRemove}
 		<div
-			class="absolute top-10 right-2 flex items-center justify-center opacity-0 transition-opacity group-hover:opacity-100"
+			class="absolute top-10 right-2 flex items-center justify-center opacity-0 transition-opacity group-focus-within:opacity-100 group-hover:opacity-100"
 		>
 			<ActionIcon icon={X} tooltip="Remove" stopPropagationOnClick onclick={() => onRemove?.()} />
 		</div>
@@ -56,7 +56,7 @@
 			<div class="relative flex h-6 items-center justify-between">
 				<div class="right-0 flex items-center gap-2 opacity-100 transition-opacity">
 					<div
-						class="pointer-events-auto inset-0 flex items-center gap-1 opacity-0 transition-all duration-150 group-hover:opacity-100"
+						class="pointer-events-auto inset-0 flex items-center gap-1 opacity-0 transition-all duration-150 group-focus-within:opacity-100 group-hover:opacity-100"
 					>
 						<ActionIcon icon={Edit} tooltip="Edit" onclick={editCtx.handleEdit} />
 						<ActionIcon icon={Trash2} tooltip="Delete" onclick={onDelete} />
@@ -5,6 +5,7 @@
 		ChatMessages,
 		ChatScreenDragOverlay,
 		ChatScreenProcessingInfo,
+		ChatScreenStreamResumeStatus,
 		ServerLoadingSplash,
 		ChatScreenServerError
 	} from '$lib/components/app';
@@ -281,6 +282,10 @@

 			<ChatScreenServerError />

+			{#if page.params.id}
+				<ChatScreenStreamResumeStatus />
+			{/if}
+
 			<div class="pointer-events-none flex flex-col gap-6 items-center w-full">
 				{#if (isMobile.current ? mobileScrollDownHint || isMobileUserScrolledUp : autoScroll.userScrolledUp) && page.url.hash.includes(ROUTES.CHAT) && page.params.id}
 					<ChatScreenActionScrollDown
@@ -0,0 +1,18 @@
+<script lang="ts">
+	import { chatStore } from '$lib/stores/chat.svelte';
+	import { StreamConnectionState } from '$lib/enums';
+	import { Loader2 } from '@lucide/svelte';
+
+	// true only while the chat store reports the stream connection as resuming
+	let isResuming = $derived(
+		chatStore.streamConnectionState === StreamConnectionState.RESUMING
+	);
+</script>
+
+<!-- reconnect banner, renders nothing unless the stream is resuming -->
+{#if isResuming}
+	<div
+		class="pointer-events-auto mx-auto mt-2 mb-2 flex max-w-[48rem] items-center gap-2 rounded-md border border-blue-400/40 bg-blue-50/60 px-3 py-1.5 text-sm text-blue-700 dark:bg-blue-950/40 dark:text-blue-200"
+		role="status"
+		aria-live="polite"
+	>
+		<Loader2 class="h-3.5 w-3.5 animate-spin" />
+		<span>Reconnecting to the stream...</span>
+	</div>
+{/if}
@@ -683,3 +683,11 @@ export { default as ChatScreenProcessingInfo } from './ChatScreen/ChatScreenProc
 * Rendered inside ChatScreen when `serverError` store has a value.
 */
 export { default as ChatScreenServerError } from './ChatScreen/ChatScreenServerError.svelte';
+
+/**
+ * Stream resume status indicator. Shows a small "Reconnecting to the stream..."
+ * banner with a spinner while `chatStore.streamConnectionState` is `resuming`,
+ * i.e. after a dropped connection is reattaching to the live SSE replay buffer.
+ * Renders nothing otherwise. Shown inside ChatScreen only on an active conversation route.
+ */
+export { default as ChatScreenStreamResumeStatus } from './ChatScreen/ChatScreenStreamResumeStatus.svelte';
@@ -39,7 +39,6 @@
 		depth = 0
 	}: Props = $props();

-	let renderActionsDropdown = $state(false);
 	let dropdownOpen = $state(false);

 	let isLoading = $derived(getAllLoadingChats().includes(conversation.id));
@@ -71,26 +70,10 @@
 		}
 	}

-	function handleMouseLeave() {
-		if (!dropdownOpen) {
-			renderActionsDropdown = false;
-		}
-	}
-
-	function handleMouseOver() {
-		renderActionsDropdown = true;
-	}
-
 	function handleSelect() {
 		onSelect?.(conversation.id);
 	}

-	$effect(() => {
-		if (!dropdownOpen) {
-			renderActionsDropdown = false;
-		}
-	});
-
 	onMount(() => {
 		document.addEventListener('edit-active-conversation', handleGlobalEditEvent as EventListener);

@@ -103,23 +86,19 @@
 	});
 </script>

-<!-- svelte-ignore a11y_mouse_events_have_key_events -->
-<button
-	class="group flex min-h-9 w-full cursor-pointer items-center justify-between space-x-3 rounded-lg py-1.5 text-left transition-colors hover:bg-foreground/10 {isActive
+<div
+	class="conversation-item group relative flex min-h-9 w-full items-center justify-between space-x-3 rounded-lg py-1.5 transition-colors hover:bg-foreground/10 {isActive
 		? 'bg-foreground/5 text-accent-foreground'
 		: ''} px-3"
-	onclick={handleSelect}
-	onmouseover={handleMouseOver}
-	onmouseleave={handleMouseLeave}
-	onfocusin={handleMouseOver}
-	onfocusout={(e) => {
-		if (!e.currentTarget.contains(e.relatedTarget as Node | null)) {
-			handleMouseLeave();
-		}
-	}}
 >
+	<button
+		class="absolute inset-0 z-0 cursor-pointer rounded-lg focus:outline-none focus-visible:ring-2 focus-visible:ring-ring"
+		onclick={handleSelect}
+		aria-label={conversation.name}
+	>
+	</button>
 	<div
-		class="flex min-w-0 flex-1 items-center gap-2"
+		class="pointer-events-none relative z-10 flex min-w-0 flex-1 items-center gap-2"
 		style:padding-left="{depth * FORK_TREE_DEPTH_PADDING}px"
 	>
 		{#if depth > 0}
@@ -130,7 +109,7 @@
 						<a
 							{...props}
 							href={RouterService.chat(conversation.forkedFromConversationId)}
-							class="flex shrink-0 items-center text-muted-foreground transition-colors hover:text-foreground"
+							class="pointer-events-auto flex shrink-0 items-center text-muted-foreground transition-colors hover:text-foreground"
 						>
 							<GitBranch class="h-3.5 w-3.5" />
 						</a>
@@ -146,18 +125,15 @@
 		{#if isLoading}
 			<Tooltip.Root>
 				<Tooltip.Trigger>
-					<div
-						class="stop-button flex h-4 w-4 shrink-0 cursor-pointer items-center justify-center rounded text-muted-foreground transition-colors hover:text-foreground"
+					<button
+						class="stop-button pointer-events-auto flex h-4 w-4 shrink-0 cursor-pointer items-center justify-center rounded text-muted-foreground transition-colors hover:text-foreground"
 						onclick={handleStop}
-						onkeydown={(e) => e.key === 'Enter' && handleStop(e)}
-						role="button"
-						tabindex="0"
 						aria-label="Stop generation"
 					>
 						<Loader2 class="loading-icon h-3.5 w-3.5 animate-spin" />

 						<Square class="stop-icon hidden h-3 w-3 fill-current text-destructive" />
-					</div>
+					</button>
 				</Tooltip.Trigger>

 				<Tooltip.Content>
@@ -169,52 +145,50 @@
 		<TruncatedText text={conversation.name} class="text-sm font-medium" showTooltip={false} />
 	</div>

-	{#if renderActionsDropdown}
-		<div class="actions flex items-center">
-			<DropdownMenuActions
-				triggerIcon={MoreHorizontal}
-				triggerTooltip="More actions"
-				bind:open={dropdownOpen}
-				actions={[
-					{
-						icon: conversation.pinned ? PinOff : Pin,
-						label: conversation.pinned ? 'Unpin' : 'Pin',
-						onclick: (e: Event) => {
-							e.stopPropagation();
-							handleTogglePin();
-						}
-					},
-					{
-						icon: Pencil,
-						label: 'Edit',
-						onclick: handleEdit,
-						shortcut: ['shift', 'cmd', 'e']
-					},
-					{
-						icon: Download,
-						label: 'Export',
-						onclick: (e: Event) => {
-							e.stopPropagation();
-							conversationsStore.downloadConversation(conversation.id);
-						},
-						shortcut: ['shift', 'cmd', 's']
-					},
-					{
-						icon: Trash2,
-						label: 'Delete',
-						onclick: handleDelete,
-						variant: 'destructive',
-						shortcut: ['shift', 'cmd', 'd'],
-						separator: true
+	<div class="actions pointer-events-auto relative z-20 flex items-center">
+		<DropdownMenuActions
+			triggerIcon={MoreHorizontal}
+			triggerTooltip="More actions"
+			bind:open={dropdownOpen}
+			actions={[
+				{
+					icon: conversation.pinned ? PinOff : Pin,
+					label: conversation.pinned ? 'Unpin' : 'Pin',
+					onclick: (e: Event) => {
+						e.stopPropagation();
+						handleTogglePin();
 					}
-				]}
-			/>
-		</div>
-	{/if}
-</button>
+				},
+				{
+					icon: Pencil,
+					label: 'Edit',
+					onclick: handleEdit,
+					shortcut: ['shift', 'cmd', 'e']
+				},
+				{
+					icon: Download,
+					label: 'Export',
+					onclick: (e: Event) => {
+						e.stopPropagation();
+						conversationsStore.downloadConversation(conversation.id);
+					},
+					shortcut: ['shift', 'cmd', 's']
+				},
+				{
+					icon: Trash2,
+					label: 'Delete',
+					onclick: handleDelete,
+					variant: 'destructive',
+					shortcut: ['shift', 'cmd', 'd'],
+					separator: true
+				}
+			]}
+		/>
+	</div>
+</div>

 <style>
-	button {
+	.conversation-item {
 		:global([data-slot='dropdown-menu-trigger']:not([data-state='open'])) {
 			opacity: 0;
 		}
@@ -239,7 +213,8 @@
 			}
 		}

-		&:is(:hover) .stop-button {
+		&:is(:hover) .stop-button,
+		&:focus-within .stop-button {
 			:global(.stop-icon) {
 				display: block;
 			}
@@ -21,5 +21,11 @@ export const API_TOOLS = {
 	EXECUTE: '/tools'
 };

+// resumable stream routes; for BASE the conv::model identity is appended as a URL-encoded path segment
+export const API_STREAM = {
+	BASE: './v1/stream',
+	LOOKUP: './v1/streams/lookup'
+};
+
 /** CORS proxy endpoint path */
 export const CORS_PROXY_ENDPOINT = '/cors-proxy';
@@ -46,6 +46,7 @@ export * from './routes';
 export * from './sandbox';
 export * from './settings-keys';
 export * from './settings-registry';
+export * from './stream';
 export * from './supported-file-types';
 export * from './table-html-restorer';
 export * from './title-generation';
@@ -26,6 +26,9 @@ export const THINKING_ENABLED_DEFAULT_LOCALSTORAGE_KEY = `${STORAGE_APP_NAME}.th
 export const REASONING_EFFORT_DEFAULT_LOCALSTORAGE_KEY = `${STORAGE_APP_NAME}.reasoningEffortDefault`;
 export const USER_OVERRIDES_LOCALSTORAGE_KEY = `${STORAGE_APP_NAME}.userOverrides`;

+/** Key prefix for per-conversation resumable stream state, conversationId is appended */
+export const STREAM_RESUME_LOCALSTORAGE_KEY_PREFIX = `${STORAGE_APP_NAME}.streamResume.`;
+
 // Deprecated old key names (kept for backward compat while users migrate)
 /** @deprecated Use {@link ALWAYS_ALLOWED_TOOLS_LOCALSTORAGE_KEY} instead */
 export const DEPRECATED_ALWAYS_ALLOWED_TOOLS_LOCALSTORAGE_KEY = `${STORAGE_APP_NAME_DEPRECATED}.alwaysAllowedTools`;
@@ -0,0 +1,3 @@
+// minimum byte silence, checked when the tab becomes visible again, before we kick a reader
+// whose socket likely died while hidden. rides out brief background pauses without thrashing live streams
+export const STREAM_VISIBILITY_KICK_MS = 1000;
@@ -5,6 +5,15 @@ export enum ChatMessageStatsView {
 	SUMMARY = 'summary'
 }

+/**
+ * Connection state of a streamed completion, drives the resume status indicator.
+ */
+export enum StreamConnectionState {
+	/** Bytes are flowing: reported when a stream starts and again once a resumed reader yields data. */
+	STREAMING = 'streaming',
+	/** The reader ended before the done marker and a resume request is being attempted. */
+	RESUMING = 'resuming',
+	/** The resume request failed or the resumed stream produced no new bytes; the stream is abandoned. */
+	LOST = 'lost'
+}
+
 /**
 * Reasoning format options for API requests.
 */
@@ -10,6 +10,7 @@ export { AgenticSectionType, ContinueIntentKind, ToolCallType } from './agentic.

 export {
 	ChatMessageStatsView,
+	StreamConnectionState,
 	ContentPartType,
 	ConversationSelectionMode,
 	ErrorDialogType,
@@ -1,6 +1,7 @@
-import { getJsonHeaders } from '$lib/utils/api-headers';
+import { getAuthHeaders, getJsonHeaders } from '$lib/utils/api-headers';
 import { formatAttachmentText } from '$lib/utils/formatters';
 import { isAbortError } from '$lib/utils/abort';
+import { streamIdentity } from '$lib/utils/stream-identity';
 import {
 	ATTACHMENT_LABEL_PDF_FILE,
 	ATTACHMENT_LABEL_MCP_PROMPT,
@@ -13,7 +14,10 @@ import {
 	CONTROL_ACTION,
 	SSE_LINE_SEPARATOR,
 	SSE_DATA_PREFIX,
-	SSE_DONE_MARKER
+	SSE_DONE_MARKER,
+	STREAM_VISIBILITY_KICK_MS,
+	STREAM_RESUME_LOCALSTORAGE_KEY_PREFIX,
+	API_STREAM
 } from '$lib/constants';
 import {
 	AttachmentType,
@@ -21,12 +25,14 @@ import {
 	FileTypeAudio,
 	MessageRole,
 	MimeTypeAudio,
-	ReasoningFormat
+	ReasoningFormat,
+	StreamConnectionState
 } from '$lib/enums';
 import type {
 	ApiChatMessageContentPart,
 	ApiChatMessageData,
-	ApiChatCompletionToolCall
+	ApiChatCompletionToolCall,
+	ApiStreamSession
 } from '$lib/types/api';
 import type {
 	AudioInputFormat,
@@ -54,6 +60,19 @@ function getAudioInputFormat(mimeType: string): AudioInputFormat {
 	return FileTypeAudio.MP3;
 }

+// shape of the per-conversation resume record serialized to localStorage
+interface ResumableStreamState {
+	// absolute offset into the server side stream buffer, sent back as the `from` query
+	// parameter when the stream is resumed
+	bytesReceived: number;
+	// Date.now() timestamp taken when the record was last saved
+	updatedAt: number;
+
+	// model frozen at POST time, lets a reload rebuild the exact conv::model identity the
+	// server keyed the session under. null when the POST carried no explicit model
+	model?: string | null;
+}
+
+/** Build the localStorage key under which one conversation's resume state is stored. */
+function streamStorageKey(conversationId: string): string {
+	return `${STREAM_RESUME_LOCALSTORAGE_KEY_PREFIX}${conversationId}`;
+}
+
 export class ChatService {
 	/**
 	 *
@@ -128,6 +147,7 @@ export class ChatService {
 			onChunk,
 			onComplete,
 			onError,
+			onConnectionState,
 			onReasoningChunk,
 			onToolCallChunk,
 			onModel,
@@ -312,9 +332,16 @@ export class ChatService {
 		}

 		try {
+			const headers: Record<string, string> = { ...getJsonHeaders() };
+			// tag streaming requests with the conversation id, this single header is the opt in for the
+			// server side replay buffer and powers discoverActiveStream on tab reopen. with an explicit
+			// model the ::model suffix keeps the per model session distinct
+			if (stream && conversationId) {
+				headers['X-Conversation-Id'] = streamIdentity(conversationId, options.model);
+			}
 			const response = await fetch(API_CHAT.COMPLETIONS, {
 				method: 'POST',
-				headers: getJsonHeaders(),
+				headers,
 				body: JSON.stringify(requestBody),
 				signal
 			});
@@ -341,7 +368,9 @@ export class ChatService {
 					onCompletionId,
 					onTimings,
 					conversationId,
-					signal
+					signal,
+					onConnectionState,
+					options.model
 				);

 				return;
@@ -473,6 +502,116 @@ export class ChatService {
 	 * @param excludeReasoning - Whether to strip reasoning content (should match excludeReasoningFromContext setting)
 	 * @param signal - Optional AbortSignal to cancel the pre-encode request
 	 */
+	/**
+	 * Send a DELETE for the stream session keyed by this conversation and model.
+	 * Best effort: a missing conversation id is a no-op and a failed request is only logged,
+	 * never thrown. The response status is not inspected.
+	 *
+	 * @param conversationId - conversation whose stream session should be cancelled
+	 * @param model - optional model used to build the conv::model identity
+	 */
+	static async cancelServerStream(conversationId: string, model?: string | null): Promise<void> {
+		if (!conversationId) {
+			return;
+		}
+
+		try {
+			const identity = encodeURIComponent(streamIdentity(conversationId, model));
+			await fetch(`${API_STREAM.BASE}/${identity}`, {
+				method: 'DELETE',
+				headers: getAuthHeaders()
+			});
+		} catch (err) {
+			console.warn('cancelServerStream failed:', err);
+		}
+	}
+
+	/**
+	 * Choose which session to attach to from the candidates listed for a conversation.
+	 *
+	 * Only sessions that are still running (`is_done` falsy) are considered. Finalized sessions are
+	 * skipped because their final content was already written to the DB by the original onComplete
+	 * handler, so replaying their buffer may not match what the DB holds. When several sessions
+	 * are still running, the one with the greatest `started_at` wins; on a tie the earlier entry
+	 * in the list is kept.
+	 *
+	 * NOTE(review): the previous comment also warned that a continue session's buffer holds only
+	 * the appended deltas and must not be replayed as a fresh generation. No filter for continue
+	 * sessions is visible here, so confirm where that case is handled.
+	 *
+	 * @param sessions - candidate sessions, may be null, undefined or empty
+	 * @returns the most recently started running session, or null when there is none
+	 */
+	static selectActiveStream(
+		sessions: ApiStreamSession[] | null | undefined
+	): ApiStreamSession | null {
+		const candidates = Array.isArray(sessions) ? sessions.filter((session) => !session.is_done) : [];
+
+		let latest: ApiStreamSession | null = null;
+		for (const candidate of candidates) {
+			if (latest === null || candidate.started_at > latest.started_at) {
+				latest = candidate;
+			}
+		}
+		return latest;
+	}
+
+	/**
+	 * Store the current byte offset and the frozen model for a conversation so that a later
+	 * resume can request the replay from that offset under the same conv::model identity.
+	 * A missing conversation id is a no-op; storage failures are swallowed on purpose.
+	 *
+	 * @param conversationId - conversation the state belongs to
+	 * @param bytesReceived - offset to persist
+	 * @param model - model to persist, stored as null when omitted
+	 */
+	static saveStreamState(
+		conversationId: string,
+		bytesReceived: number,
+		model?: string | null
+	): void {
+		if (!conversationId) {
+			return;
+		}
+
+		try {
+			const snapshot: ResumableStreamState = {
+				bytesReceived,
+				updatedAt: Date.now(),
+				model: model ?? null
+			};
+			const serialized = JSON.stringify(snapshot);
+			localStorage.setItem(streamStorageKey(conversationId), serialized);
+		} catch {
+			// localStorage may be full or disabled, silently ignore
+		}
+	}
+
+	/**
+	 * Load the persisted resume state of a conversation.
+	 *
+	 * @param conversationId - conversation to look up
+	 * @returns the stored state, or null when the id is empty, nothing is stored, storage or
+	 * JSON parsing throws, or the entry has no numeric `bytesReceived`
+	 */
+	static getStreamState(conversationId: string): ResumableStreamState | null {
+		if (!conversationId) {
+			return null;
+		}
+
+		try {
+			const serialized = localStorage.getItem(streamStorageKey(conversationId));
+			const stored = serialized ? (JSON.parse(serialized) as ResumableStreamState | null) : null;
+			// only the offset is validated, the other fields are taken as stored
+			return stored && typeof stored.bytesReceived === 'number' ? stored : null;
+		} catch {
+			return null;
+		}
+	}
+
+	/**
+	 * Remove the persisted resume state of a conversation. No-op for an empty id; storage
+	 * errors are ignored.
+	 */
+	static clearStreamState(conversationId: string): void {
+		if (conversationId) {
+			try {
+				localStorage.removeItem(streamStorageKey(conversationId));
+			} catch {
+				// nothing to do
+			}
+		}
+	}
+
+	/**
+	 * Work out the stream identity to use when resuming.
+	 *
+	 * A model recorded in the persisted state takes precedence, and that includes a stored null:
+	 * null means the POST carried no explicit model, so the identity is built without one. The
+	 * caller supplied model is used only when no state exists or its `model` field is undefined.
+	 *
+	 * @param conversationId - conversation being resumed
+	 * @param state - persisted resume state, or null when none was found
+	 * @param fallbackModel - model to use when nothing was persisted
+	 * @returns the identity produced by streamIdentity for the chosen model
+	 */
+	static resumeStreamIdentity(
+		conversationId: string,
+		state: ResumableStreamState | null,
+		fallbackModel: string | null
+	): string {
+		let chosenModel: string | null = fallbackModel;
+		if (state && state.model !== undefined) {
+			chosenModel = state.model;
+		}
+		return streamIdentity(conversationId, chosenModel);
+	}
+
+	/**
+	 * Reconnect to an interrupted stream for this conversation.
+	 *
+	 * The offset is read from the state persisted for the conversation (0 when none is stored) and
+	 * sent as the `from` query parameter. The fetch Response is returned as is so the existing SSE
+	 * parser can drain it like a fresh stream; status handling is left to the caller. The server
+	 * returns 200 on success, 404 if no session exists for the conv_id, and 400 if the offset is
+	 * below the dropped prefix.
+	 *
+	 * NOTE(review): the offset depends on best effort localStorage persistence. If the save was
+	 * swallowed (storage full or disabled) this asks for from=0; confirm callers tolerate a
+	 * replay from the start.
+	 *
+	 * @param conversationId - conversation whose stream should be resumed
+	 * @param signal - optional AbortSignal forwarded to fetch
+	 * @param model - model used to build the conv::model identity
+	 * @returns the fetch Response, or null when no conversation id was given
+	 */
+	static async resumeStream(
+		conversationId: string,
+		signal?: AbortSignal,
+		model?: string | null
+	): Promise<Response | null> {
+		if (!conversationId) {
+			return null;
+		}
+
+		const offset = ChatService.getStreamState(conversationId)?.bytesReceived ?? 0;
+		const identity = encodeURIComponent(streamIdentity(conversationId, model));
+		const response = await fetch(`${API_STREAM.BASE}/${identity}?from=${offset}`, {
+			method: 'GET',
+			signal,
+			headers: getAuthHeaders()
+		});
+		return response;
+	}
+
 	static async preEncode(
 		messages: ApiChatMessageData[] | (DatabaseMessage & { extra?: DatabaseMessageExtra[] })[],
 		model?: string | null,
@@ -557,7 +696,7 @@ export class ChatService {
 	 * @returns {Promise<void>} Promise that resolves when streaming is complete
 	 * @throws {Error} if the stream cannot be read or parsed
 	 */
-	private static async handleStreamResponse(
+	static async handleStreamResponse(
 		response: Response,
 		onChunk?: (chunk: string) => void,
 		onComplete?: (
@@ -573,15 +712,34 @@ export class ChatService {
 		onCompletionId?: (id: string) => void,
 		onTimings?: (timings?: ChatMessageTimings, promptProgress?: ChatMessagePromptProgress) => void,
 		conversationId?: string,
-		abortSignal?: AbortSignal
+		abortSignal?: AbortSignal,
+		onConnectionState?: (state: StreamConnectionState) => void,
+		streamModel?: string | null
 	): Promise<void> {
-		const reader = response.body?.getReader();
+		let reader = response.body?.getReader();

 		if (!reader) {
 			throw new Error('No response body');
 		}

-		const decoder = new TextDecoder();
+		// bytesParsed is the absolute server side buffer offset of the next byte to parse
+		// segmentStartOffset is the absolute offset where the current reader started, reset on resume
+		// segmentBytesRead is wire bytes read by the current reader
+		let bytesParsed = 0;
+		let segmentStartOffset = 0;
+		let segmentBytesRead = 0;
+		let lastByteAt = Date.now();
+		// each resume must produce at least one byte to be retried again
+		// if a resume returns 200 but yields nothing, we abandon
+		// since the session has a bounded size, the total number of retries is bounded by construction
+		let madeProgress = true;
+		const encoder = new TextEncoder();
+		if (conversationId) {
+			ChatService.saveStreamState(conversationId, 0, streamModel);
+		}
+		onConnectionState?.(StreamConnectionState.STREAMING);
+
+		let decoder = new TextDecoder();
 		let aggregatedContent = '';
 		let fullReasoningContent = '';
 		let aggregatedToolCalls: ApiChatCompletionToolCall[] = [];
@@ -633,84 +791,180 @@ export class ChatService {
 			}
 		};

+		const onVisibilityChange = () => {
+			if (typeof document === 'undefined') return;
+			if (document.visibilityState !== 'visible') return;
+			if (streamFinished) return;
+			if (!conversationId) return;
+			// the bytes have been quiet for too long, the OS likely killed the socket
+			// kicking the reader unblocks reader.read with done=true so the outer loop can resume
+			if (Date.now() - lastByteAt > STREAM_VISIBILITY_KICK_MS) {
+				reader!.cancel().catch(() => {});
+			}
+		};
+		if (typeof document !== 'undefined') {
+			document.addEventListener('visibilitychange', onVisibilityChange);
+		}
+
 		try {
 			let chunk = '';
+			// outer loop drives the resume cycle, swaps reader on premature end of stream
 			while (true) {
-				if (abortSignal?.aborted) break;
-
-				const { done, value } = await reader.read();
-				if (done) break;
-
-				if (abortSignal?.aborted) break;
-
-				chunk += decoder.decode(value, { stream: true });
-				const lines = chunk.split(SSE_LINE_SEPARATOR);
-				chunk = lines.pop() || '';
-
-				for (const line of lines) {
+				while (true) {
 					if (abortSignal?.aborted) break;

-					if (line.startsWith(SSE_DATA_PREFIX)) {
-						const data = line.slice(SSE_DATA_PREFIX.length).trim();
-						if (data === SSE_DONE_MARKER) {
-							streamFinished = true;
-
-							continue;
+					let done: boolean;
+					let value: Uint8Array | undefined;
+					try {
+						const r = await reader.read();
+						done = r.done;
+						value = r.value;
+					} catch (readErr) {
+						// reader.read() rejects with TypeError when the underlying connection drops
+						// instead of just resolving with done=true. treat it like done so the outer
+						// loop swaps reader via the resume path
+						if (isAbortError(readErr)) {
+							throw readErr;
 						}
+						console.warn('reader.read() rejected, treating as premature end:', readErr);
+						done = true;
+						value = undefined;
+					}
+					if (done) break;

-						try {
-							const parsed: ApiChatCompletionStreamChunk = JSON.parse(data);
-							const choice = parsed.choices?.[0];
-							const content = choice?.delta?.content;
-							const reasoningContent = choice?.delta?.reasoning_content;
-							const toolCalls = choice?.delta?.tool_calls;
-							const timings = parsed.timings;
-							const promptProgress = parsed.prompt_progress;
+					if (abortSignal?.aborted) break;

-							const chunkModel = ChatService.extractModelName(parsed);
-							if (chunkModel && !modelEmitted) {
-								modelEmitted = true;
-								onModel?.(chunkModel);
-							}
-
-							if (parsed.id && !idEmitted) {
-								idEmitted = true;
-								onCompletionId?.(parsed.id);
-							}
-
-							if (promptProgress) {
-								ChatService.notifyTimings(undefined, promptProgress, onTimings);
-							}
-
-							if (timings) {
-								ChatService.notifyTimings(timings, promptProgress, onTimings);
-								lastTimings = timings;
-							}
-
-							if (content) {
-								finalizeOpenToolCallBatch();
-								aggregatedContent += content;
-								if (!abortSignal?.aborted) {
-									onChunk?.(content);
-								}
-							}
-
-							if (reasoningContent) {
-								finalizeOpenToolCallBatch();
-								fullReasoningContent += reasoningContent;
-								if (!abortSignal?.aborted) {
-									onReasoningChunk?.(reasoningContent);
-								}
-							}
-
-							processToolCallDelta(toolCalls);
-						} catch (e) {
-							console.error('Error parsing JSON chunk:', e);
+					if (value && value.byteLength > 0) {
+						segmentBytesRead += value.byteLength;
+						lastByteAt = Date.now();
+						if (!madeProgress) {
+							madeProgress = true;
+							onConnectionState?.(StreamConnectionState.STREAMING);
 						}
 					}
+
+					chunk += decoder.decode(value, { stream: true });
+					const lines = chunk.split(SSE_LINE_SEPARATOR);
+					chunk = lines.pop() || '';
+
+					// the persisted offset must point right after the last fully parsed line,
+					// the trailing `chunk` is partial bytes still waiting for a newline
+					if (conversationId) {
+						const tailBytes = encoder.encode(chunk).byteLength;
+						bytesParsed = segmentStartOffset + segmentBytesRead - tailBytes;
+						ChatService.saveStreamState(conversationId, bytesParsed, streamModel);
+					}
+
+					for (const line of lines) {
+						if (abortSignal?.aborted) break;
+
+						if (line.startsWith(SSE_DATA_PREFIX)) {
+							const data = line.slice(SSE_DATA_PREFIX.length).trim();
+							if (data === SSE_DONE_MARKER) {
+								streamFinished = true;
+
+								continue;
+							}
+
+							try {
+								const parsed: ApiChatCompletionStreamChunk = JSON.parse(data);
+								const choice = parsed.choices?.[0];
+								const content = choice?.delta?.content;
+								const reasoningContent = choice?.delta?.reasoning_content;
+								const toolCalls = choice?.delta?.tool_calls;
+								const timings = parsed.timings;
+								const promptProgress = parsed.prompt_progress;
+
+								const chunkModel = ChatService.extractModelName(parsed);
+								if (chunkModel && !modelEmitted) {
+									modelEmitted = true;
+									onModel?.(chunkModel);
+								}
+
+								if (parsed.id && !idEmitted) {
+									idEmitted = true;
+									onCompletionId?.(parsed.id);
+								}
+
+								if (promptProgress) {
+									ChatService.notifyTimings(undefined, promptProgress, onTimings);
+								}
+
+								if (timings) {
+									ChatService.notifyTimings(timings, promptProgress, onTimings);
+									lastTimings = timings;
+								}
+
+								if (content) {
+									finalizeOpenToolCallBatch();
+									aggregatedContent += content;
+									if (!abortSignal?.aborted) {
+										onChunk?.(content);
+									}
+								}
+
+								if (reasoningContent) {
+									finalizeOpenToolCallBatch();
+									fullReasoningContent += reasoningContent;
+									if (!abortSignal?.aborted) {
+										onReasoningChunk?.(reasoningContent);
+									}
+								}
+
+								processToolCallDelta(toolCalls);
+							} catch (e) {
+								console.error('Error parsing JSON chunk:', e);
+							}
+						}
+					}
+
+					if (abortSignal?.aborted) break;
+					if (streamFinished) break;
 				}

+				// inner reader done, decide whether to try a resume
 				if (abortSignal?.aborted) break;
+				if (streamFinished) break;
+				if (!conversationId) break;
+
+				if (!madeProgress) {
+					onConnectionState?.(StreamConnectionState.LOST);
+					onError?.(new Error('Stream resume produced no new bytes, giving up'));
+					break;
+				}
+
+				onConnectionState?.(StreamConnectionState.RESUMING);
+				madeProgress = false;
+
+				// the server resends starting at bytesParsed, discard any partial line we held, it
+				// will be retransmitted from a clean line boundary. reuse the frozen model, not the
+				// live dropdown
+				const resumeResp = await ChatService.resumeStream(
+					conversationId,
+					abortSignal,
+					streamModel
+				).catch(() => null);
+				// an abort landing during the resume request is intentional, not a lost connection
+				if (abortSignal?.aborted) break;
+				if (!resumeResp || resumeResp.status !== 200) {
+					onConnectionState?.(StreamConnectionState.LOST);
+					onError?.(new Error('Stream connection lost and could not be resumed'));
+					break;
+				}
+				const newReader = resumeResp.body?.getReader();
+				// a 200 without a readable body cannot be drained. report it like the other resume
+				// failures, otherwise the state stays on RESUMING and the caller never hears of it
+				if (!newReader) {
+					onConnectionState?.(StreamConnectionState.LOST);
+					onError?.(new Error('Stream connection lost and could not be resumed'));
+					break;
+				}
+
+				try {
+					reader.releaseLock();
+				} catch {
+					/* ignore */
+				}
+				reader = newReader;
+				decoder = new TextDecoder();
+				chunk = '';
+				segmentStartOffset = bytesParsed;
+				segmentBytesRead = 0;
+				lastByteAt = Date.now();
 			}

 			if (abortSignal?.aborted) return;
@@ -718,6 +972,10 @@ export class ChatService {
 			if (streamFinished) {
 				finalizeOpenToolCallBatch();

+				if (conversationId) {
+					ChatService.clearStreamState(conversationId);
+				}
+
 				const finalToolCalls =
 					aggregatedToolCalls.length > 0 ? JSON.stringify(aggregatedToolCalls) : undefined;

@@ -735,7 +993,14 @@ export class ChatService {

 			throw err;
 		} finally {
-			reader.releaseLock();
+			if (typeof document !== 'undefined') {
+				document.removeEventListener('visibilitychange', onVisibilityChange);
+			}
+			try {
+				reader.releaseLock();
+			} catch {
+				/* ignore */
+			}
 		}
 	}

@@ -628,19 +628,20 @@ export class MCPService {
 		);

 		const runtimeErrorHandler = (error: Error) => {
-			// Ignore errors that are expected when the SDK's transport is closed,
-			// or when connecting to servers that don't support SSE (stateless-only
-			// endpoints returning 405). The SDK wraps the original AbortError in
-			// a new Error with the message "SSE stream disconnected: AbortError",
-			// and also produces "Cannot cancel a stream locked by a reader".
-			// DOMException is thrown by the browser when aborting fetch requests.
-			const msg = error.message || String(error);
+			// the SDK reports any post initialize error here, including the abort we trigger
+			// ourselves on the next health check cycle, on tab unload, or on server teardown.
+			// these are lifecycle aborts, not actionable errors, so we keep them out of the red console.
+			// the SDK wraps the original AbortError in a generic Error like
+			//   "SSE stream disconnected: AbortError: The operation was aborted."
+			// which isAbortError cannot recognize by name alone, so we also pattern match on the message
+			if (isAbortError(error)) {
+				return;
+			}
+			const msg = error?.message ?? '';
 			if (
-				error.name === 'AbortError' ||
-				error instanceof DOMException ||
-				msg.includes('SSE stream disconnected') ||
-				msg.includes('stream locked by a reader') ||
-				msg.includes('The operation was aborted')
+				/SSE stream disconnected:.*AbortError/i.test(msg) ||
+				/AbortError: .*aborted/i.test(msg) ||
+				/stream locked by a reader/i.test(msg)
 			) {
 				return;
 			}
--- a/Show More
+++ b/Show More