cont : remove /api/tags

server : remove /api endpoints
2026-05-20 16:35:58 +02:00 · 2026-04-20 15:45:42 +03:00 · 2026-04-20 15:34:18 +03:00
1319 changed files with 72730 additions and 124847 deletions
--- a/.devops/cann.Dockerfile
+++ b/.devops/cann.Dockerfile
@@ -5,9 +5,6 @@
 # Define the CANN base image for easier version updates later
 ARG CHIP_TYPE=910b
 ARG CANN_BASE_IMAGE=quay.io/ascend/cann:8.5.0-${CHIP_TYPE}-openeuler24.03-py3.11
-ARG BUILD_DATE=N/A
-ARG APP_VERSION=N/A
-ARG APP_REVISION=N/A

 # ==============================================================================
 # BUILD STAGE
@@ -70,19 +67,6 @@ RUN mkdir -p /app/full && \
 # ==============================================================================
 FROM ${CANN_BASE_IMAGE} AS base

-ARG BUILD_DATE=N/A
-ARG APP_VERSION=N/A
-ARG APP_REVISION=N/A
-ARG IMAGE_URL=https://github.com/ggml-org/llama.cpp
-ARG IMAGE_SOURCE=https://github.com/ggml-org/llama.cpp
-LABEL org.opencontainers.image.created=$BUILD_DATE \
-      org.opencontainers.image.version=$APP_VERSION \
-      org.opencontainers.image.revision=$APP_REVISION \
-      org.opencontainers.image.title="llama.cpp" \
-      org.opencontainers.image.description="LLM inference in C/C++" \
-      org.opencontainers.image.url=$IMAGE_URL \
-      org.opencontainers.image.source=$IMAGE_SOURCE
-
 # -- Install runtime dependencies --
 RUN yum install -y libgomp curl && \
    yum clean all && \
--- a/.devops/cpu.Dockerfile
+++ b/.devops/cpu.Dockerfile
@@ -1,7 +1,4 @@
 ARG UBUNTU_VERSION=24.04
-ARG BUILD_DATE=N/A
-ARG APP_VERSION=N/A
-ARG APP_REVISION=N/A

 FROM ubuntu:$UBUNTU_VERSION AS build

@@ -38,19 +35,6 @@ RUN mkdir -p /app/full \
 ## Base image
 FROM ubuntu:$UBUNTU_VERSION AS base

-ARG BUILD_DATE=N/A
-ARG APP_VERSION=N/A
-ARG APP_REVISION=N/A
-ARG IMAGE_URL=https://github.com/ggml-org/llama.cpp
-ARG IMAGE_SOURCE=https://github.com/ggml-org/llama.cpp
-LABEL org.opencontainers.image.created=$BUILD_DATE \
-      org.opencontainers.image.version=$APP_VERSION \
-      org.opencontainers.image.revision=$APP_REVISION \
-      org.opencontainers.image.title="llama.cpp" \
-      org.opencontainers.image.description="LLM inference in C/C++" \
-      org.opencontainers.image.url=$IMAGE_URL \
-      org.opencontainers.image.source=$IMAGE_SOURCE
-
 RUN apt-get update \
    && apt-get install -y libgomp1 curl \
    && apt autoremove -y \
--- a/.devops/cuda.Dockerfile
+++ b/.devops/cuda.Dockerfile
@@ -6,10 +6,6 @@ ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VER

 ARG BASE_CUDA_RUN_CONTAINER=nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}

-ARG BUILD_DATE=N/A
-ARG APP_VERSION=N/A
-ARG APP_REVISION=N/A
-
 FROM ${BASE_CUDA_DEV_CONTAINER} AS build

 # CUDA architecture to build for (defaults to all supported archs)
@@ -44,19 +40,6 @@ RUN mkdir -p /app/full \
 ## Base image
 FROM ${BASE_CUDA_RUN_CONTAINER} AS base

-ARG BUILD_DATE=N/A
-ARG APP_VERSION=N/A
-ARG APP_REVISION=N/A
-ARG IMAGE_URL=https://github.com/ggml-org/llama.cpp
-ARG IMAGE_SOURCE=https://github.com/ggml-org/llama.cpp
-LABEL org.opencontainers.image.created=$BUILD_DATE \
-      org.opencontainers.image.version=$APP_VERSION \
-      org.opencontainers.image.revision=$APP_REVISION \
-      org.opencontainers.image.title="llama.cpp" \
-      org.opencontainers.image.description="LLM inference in C/C++" \
-      org.opencontainers.image.url=$IMAGE_URL \
-      org.opencontainers.image.source=$IMAGE_SOURCE
-
 RUN apt-get update \
    && apt-get install -y libgomp1 curl \
    && apt autoremove -y \
--- a/.devops/intel.Dockerfile
+++ b/.devops/intel.Dockerfile
@@ -1,22 +1,12 @@
-ARG ONEAPI_VERSION=2025.3.3-0-devel-ubuntu24.04
-ARG BUILD_DATE=N/A
-ARG APP_VERSION=N/A
-ARG APP_REVISION=N/A
+ARG ONEAPI_VERSION=2025.3.2-0-devel-ubuntu24.04

 ## Build Image

 FROM intel/deep-learning-essentials:$ONEAPI_VERSION AS build

 ARG GGML_SYCL_F16=OFF
-ARG LEVEL_ZERO_VERSION=1.28.2
-ARG LEVEL_ZERO_UBUNTU_VERSION=u24.04
 RUN apt-get update && \
-    apt-get install -y git libssl-dev wget ca-certificates && \
-    cd /tmp && \
-    wget -q "https://github.com/oneapi-src/level-zero/releases/download/v${LEVEL_ZERO_VERSION}/level-zero_${LEVEL_ZERO_VERSION}%2B${LEVEL_ZERO_UBUNTU_VERSION}_amd64.deb" -O level-zero.deb && \
-    wget -q "https://github.com/oneapi-src/level-zero/releases/download/v${LEVEL_ZERO_VERSION}/level-zero-devel_${LEVEL_ZERO_VERSION}%2B${LEVEL_ZERO_UBUNTU_VERSION}_amd64.deb" -O level-zero-devel.deb && \
-    apt-get -o Dpkg::Options::="--force-overwrite" install -y ./level-zero.deb ./level-zero-devel.deb && \
-    rm -f /tmp/level-zero.deb /tmp/level-zero-devel.deb
+    apt-get install -y git libssl-dev

 WORKDIR /app

@@ -43,24 +33,11 @@ RUN mkdir -p /app/full \

 FROM intel/deep-learning-essentials:$ONEAPI_VERSION AS base

-ARG BUILD_DATE=N/A
-ARG APP_VERSION=N/A
-ARG APP_REVISION=N/A
-ARG IMAGE_URL=https://github.com/ggml-org/llama.cpp
-ARG IMAGE_SOURCE=https://github.com/ggml-org/llama.cpp
-LABEL org.opencontainers.image.created=$BUILD_DATE \
-      org.opencontainers.image.version=$APP_VERSION \
-      org.opencontainers.image.revision=$APP_REVISION \
-      org.opencontainers.image.title="llama.cpp" \
-      org.opencontainers.image.description="LLM inference in C/C++" \
-      org.opencontainers.image.url=$IMAGE_URL \
-      org.opencontainers.image.source=$IMAGE_SOURCE
-
-ARG IGC_VERSION=v2.20.5
-ARG IGC_VERSION_FULL=2_2.20.5+19972
-ARG COMPUTE_RUNTIME_VERSION=25.40.35563.10
-ARG COMPUTE_RUNTIME_VERSION_FULL=25.40.35563.10-0
-ARG IGDGMM_VERSION=22.8.2
+ARG IGC_VERSION=v2.30.1
+ARG IGC_VERSION_FULL=2_2.30.1+20950
+ARG COMPUTE_RUNTIME_VERSION=26.09.37435.1
+ARG COMPUTE_RUNTIME_VERSION_FULL=26.09.37435.1-0
+ARG IGDGMM_VERSION=22.9.0
 RUN mkdir /tmp/neo/ && cd /tmp/neo/ \
  && wget https://github.com/intel/intel-graphics-compiler/releases/download/$IGC_VERSION/intel-igc-core-${IGC_VERSION_FULL}_amd64.deb \
  && wget https://github.com/intel/intel-graphics-compiler/releases/download/$IGC_VERSION/intel-igc-opencl-${IGC_VERSION_FULL}_amd64.deb \
@@ -132,3 +109,4 @@ WORKDIR /app
 HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]

 ENTRYPOINT [ "/app/llama-server" ]
+
--- a/.devops/llama-cli-cann.Dockerfile
+++ b/.devops/llama-cli-cann.Dockerfile
@@ -1,7 +1,4 @@
 ARG ASCEND_VERSION=8.5.0-910b-openeuler22.03-py3.10
-ARG BUILD_DATE=N/A
-ARG APP_VERSION=N/A
-ARG APP_REVISION=N/A

 FROM ascendai/cann:$ASCEND_VERSION AS build

@@ -31,20 +28,6 @@ RUN echo "Building with static libs" && \

 # TODO: use image with NNRT
 FROM ascendai/cann:$ASCEND_VERSION AS runtime
-
-ARG BUILD_DATE=N/A
-ARG APP_VERSION=N/A
-ARG APP_REVISION=N/A
-ARG IMAGE_URL=https://github.com/ggml-org/llama.cpp
-ARG IMAGE_SOURCE=https://github.com/ggml-org/llama.cpp
-LABEL org.opencontainers.image.created=$BUILD_DATE \
-      org.opencontainers.image.version=$APP_VERSION \
-      org.opencontainers.image.revision=$APP_REVISION \
-      org.opencontainers.image.title="llama.cpp" \
-      org.opencontainers.image.description="LLM inference in C/C++" \
-      org.opencontainers.image.url=$IMAGE_URL \
-      org.opencontainers.image.source=$IMAGE_SOURCE
-
 COPY --from=build /app/build/bin/llama-cli /app/build/bin/llama-completion /

 ENV LC_ALL=C.utf8
--- a/.devops/musa.Dockerfile
+++ b/.devops/musa.Dockerfile
@@ -6,10 +6,6 @@ ARG BASE_MUSA_DEV_CONTAINER=mthreads/musa:${MUSA_VERSION}-devel-ubuntu${UBUNTU_V

 ARG BASE_MUSA_RUN_CONTAINER=mthreads/musa:${MUSA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}-amd64

-ARG BUILD_DATE=N/A
-ARG APP_VERSION=N/A
-ARG APP_REVISION=N/A
-
 FROM ${BASE_MUSA_DEV_CONTAINER} AS build

 # MUSA architecture to build for (defaults to all supported archs)
@@ -49,19 +45,6 @@ RUN mkdir -p /app/full \
 ## Base image
 FROM ${BASE_MUSA_RUN_CONTAINER} AS base

-ARG BUILD_DATE=N/A
-ARG APP_VERSION=N/A
-ARG APP_REVISION=N/A
-ARG IMAGE_URL=https://github.com/ggml-org/llama.cpp
-ARG IMAGE_SOURCE=https://github.com/ggml-org/llama.cpp
-LABEL org.opencontainers.image.created=$BUILD_DATE \
-      org.opencontainers.image.version=$APP_VERSION \
-      org.opencontainers.image.revision=$APP_REVISION \
-      org.opencontainers.image.title="llama.cpp" \
-      org.opencontainers.image.description="LLM inference in C/C++" \
-      org.opencontainers.image.url=$IMAGE_URL \
-      org.opencontainers.image.source=$IMAGE_SOURCE
-
 RUN apt-get update \
    && apt-get install -y libgomp1 curl \
    && apt autoremove -y \
--- a/.devops/nix/package.nix
+++ b/.devops/nix/package.nix
@@ -103,7 +103,6 @@ let
    vulkan-headers
    vulkan-loader
    shaderc
-    spirv-headers
  ];
 in

@@ -147,6 +146,7 @@ effectiveStdenv.mkDerivation (finalAttrs: {
      ninja
      pkg-config
      git
+      spirv-headers
    ]
    ++ optionals useCuda [
      cudaPackages.cuda_nvcc
--- a/.devops/openvino.Dockerfile
+++ b/.devops/openvino.Dockerfile
@@ -2,26 +2,10 @@ ARG OPENVINO_VERSION_MAJOR=2026.0
 ARG OPENVINO_VERSION_FULL=2026.0.0.20965.c6d6a13a886
 ARG UBUNTU_VERSION=24.04

-# Intel GPU driver versions. https://github.com/intel/compute-runtime/releases
-ARG IGC_VERSION=v2.30.1
-ARG IGC_VERSION_FULL=2_2.30.1+20950
-ARG COMPUTE_RUNTIME_VERSION=26.09.37435.1
-ARG COMPUTE_RUNTIME_VERSION_FULL=26.09.37435.1-0
-ARG IGDGMM_VERSION=22.9.0
-
-# Intel NPU driver versions. https://github.com/intel/linux-npu-driver/releases
-ARG NPU_DRIVER_VERSION=v1.32.0
-ARG NPU_DRIVER_FULL=v1.32.0.20260402-23905121947
-ARG LIBZE1_VERSION=1.27.0-1~24.04~ppa2
-
-# Optional proxy build arguments
+# Optional proxy build arguments - empty by default
 ARG http_proxy=
 ARG https_proxy=

-ARG BUILD_DATE=N/A
-ARG APP_VERSION=N/A
-ARG APP_REVISION=N/A
-
 ## Build Image
 FROM ubuntu:${UBUNTU_VERSION} AS build

@@ -92,61 +76,15 @@ FROM ubuntu:${UBUNTU_VERSION} AS base
 # Pass proxy args to runtime stage
 ARG http_proxy
 ARG https_proxy
-ARG BUILD_DATE=N/A
-ARG APP_VERSION=N/A
-ARG APP_REVISION=N/A
-ARG IMAGE_URL=https://github.com/ggml-org/llama.cpp
-ARG IMAGE_SOURCE=https://github.com/ggml-org/llama.cpp
-LABEL org.opencontainers.image.created=$BUILD_DATE \
-      org.opencontainers.image.version=$APP_VERSION \
-      org.opencontainers.image.revision=$APP_REVISION \
-      org.opencontainers.image.title="llama.cpp" \
-      org.opencontainers.image.description="LLM inference in C/C++" \
-      org.opencontainers.image.url=$IMAGE_URL \
-      org.opencontainers.image.source=$IMAGE_SOURCE

 RUN apt-get update \
-    && apt-get install -y libgomp1 libtbb12 curl wget ocl-icd-libopencl1 \
+    && apt-get install -y libgomp1 libtbb12 curl \
    && apt autoremove -y \
    && apt clean -y \
    && rm -rf /tmp/* /var/tmp/* \
    && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
    && find /var/cache -type f -delete

-# Install GPU drivers
-ARG IGC_VERSION
-ARG IGC_VERSION_FULL
-ARG COMPUTE_RUNTIME_VERSION
-ARG COMPUTE_RUNTIME_VERSION_FULL
-ARG IGDGMM_VERSION
-RUN mkdir /tmp/neo/ && cd /tmp/neo/ \
-    && wget https://github.com/intel/intel-graphics-compiler/releases/download/${IGC_VERSION}/intel-igc-core-${IGC_VERSION_FULL}_amd64.deb \
-    && wget https://github.com/intel/intel-graphics-compiler/releases/download/${IGC_VERSION}/intel-igc-opencl-${IGC_VERSION_FULL}_amd64.deb \
-    && wget https://github.com/intel/compute-runtime/releases/download/${COMPUTE_RUNTIME_VERSION}/intel-ocloc-dbgsym_${COMPUTE_RUNTIME_VERSION_FULL}_amd64.ddeb \
-    && wget https://github.com/intel/compute-runtime/releases/download/${COMPUTE_RUNTIME_VERSION}/intel-ocloc_${COMPUTE_RUNTIME_VERSION_FULL}_amd64.deb \
-    && wget https://github.com/intel/compute-runtime/releases/download/${COMPUTE_RUNTIME_VERSION}/intel-opencl-icd-dbgsym_${COMPUTE_RUNTIME_VERSION_FULL}_amd64.ddeb \
-    && wget https://github.com/intel/compute-runtime/releases/download/${COMPUTE_RUNTIME_VERSION}/intel-opencl-icd_${COMPUTE_RUNTIME_VERSION_FULL}_amd64.deb \
-    && wget https://github.com/intel/compute-runtime/releases/download/${COMPUTE_RUNTIME_VERSION}/libigdgmm12_${IGDGMM_VERSION}_amd64.deb \
-    && wget https://github.com/intel/compute-runtime/releases/download/${COMPUTE_RUNTIME_VERSION}/libze-intel-gpu1-dbgsym_${COMPUTE_RUNTIME_VERSION_FULL}_amd64.ddeb \
-    && wget https://github.com/intel/compute-runtime/releases/download/${COMPUTE_RUNTIME_VERSION}/libze-intel-gpu1_${COMPUTE_RUNTIME_VERSION_FULL}_amd64.deb \
-    && dpkg --install *.deb \
-    && rm -rf /tmp/neo/
-
-# Install NPU drivers
-ARG NPU_DRIVER_VERSION
-ARG NPU_DRIVER_FULL
-ARG LIBZE1_VERSION
-RUN mkdir /tmp/npu/ && cd /tmp/npu/ \
-    && wget https://github.com/intel/linux-npu-driver/releases/download/${NPU_DRIVER_VERSION}/linux-npu-driver-${NPU_DRIVER_FULL}-ubuntu2404.tar.gz \
-    && tar -xf linux-npu-driver-${NPU_DRIVER_FULL}-ubuntu2404.tar.gz \
-    && dpkg --install *.deb \
-    && rm -rf /tmp/npu/
-
-RUN cd /tmp \
-    && wget https://snapshot.ppa.launchpadcontent.net/kobuk-team/intel-graphics/ubuntu/20260324T100000Z/pool/main/l/level-zero-loader/libze1_${LIBZE1_VERSION}_amd64.deb \
-    && dpkg --install libze1_${LIBZE1_VERSION}_amd64.deb \
-    && rm libze1_${LIBZE1_VERSION}_amd64.deb
-
 COPY --from=build /app/lib/ /app/

 ### Full (all binaries)
--- a/.devops/rocm.Dockerfile
+++ b/.devops/rocm.Dockerfile
@@ -7,10 +7,6 @@ ARG AMDGPU_VERSION=7.2.1
 # Target the ROCm build image
 ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete

-ARG BUILD_DATE=N/A
-ARG APP_VERSION=N/A
-ARG APP_REVISION=N/A
-
 ### Build image
 FROM ${BASE_ROCM_DEV_CONTAINER} AS build

@@ -61,19 +57,6 @@ RUN mkdir -p /app/full \
 ## Base image
 FROM ${BASE_ROCM_DEV_CONTAINER} AS base

-ARG BUILD_DATE=N/A
-ARG APP_VERSION=N/A
-ARG APP_REVISION=N/A
-ARG IMAGE_URL=https://github.com/ggml-org/llama.cpp
-ARG IMAGE_SOURCE=https://github.com/ggml-org/llama.cpp
-LABEL org.opencontainers.image.created=$BUILD_DATE \
-      org.opencontainers.image.version=$APP_VERSION \
-      org.opencontainers.image.revision=$APP_REVISION \
-      org.opencontainers.image.title="llama.cpp" \
-      org.opencontainers.image.description="LLM inference in C/C++" \
-      org.opencontainers.image.url=$IMAGE_URL \
-      org.opencontainers.image.source=$IMAGE_SOURCE
-
 RUN apt-get update \
    && apt-get install -y libgomp1 curl \
    && apt autoremove -y \
--- a/.devops/s390x.Dockerfile
+++ b/.devops/s390x.Dockerfile
@@ -1,8 +1,5 @@
 ARG GCC_VERSION=15.2.0
 ARG UBUNTU_VERSION=24.04
-ARG BUILD_DATE=N/A
-ARG APP_VERSION=N/A
-ARG APP_REVISION=N/A

 ### Build Llama.cpp stage
 FROM gcc:${GCC_VERSION} AS build
@@ -55,19 +52,6 @@ COPY --from=build /opt/llama.cpp/gguf-py /llama.cpp/gguf-py
 ### Base image
 FROM ubuntu:${UBUNTU_VERSION} AS base

-ARG BUILD_DATE=N/A
-ARG APP_VERSION=N/A
-ARG APP_REVISION=N/A
-ARG IMAGE_URL=https://github.com/ggml-org/llama.cpp
-ARG IMAGE_SOURCE=https://github.com/ggml-org/llama.cpp
-LABEL org.opencontainers.image.created=$BUILD_DATE \
-      org.opencontainers.image.version=$APP_VERSION \
-      org.opencontainers.image.revision=$APP_REVISION \
-      org.opencontainers.image.title="llama.cpp" \
-      org.opencontainers.image.description="LLM inference in C/C++" \
-      org.opencontainers.image.url=$IMAGE_URL \
-      org.opencontainers.image.source=$IMAGE_SOURCE
-
 RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
    --mount=type=cache,target=/var/lib/apt/lists,sharing=locked \
    apt update -y && \
--- a/.devops/vulkan.Dockerfile
+++ b/.devops/vulkan.Dockerfile
@@ -1,7 +1,4 @@
 ARG UBUNTU_VERSION=26.04
-ARG BUILD_DATE=N/A
-ARG APP_VERSION=N/A
-ARG APP_REVISION=N/A

 FROM ubuntu:$UBUNTU_VERSION AS build

@@ -34,19 +31,6 @@ RUN mkdir -p /app/full \
 ## Base image
 FROM ubuntu:$UBUNTU_VERSION AS base

-ARG BUILD_DATE=N/A
-ARG APP_VERSION=N/A
-ARG APP_REVISION=N/A
-ARG IMAGE_URL=https://github.com/ggml-org/llama.cpp
-ARG IMAGE_SOURCE=https://github.com/ggml-org/llama.cpp
-LABEL org.opencontainers.image.created=$BUILD_DATE \
-      org.opencontainers.image.version=$APP_VERSION \
-      org.opencontainers.image.revision=$APP_REVISION \
-      org.opencontainers.image.title="llama.cpp" \
-      org.opencontainers.image.description="LLM inference in C/C++" \
-      org.opencontainers.image.url=$IMAGE_URL \
-      org.opencontainers.image.source=$IMAGE_SOURCE
-
 RUN apt-get update \
    && apt-get install -y libgomp1 curl libvulkan1 mesa-vulkan-drivers \
    libglvnd0 libgl1 libglx0 libegl1 libgles2 \
--- a/.editorconfig
+++ b/.editorconfig
@@ -45,7 +45,15 @@ insert_final_newline = unset
 trim_trailing_whitespace = unset
 insert_final_newline = unset

-[tools/ui/**]
+[tools/server/webui/**]
+indent_style = unset
+indent_size = unset
+end_of_line = unset
+charset = unset
+trim_trailing_whitespace = unset
+insert_final_newline = unset
+
+[tools/server/public/**]
 indent_style = unset
 indent_size = unset
 end_of_line = unset
--- a/.gitattributes
+++ b/.gitattributes
@@ -0,0 +1,4 @@
+# Treat the generated single-file WebUI build as binary for diff purposes.
+# Git's pack-file delta compression still works (byte-level), but this prevents
+# git diff from printing the entire minified file on every change.
+tools/server/public/index.html -diff
--- a/.github/ISSUE_TEMPLATE/010-bug-compilation.yml
+++ b/.github/ISSUE_TEMPLATE/010-bug-compilation.yml
@@ -12,8 +12,6 @@ body:
        after recreating the CMake build directory and with `-DGGML_CCACHE=OFF`.
        If the compilation succeeds with ccache disabled you should be able to permanently fix the issue
        by clearing `~/.cache/ccache` (on Linux).
-
-        Please fill out this template yourself, copypasting language model outputs is [strictly prohibited](https://github.com/ggml-org/llama.cpp/blob/master/CONTRIBUTING.md#ai-usage-policy).
  - type: textarea
    id: commit
    attributes:
--- a/.github/ISSUE_TEMPLATE/011-bug-results.yml
+++ b/.github/ISSUE_TEMPLATE/011-bug-results.yml
@@ -1,5 +1,5 @@
 name: Bug (model use)
-description: Something goes wrong when running a model (crashes, garbled outputs, etc.).
+description: Something goes wrong when using a model (in general, not specific to a single llama.cpp module).
 title: "Eval bug: "
 labels: ["bug-unconfirmed", "model evaluation"]
 body:
@@ -12,8 +12,6 @@ body:
        If you encountered the issue while using an external UI (e.g. ollama),
        please reproduce your issue using one of the examples/binaries in this repository.
        The `llama-completion` binary can be used for simple and reproducible model inference.
-
-        Please fill out this template yourself, copypasting language model outputs is [strictly prohibited](https://github.com/ggml-org/llama.cpp/blob/master/CONTRIBUTING.md#ai-usage-policy).
  - type: textarea
    id: version
    attributes:
@@ -100,8 +98,8 @@ body:
      label: Relevant log output
      description: >
          Please copy and paste any relevant log output, including the command that you entered and any generated text.
-          For very long logs (thousands of lines), please upload them as files instead; the `--log-file` CLI argument can be used for this purpose.
-          On Linux you can alternatively redirect the console output of any command into a file by appending ` > llama.log 2>&1` to your command.
+          For very long logs (thousands of lines), preferably upload them as files instead.
+          On Linux you can redirect console output into a file by appending ` > llama.log 2>&1` to your command.
      value: |
        <details>
        <summary>Logs</summary>
--- a/.github/ISSUE_TEMPLATE/019-bug-misc.yml
+++ b/.github/ISSUE_TEMPLATE/019-bug-misc.yml
@@ -10,8 +10,6 @@ body:
        This issue template is intended for miscellaneous bugs that don't fit into any other category.
        If you encountered the issue while using an external UI (e.g. ollama),
        please reproduce your issue using one of the examples/binaries in this repository.
-
-        Please fill out this template yourself, copypasting language model outputs is [strictly prohibited](https://github.com/ggml-org/llama.cpp/blob/master/CONTRIBUTING.md#ai-usage-policy).
  - type: textarea
    id: version
    attributes:
@@ -88,8 +86,8 @@ body:
      description: >
          If applicable, please copy and paste any relevant log output, including any generated text.
          If you are encountering problems specifically with the `llama_params_fit` module, always upload `--verbose` logs as well.
-          For very long logs (thousands of lines), please upload them as files instead; the `--log-file` CLI argument can be used for this purpose.
-          On Linux you can alternatively redirect the console output of any command into a file by appending ` > llama.log 2>&1` to your command.
+          For very long logs (thousands of lines), please upload them as files instead.
+          On Linux you can redirect console output into a file by appending ` > llama.log 2>&1` to your command.
      value: |
        <details>
        <summary>Logs</summary>
--- a/.github/ISSUE_TEMPLATE/020-enhancement.yml
+++ b/.github/ISSUE_TEMPLATE/020-enhancement.yml
@@ -8,8 +8,6 @@ body:
      value: |
        [Please post your idea first in Discussion if there is not yet a consensus for this enhancement request. This will help to keep this issue tracker focused on enhancements that the community has agreed needs to be implemented.](https://github.com/ggml-org/llama.cpp/discussions/categories/ideas)

-        Please fill out this template yourself, copypasting language model outputs is [strictly prohibited](https://github.com/ggml-org/llama.cpp/blob/master/CONTRIBUTING.md#ai-usage-policy).
-
  - type: checkboxes
    id: prerequisites
    attributes:
--- a/.github/ISSUE_TEMPLATE/030-research.yml
+++ b/.github/ISSUE_TEMPLATE/030-research.yml
@@ -8,8 +8,6 @@ body:
      value: |
        Don't forget to check for any [duplicate research issue tickets](https://github.com/ggml-org/llama.cpp/issues?q=is%3Aopen+is%3Aissue+label%3A%22research+%F0%9F%94%AC%22)

-        Please fill out this template yourself, copypasting language model outputs is [strictly prohibited](https://github.com/ggml-org/llama.cpp/blob/master/CONTRIBUTING.md#ai-usage-policy).
-
  - type: checkboxes
    id: research-stage
    attributes:
--- a/.github/ISSUE_TEMPLATE/040-refactor.yml
+++ b/.github/ISSUE_TEMPLATE/040-refactor.yml
@@ -9,8 +9,6 @@ body:
        Don't forget to [check for existing refactor issue tickets](https://github.com/ggml-org/llama.cpp/issues?q=is%3Aopen+is%3Aissue+label%3Arefactoring) in case it's already covered.
        Also you may want to check [Pull request refactor label as well](https://github.com/ggml-org/llama.cpp/pulls?q=is%3Aopen+is%3Apr+label%3Arefactoring) for duplicates too.

-        Please fill out this template yourself, copypasting language model outputs is [strictly prohibited](https://github.com/ggml-org/llama.cpp/blob/master/CONTRIBUTING.md#ai-usage-policy).
-
  - type: textarea
    id: background-description
    attributes:
--- a/.github/labeler.yml
+++ b/.github/labeler.yml
@@ -73,10 +73,11 @@ android:
    - changed-files:
        - any-glob-to-any-file:
            - examples/llama.android/**
-server/ui:
+server/webui:
    - changed-files:
        - any-glob-to-any-file:
-            - tools/ui/**
+            - tools/server/webui/**
+            - tools/server/public/**
 server:
    - changed-files:
        - any-glob-to-any-file:
--- a/.github/pull_request_template.md
+++ b/.github/pull_request_template.md
@@ -6,7 +6,7 @@

 <!-- You can provide more details and link related discussions here. Delete this section if not applicable -->

-## Requirements
+# Requirements

 <!-- IMPORTANT: Please do NOT delete this section, otherwise your PR may be rejected -->

--- a/.github/workflows/build-and-test-snapdragon.yml
+++ b/.github/workflows/build-and-test-snapdragon.yml
@@ -1,148 +0,0 @@
-name: CI (snapdragon)
-
-on:
-  workflow_dispatch:
-  push:
-    branches:
-      - master
-    paths:
-      - '.github/workflows/build-and-test-snapdragon.yml'
-      - 'ggml/include/ggml-hexagon.h'
-      - 'ggml/src/ggml-hexagon/**'
-      - 'docs/backend/snapdragon/**'
-      - 'scripts/snapdragon/**'
-      - 'CMakePresets.json'
-
-  pull_request:
-    types: [opened, synchronize, reopened]
-    paths:
-      - '.github/workflows/build-and-test-snapdragon.yml'
-      - 'ggml/include/ggml-hexagon.h'
-      - 'ggml/src/ggml-hexagon/**'
-      - 'docs/backend/snapdragon/**'
-      - 'scripts/snapdragon/**'
-      - 'CMakePresets.json'
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
-  cancel-in-progress: true
-
-jobs:
-  android-ndk-snapdragon:
-    runs-on: ubuntu-latest
-    container:
-      image: 'ghcr.io/snapdragon-toolchain/arm64-android:v0.3'
-    defaults:
-      run:
-        shell: bash
-
-    steps:
-      - name: Clone
-        uses: actions/checkout@v6
-        with:
-          fetch-depth: 0
-          lfs: false
-
-      - name: Build Llama.CPP for Snapdragon Android
-        id: build_llama_cpp_snapdragon_android
-        run: |
-          cp docs/backend/snapdragon/CMakeUserPresets.json .
-          cmake --preset arm64-android-snapdragon-release -B build
-          cmake --build build
-          cmake --install build --prefix pkg-snapdragon/llama.cpp
-
-      - name: Upload Llama.CPP Snapdragon Android Build Artifact
-        if: ${{ always() && steps.build_llama_cpp_snapdragon_android.outcome == 'success' }}
-        uses: actions/upload-artifact@v6
-        with:
-          name: llama-cpp-android-arm64-snapdragon
-          path: pkg-snapdragon/llama.cpp
-
-  linux-iot-snapdragon:
-    runs-on: ubuntu-latest
-    container:
-      image: 'ghcr.io/snapdragon-toolchain/arm64-linux:v0.1'
-    defaults:
-      run:
-        shell: bash
-
-    steps:
-      - name: Clone
-        uses: actions/checkout@v6
-        with:
-          fetch-depth: 0
-          lfs: false
-
-      - name: Build Llama.CPP for Snapdragon Linux IoT
-        id: build_llama_cpp_snapdragon_linux
-        run: |
-          cp docs/backend/snapdragon/CMakeUserPresets.json .
-          cmake --preset arm64-linux-snapdragon-release -B build-snapdragon -DGGML_OPENCL=ON
-          cmake --build build-snapdragon -j $(nproc)
-          cmake --install build-snapdragon --prefix pkg-snapdragon/llama.cpp
-
-      - name: Upload Llama.CPP Snapdragon Linux IoT Build Artifact
-        if: ${{ always() && steps.build_llama_cpp_snapdragon_linux.outcome == 'success' }}
-        uses: actions/upload-artifact@v6
-        with:
-          name: llama-cpp-linux-arm64-snapdragon
-          path: pkg-snapdragon/llama.cpp
-
-  test-snapdragon-qdc:
-    name: Test on QDC Device (${{ matrix.device }})
-    needs: [android-ndk-snapdragon, linux-iot-snapdragon]
-    runs-on: ubuntu-24.04-arm
-    timeout-minutes: 90
-    strategy:
-      fail-fast: false
-      matrix:
-        device: [SM8750, SM8850, QCS9075M]
-
-    steps:
-      - name: Checkout
-        uses: actions/checkout@v6
-
-      - name: Download build artifact
-        uses: actions/download-artifact@v7
-        with:
-          name: ${{ startsWith(matrix.device, 'QCS') && 'llama-cpp-linux-arm64-snapdragon' || 'llama-cpp-android-arm64-snapdragon' }}
-          path: pkg-snapdragon/llama.cpp
-
-      - name: Set up Python
-        uses: actions/setup-python@v6
-        with:
-          python-version: '3.x'
-          cache: pip
-
-      - name: Install system dependencies
-        run: |
-          sudo apt-get update
-          sudo apt-get install -y curl unzip
-
-      - name: Install QDC SDK wheel
-        run: |
-          curl -fSL -o qdc_sdk.zip https://softwarecenter.qualcomm.com/api/download/software/tools/Qualcomm_Device_Cloud_SDK/All/0.2.3/qualcomm_device_cloud_sdk-0.2.3.zip
-          unzip qdc_sdk.zip -d qdc_sdk
-          pip install qdc_sdk/qualcomm_device_cloud_sdk-0.2.3-py3-none-any.whl
-
-      - name: Check QDC API key
-        id: check_secret
-        env:
-          QDC_API_KEY: ${{ secrets.QDC_API_KEY }}
-        run: echo "has-qdc-key=${{ env.QDC_API_KEY != '' }}" >> "$GITHUB_OUTPUT"
-
-      - name: Run QDC tests (${{ matrix.device }})
-        if: steps.check_secret.outputs.has-qdc-key == 'true'
-        run: |
-          python scripts/snapdragon/qdc/run_qdc_jobs.py \
-              --test       all \
-              --pkg-dir    pkg-snapdragon/llama.cpp \
-              --model-url  "https://huggingface.co/bartowski/Llama-3.2-1B-Instruct-GGUF/resolve/main/Llama-3.2-1B-Instruct-Q4_0.gguf" \
-              --device     ${{ matrix.device }} \
-              ${{ startsWith(matrix.device, 'QCS') && '--retries 2 --retry-delay 300' || '' }}
-        env:
-          QDC_API_KEY: ${{ secrets.QDC_API_KEY }}
-
-      - name: Cleanup
-        if: always()
-        run: rm -rf pkg-snapdragon qdc_sdk qdc_sdk.zip
--- a/.github/workflows/build-android.yml
+++ b/.github/workflows/build-android.yml
@@ -1,24 +1,26 @@
 name: CI (android)

 on:
-  workflow_dispatch:
+  workflow_dispatch: # allows manual triggering
  push:
    branches:
      - master
-    paths:
-      - '.github/workflows/build-android.yml'
-      - '**/CMakeLists.txt'
-      - '**/.cmake'
-      - '**/*.h'
-      - '**/*.hpp'
-      - '**/*.c'
-      - '**/*.cpp'
+    paths: [
+      '.github/workflows/build-android.yml',
+      '**/CMakeLists.txt',
+      '**/.cmake',
+      '**/*.h',
+      '**/*.hpp',
+      '**/*.c',
+      '**/*.cpp'
+    ]

  pull_request:
    types: [opened, synchronize, reopened]
-    paths:
-      - '.github/workflows/build-android.yml'
-      - 'examples/llama.android/**'
+    paths: [
+      '.github/workflows/build-android.yml',
+      'examples/llama.android/**'
+    ]

 concurrency:
  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
@@ -65,24 +67,35 @@ jobs:
    defaults:
      run:
        shell: bash
+    strategy:
+      matrix:
+        include:
+          - build: 'arm64-cpu'
+            defines: '-D ANDROID_ABI=arm64-v8a -D ANDROID_PLATFORM=android-31 -D CMAKE_TOOLCHAIN_FILE=${ANDROID_NDK_ROOT}/build/cmake/android.toolchain.cmake -D GGML_NATIVE=OFF -DGGML_CPU_ARM_ARCH=armv8.5-a+fp16+i8mm -G Ninja -D LLAMA_OPENSSL=OFF -D GGML_OPENMP=OFF'
+          - build: 'arm64-snapdragon'
+            defines: '--preset arm64-android-snapdragon-release'

    steps:
      - name: Clone
+        id: checkout
        uses: actions/checkout@v6
        with:
          fetch-depth: 0
          lfs: false

-      - name: Build
-        id: ndk_build
+      - name: Build Llama.CPP for Hexagon Android
+        id: build_llama_cpp_hexagon_android
        run: |
-          cmake -D ANDROID_ABI=arm64-v8a -D ANDROID_PLATFORM=android-31 -D CMAKE_TOOLCHAIN_FILE=${ANDROID_NDK_ROOT}/build/cmake/android.toolchain.cmake -D GGML_NATIVE=OFF -DGGML_CPU_ARM_ARCH=armv8.5-a+fp16+i8mm -G Ninja -D LLAMA_OPENSSL=OFF -D GGML_OPENMP=OFF -B build
+          if [[ "${{ matrix.build }}" == "arm64-snapdragon" ]]; then
+            cp docs/backend/snapdragon/CMakeUserPresets.json .
+          fi
+          cmake ${{ matrix.defines }} -B build
          cmake --build build
          cmake --install build --prefix pkg-adb/llama.cpp

-      - name: Upload Android Build Artifact
-        if: ${{ always() && steps.ndk_build.outcome == 'success' }}
+      - name: Upload Llama.CPP Hexagon Android Build Artifact
+        if: ${{ always() && steps.build_llama_cpp_hexagon_android.outcome == 'success' }}
        uses: actions/upload-artifact@v6
        with:
-          name: llama-cpp-android-arm64-cpu
+          name: llama-cpp-android-${{ matrix.build }}
          path: pkg-adb/llama.cpp
--- a/.github/workflows/build-cross.yml
+++ b/.github/workflows/build-cross.yml
@@ -301,17 +301,16 @@ jobs:
          export RISCV_ROOT_PATH=${PWD}/spacemit_toolchain
          cmake -B build -DLLAMA_OPENSSL=OFF \
                         -DCMAKE_BUILD_TYPE=Release \
+                         -DGGML_OPENMP=OFF \
                         -DLLAMA_BUILD_EXAMPLES=ON \
-                         -DGGML_CPU_REPACK=OFF \
                         -DLLAMA_BUILD_TOOLS=ON \
                         -DLLAMA_BUILD_TESTS=OFF \
                         -DGGML_CPU_RISCV64_SPACEMIT=ON \
                         -DGGML_RVV=ON \
-                         -DGGML_RV_ZVFH=ON \
                         -DGGML_RV_ZFH=ON \
                         -DGGML_RV_ZICBOP=ON \
                         -DGGML_RV_ZIHINTPAUSE=ON \
-                         -DGGML_RV_ZBA=ON \
+                         -DRISCV64_SPACEMIT_IME_SPEC=RISCV64_SPACEMIT_IME1 \
                         -DCMAKE_TOOLCHAIN_FILE=${PWD}/cmake/riscv64-spacemit-linux-gnu-gcc.cmake

          cmake --build build --config Release -j $(nproc)
--- a/.github/workflows/build-openvino.yml
+++ b/.github/workflows/build-openvino.yml
@@ -1,120 +0,0 @@
-name: CI (openvino)
-
-on:
-  workflow_dispatch: # allows manual triggering
-  push:
-    branches:
-      - master
-    paths: [
-      '.github/workflows/build-openvino.yml',
-      '**/CMakeLists.txt',
-      '**/.cmake',
-      '**/*.h',
-      '**/*.hpp',
-      '**/*.c',
-      '**/*.cpp',
-    ]
-
-  pull_request:
-    types: [opened, synchronize, reopened]
-    paths: [
-      '.github/workflows/build-openvino.yml',
-      'ggml/src/ggml-openvino/**'
-    ]
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
-  cancel-in-progress: true
-
-env:
-  GGML_NLOOP: 3
-  GGML_N_THREADS: 1
-  LLAMA_LOG_COLORS: 1
-  LLAMA_LOG_PREFIX: 1
-  LLAMA_LOG_TIMESTAMPS: 1
-
-jobs:
-  ubuntu-24-openvino:
-    name: ubuntu-24-openvino-${{ matrix.openvino_device }}
-
-    concurrency:
-      group: openvino-${{ matrix.variant }}-${{ github.head_ref || github.ref }}
-      cancel-in-progress: false
-
-    strategy:
-      matrix:
-        include:
-          - variant: cpu
-            runner: '"ubuntu-24.04"'
-            openvino_device: "CPU"
-          - variant: gpu
-            runner: '["self-hosted","Linux","Intel","OpenVINO"]'
-            openvino_device: "GPU"
-
-    runs-on: ${{ fromJSON(matrix.runner) }}
-
-    env:
-      # Sync versions in build-openvino.yml, build-self-hosted.yml, release.yml, build-cache.yml, .devops/openvino.Dockerfile
-      OPENVINO_VERSION_MAJOR: "2026.0"
-      OPENVINO_VERSION_FULL: "2026.0.0.20965.c6d6a13a886"
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v6
-
-      - name: ccache
-        if: runner.environment == 'github-hosted'
-        uses: ggml-org/ccache-action@v1.2.21
-        with:
-          key: ubuntu-24-openvino-${{ matrix.variant }}-no-preset-v1
-          evict-old-files: 1d
-          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
-
-      - name: Dependencies
-        id: depends
-        run: |
-          sudo apt-get update
-          sudo apt-get install -y build-essential libssl-dev libtbb12 cmake ninja-build python3-pip
-          sudo apt-get install -y ocl-icd-opencl-dev opencl-headers opencl-clhpp-headers intel-opencl-icd
-
-      - name: Use OpenVINO Toolkit Cache
-        if: runner.environment == 'github-hosted'
-        uses: actions/cache@v5
-        id: cache-openvino
-        with:
-          path: ./openvino_toolkit
-          key: openvino-toolkit-v${{ env.OPENVINO_VERSION_FULL }}-${{ runner.os }}
-
-      - name: Setup OpenVINO Toolkit
-        if: steps.cache-openvino.outputs.cache-hit != 'true'
-        uses: ./.github/actions/linux-setup-openvino
-        with:
-          path: ./openvino_toolkit
-          version_major: ${{ env.OPENVINO_VERSION_MAJOR }}
-          version_full: ${{ env.OPENVINO_VERSION_FULL }}
-
-      - name: Install OpenVINO dependencies
-        run: |
-          cd ./openvino_toolkit
-          chmod +x ./install_dependencies/install_openvino_dependencies.sh
-          echo "Y" | sudo -E ./install_dependencies/install_openvino_dependencies.sh
-
-      - name: Build
-        id: cmake_build
-        run: |
-          source ./openvino_toolkit/setupvars.sh
-          cmake -B build/ReleaseOV -G Ninja \
-            -DCMAKE_BUILD_TYPE=Release \
-            -DGGML_OPENVINO=ON
-          time cmake --build build/ReleaseOV --config Release -j $(nproc)
-
-      - name: Test
-        id: cmake_test
-        # TODO: fix and re-enable the `test-llama-archs` test below
-        run: |
-          cd ${{ github.workspace }}
-          if [ "${{ matrix.openvino_device }}" = "GPU" ]; then
-            export GGML_OPENVINO_DEVICE=GPU
-          fi
-          ctest --test-dir build/ReleaseOV -L main -E "test-llama-archs" --verbose --timeout 2000
--- a/.github/workflows/build-self-hosted.yml
+++ b/.github/workflows/build-self-hosted.yml
@@ -55,24 +55,7 @@ env:
  LLAMA_LOG_TIMESTAMPS: 1

 jobs:
-  determine-tag:
-    name: Determine tag name
-    runs-on: ubuntu-slim
-    outputs:
-      tag_name: ${{ steps.tag.outputs.name }}
-    steps:
-      - name: Clone
-        uses: actions/checkout@v6
-        with:
-          fetch-depth: 0
-      - name: Determine tag name
-        id: tag
-        uses: ./.github/actions/get-tag-name
-        env:
-          BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
-
  ggml-ci-nvidia-cuda:
-    needs: determine-tag
    runs-on: [self-hosted, Linux, NVIDIA]

    steps:
@@ -82,14 +65,11 @@ jobs:

      - name: Test
        id: ggml-ci
-        env:
-          HF_UI_VERSION: ${{ needs.determine-tag.outputs.tag_name }}
        run: |
          nvidia-smi
          GG_BUILD_CUDA=1 bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp

  ggml-ci-nvidia-vulkan-cm:
-    needs: determine-tag
    runs-on: [self-hosted, Linux, NVIDIA]

    steps:
@@ -99,14 +79,11 @@ jobs:

      - name: Test
        id: ggml-ci
-        env:
-          HF_UI_VERSION: ${{ needs.determine-tag.outputs.tag_name }}
        run: |
          vulkaninfo --summary
          GG_BUILD_VULKAN=1 GGML_VK_DISABLE_COOPMAT2=1 bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp

  ggml-ci-nvidia-vulkan-cm2:
-    needs: determine-tag
    runs-on: [self-hosted, Linux, NVIDIA, COOPMAT2]

    steps:
@@ -116,40 +93,39 @@ jobs:

      - name: Test
        id: ggml-ci
-        env:
-          HF_UI_VERSION: ${{ needs.determine-tag.outputs.tag_name }}
        run: |
          vulkaninfo --summary
          GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp

-  ggml-ci-nvidia-webgpu:
-    runs-on: [self-hosted, Linux, NVIDIA]
+  # TODO: investigate slight precision issues in some operations for test-backend-ops on the WebGPU backend.
+  #ggml-ci-nvidia-webgpu:
+  #  runs-on: [self-hosted, Linux, NVIDIA]

-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v6
+  #  steps:
+  #    - name: Clone
+  #      id: checkout
+  #      uses: actions/checkout@v6

-      - name: Dawn Dependency
-        id: dawn-depends
-        run: |
-          DAWN_VERSION="v20260317.182325"
-          DAWN_OWNER="google"
-          DAWN_REPO="dawn"
-          DAWN_ASSET_NAME="Dawn-18eb229ef5f707c1464cc581252e7603c73a3ef0-ubuntu-latest-Release"
-          echo "Fetching release asset from https://github.com/google/dawn/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}.tar.gz"
-          curl -L -o artifact.tar.gz \
-            "https://github.com/google/dawn/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}.tar.gz"
-          mkdir dawn
-          tar -xvf artifact.tar.gz -C dawn --strip-components=1
+  #    - name: Dawn Dependency
+  #      id: dawn-depends
+  #      run: |
+  #        DAWN_VERSION="v20260317.182325"
+  #        DAWN_OWNER="google"
+  #        DAWN_REPO="dawn"
+  #        DAWN_ASSET_NAME="Dawn-18eb229ef5f707c1464cc581252e7603c73a3ef0-ubuntu-latest-Release"
+  #        echo "Fetching release asset from https://github.com/google/dawn/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}.tar.gz"
+  #        curl -L -o artifact.tar.gz \
+  #          "https://github.com/google/dawn/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}.tar.gz"
+  #        mkdir dawn
+  #        tar -xvf artifact.tar.gz -C dawn --strip-components=1

-      - name: Test
-        id: ggml-ci
-        run: |
-          GG_BUILD_WEBGPU=1 \
-          GG_BUILD_WEBGPU_DAWN_PREFIX="$GITHUB_WORKSPACE/dawn" \
-          GG_BUILD_WEBGPU_DAWN_DIR="$GITHUB_WORKSPACE/dawn/lib64/cmake/Dawn" \
-            bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
+  #    - name: Test
+  #      id: ggml-ci
+  #      run: |
+  #        GG_BUILD_WEBGPU=1 \
+  #        GG_BUILD_WEBGPU_DAWN_PREFIX="$GITHUB_WORKSPACE/dawn" \
+  #        GG_BUILD_WEBGPU_DAWN_DIR="$GITHUB_WORKSPACE/dawn/lib64/cmake/Dawn" \
+  #          bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp

  # TODO: provision AMX-compatible machine
  #ggml-ci-cpu-amx:
@@ -196,7 +172,6 @@ jobs:
  #         GG_BUILD_ROCM=1 GG_BUILD_AMDGPU_TARGETS="gfx1101" bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp

  ggml-ci-mac-metal:
-    needs: determine-tag
    runs-on: [self-hosted, macOS, ARM64]

    steps:
@@ -206,13 +181,10 @@ jobs:

      - name: Test
        id: ggml-ci
-        env:
-          HF_UI_VERSION: ${{ needs.determine-tag.outputs.tag_name }}
        run: |
          GG_BUILD_METAL=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp

  ggml-ci-mac-webgpu:
-    needs: determine-tag
    runs-on: [self-hosted, macOS, ARM64]

    steps:
@@ -235,14 +207,11 @@ jobs:

      - name: Test
        id: ggml-ci
-        env:
-          HF_UI_VERSION: ${{ needs.determine-tag.outputs.tag_name }}
        run: |
          GG_BUILD_WEBGPU=1 GG_BUILD_WEBGPU_DAWN_PREFIX="$GITHUB_WORKSPACE/dawn" \
            bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp

  ggml-ci-mac-vulkan:
-    needs: determine-tag
    runs-on: [self-hosted, macOS, ARM64]

    steps:
@@ -252,14 +221,11 @@ jobs:

      - name: Test
        id: ggml-ci
-        env:
-          HF_UI_VERSION: ${{ needs.determine-tag.outputs.tag_name }}
        run: |
          vulkaninfo --summary
          GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp

  ggml-ci-linux-intel-vulkan:
-    needs: determine-tag
    runs-on: [self-hosted, Linux, Intel]

    steps:
@@ -271,14 +237,11 @@ jobs:

      - name: Test
        id: ggml-ci
-        env:
-          HF_UI_VERSION: ${{ needs.determine-tag.outputs.tag_name }}
        run: |
          vulkaninfo --summary
          GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp

  ggml-ci-win-intel-vulkan:
-    needs: determine-tag
    runs-on: [self-hosted, Windows, X64, Intel]

    steps:
@@ -293,7 +256,6 @@ jobs:
          MSYSTEM: UCRT64
          CHERE_INVOKING: 1
          PATH: C:\msys64\ucrt64\bin;C:\msys64\usr\bin;C:\Windows\System32;${{ env.PATH }}
-          HF_UI_VERSION: ${{ needs.determine-tag.outputs.tag_name }}
        run: |
          vulkaninfo --summary
          # Skip python related tests with GG_BUILD_LOW_PERF=1 since Windows MSYS2 UCRT64 currently fails to create
@@ -301,13 +263,8 @@ jobs:
          LLAMA_FATAL_WARNINGS=OFF GG_BUILD_NINJA=1 GG_BUILD_VULKAN=1 GG_BUILD_LOW_PERF=1 ./ci/run.sh ./results/llama.cpp ./mnt/llama.cpp

  ggml-ci-intel-openvino-gpu-low-perf:
-    needs: determine-tag
    runs-on: [self-hosted, Linux, Intel, OpenVINO]

-    concurrency:
-      group: openvino-gpu-${{ github.head_ref || github.ref }}
-      cancel-in-progress: false
-
    env:
      # Sync versions in build.yml, build-self-hosted.yml, release.yml, build-cache.yml, .devops/openvino.Dockerfile
      OPENVINO_VERSION_MAJOR: "2026.0"
@@ -333,8 +290,6 @@ jobs:

      - name: Test
        id: ggml-ci
-        env:
-          HF_UI_VERSION: ${{ needs.determine-tag.outputs.tag_name }}
        run: |
          source ./openvino_toolkit/setupvars.sh
          GG_BUILD_OPENVINO=1 GGML_OPENVINO_DEVICE=GPU GG_BUILD_LOW_PERF=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
--- a/.github/workflows/build-sycl.yml
+++ b/.github/workflows/build-sycl.yml
@@ -1,160 +0,0 @@
-name: CI (sycl)
-
-on:
-  workflow_dispatch: # allows manual triggering
-  push:
-    branches:
-      - master
-    paths: [
-      '.github/workflows/build-sycl.yml',
-      '**/CMakeLists.txt',
-      '**/.cmake',
-      '**/*.h',
-      '**/*.hpp',
-      '**/*.c',
-      '**/*.cpp'
-    ]
-
-  pull_request:
-    types: [opened, synchronize, reopened]
-    paths: [
-      '.github/workflows/build-sycl.yml',
-      'ggml/src/ggml-sycl/**'
-    ]
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
-  cancel-in-progress: true
-
-env:
-  GGML_NLOOP: 3
-  GGML_N_THREADS: 1
-  LLAMA_LOG_COLORS: 1
-  LLAMA_LOG_PREFIX: 1
-  LLAMA_LOG_TIMESTAMPS: 1
-
-jobs:
-
-  ubuntu-24-sycl:
-    strategy:
-      matrix:
-        build: [fp32, fp16]
-        include:
-          - build: fp32
-            fp16: OFF
-          - build: fp16
-            fp16: ON
-
-    runs-on: ubuntu-24.04
-
-    env:
-      ONEAPI_ROOT: /opt/intel/oneapi/
-      ONEAPI_INSTALLER_VERSION: "2025.3.3"
-      LEVEL_ZERO_VERSION: "1.28.2"
-      LEVEL_ZERO_UBUNTU_VERSION: "u24.04"
-
-    continue-on-error: true
-
-    steps:
-      - uses: actions/checkout@v6
-
-      - name: Use oneAPI Installation Cache
-        uses: actions/cache@v5
-        id: cache-sycl
-        with:
-          path: ${{ env.ONEAPI_ROOT }}
-          key: oneAPI-${{ env.ONEAPI_INSTALLER_VERSION }}-${{ runner.os }}
-
-      - name: Download & Install oneAPI
-        shell: bash
-        if: steps.cache-sycl.outputs.cache-hit != 'true'
-        run: |
-          cd /tmp
-          wget https://registrationcenter-download.intel.com/akdlm/IRC_NAS/56f7923a-adb8-43f3-8b02-2b60fcac8cab/intel-deep-learning-essentials-2025.3.3.16_offline.sh -O intel-deep-learning-essentials_offline.sh
-          sudo bash intel-deep-learning-essentials_offline.sh -s -a --silent --eula accept
-
-      - name: Install Level Zero SDK
-        shell: bash
-        run: |
-          cd /tmp
-          wget -q "https://github.com/oneapi-src/level-zero/releases/download/v${LEVEL_ZERO_VERSION}/level-zero_${LEVEL_ZERO_VERSION}%2B${LEVEL_ZERO_UBUNTU_VERSION}_amd64.deb" -O level-zero.deb
-          wget -q "https://github.com/oneapi-src/level-zero/releases/download/v${LEVEL_ZERO_VERSION}/level-zero-devel_${LEVEL_ZERO_VERSION}%2B${LEVEL_ZERO_UBUNTU_VERSION}_amd64.deb" -O level-zero-devel.deb
-          sudo apt-get install -y ./level-zero.deb ./level-zero-devel.deb
-
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v6
-
-      - name: ccache
-        uses: ggml-org/ccache-action@v1.2.21
-        with:
-          key: ubuntu-24-sycl-${{ matrix.build }}
-          evict-old-files: 1d
-          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
-
-      - name: Build
-        id: cmake_build
-        run: |
-          source /opt/intel/oneapi/setvars.sh
-          cmake -B build \
-            -G "Ninja" \
-            -DCMAKE_BUILD_TYPE=Release \
-            -DGGML_SYCL=ON \
-            -DCMAKE_C_COMPILER=icx \
-            -DCMAKE_CXX_COMPILER=icpx \
-            -DLLAMA_OPENSSL=OFF \
-            -DGGML_NATIVE=OFF \
-            -DGGML_SYCL_F16=${{ matrix.fp16 }}
-          time cmake --build build --config Release -j $(nproc)
-
-  windows-latest-sycl:
-    runs-on: windows-2022
-
-    defaults:
-      run:
-        shell: bash
-
-    env:
-      WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/b60765d1-2b85-4e85-86b6-cb0e9563a699/intel-deep-learning-essentials-2025.3.3.18_offline.exe
-      WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel:intel.oneapi.win.dnnl:intel.oneapi.win.tbb.devel
-      LEVEL_ZERO_SDK_URL: https://github.com/oneapi-src/level-zero/releases/download/v1.28.2/level-zero-win-sdk-1.28.2.zip
-      ONEAPI_ROOT: "C:/Program Files (x86)/Intel/oneAPI"
-      ONEAPI_INSTALLER_VERSION: "2025.3.3"
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v6
-
-      - name: Use oneAPI Installation Cache
-        uses: actions/cache@v5
-        id: cache-sycl
-        with:
-          path: ${{ env.ONEAPI_ROOT }}
-          key: oneAPI-${{ env.ONEAPI_INSTALLER_VERSION }}-${{ runner.os }}
-
-      - name: Download & Install oneAPI
-        shell: bash
-        if: steps.cache-sycl.outputs.cache-hit != 'true'
-        run: |
-          scripts/install-oneapi.bat $WINDOWS_BASEKIT_URL $WINDOWS_DPCPP_MKL
-
-      - name: Install Level Zero SDK
-        shell: pwsh
-        run: |
-          Invoke-WebRequest -Uri "${{ env.LEVEL_ZERO_SDK_URL }}" -OutFile "level-zero-win-sdk.zip"
-          Expand-Archive -Path "level-zero-win-sdk.zip" -DestinationPath "C:/level-zero-sdk" -Force
-          "LEVEL_ZERO_V1_SDK_PATH=C:/level-zero-sdk" | Out-File -FilePath $env:GITHUB_ENV -Append
-
-      - name: ccache
-        uses: ggml-org/ccache-action@v1.2.21
-        with:
-          key: windows-latest-sycl
-          variant: ccache
-          evict-old-files: 1d
-          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
-
-      # TODO: add ssl support ; we will also need to modify win-build-sycl.bat to accept user-specified args
-
-      - name: Build
-        id: cmake_build
-        run:  examples/sycl/win-build-sycl.bat
--- a/.github/workflows/build-virtgpu.yml
+++ b/.github/workflows/build-virtgpu.yml
@@ -1,50 +0,0 @@
-name: CI (virtgpu)
-
-on:
-  workflow_dispatch: # allows manual triggering
-  push:
-    branches:
-      - master
-    paths: [
-      '.github/workflows/build-virtgpu.yml',
-      '**/CMakeLists.txt',
-      '**/.cmake',
-      '**/*.h',
-      '**/*.hpp',
-      '**/*.c',
-      '**/*.cpp'
-    ]
-
-  pull_request:
-    types: [opened, synchronize, reopened]
-    paths: [
-      '.github/workflows/build-virtgpu.yml',
-      'ggml/src/ggml-virtgpu/**'
-    ]
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
-  cancel-in-progress: true
-
-jobs:
-  ubuntu-24-virtgpu:
-    runs-on: ${{ 'ubuntu-24.04-arm' || 'ubuntu-24.04' }}
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v6
-
-      - name: Dependencies
-        id: depends
-        run: |
-          sudo apt-get update
-          sudo apt-get install -y build-essential libdrm-dev pkg-config libssl-dev
-
-      - name: Build
-        id: cmake_build
-        run: |
-          cmake -B build \
-            -DGGML_VIRTGPU=ON \
-            -DGGML_VIRTGPU_BACKEND=ON
-          cmake --build build --config Release -j $(nproc)
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -456,8 +456,7 @@ jobs:
        run: |
          cd build
          # This is using llvmpipe and runs slower than other backends
-          # test-backend-ops is too slow on llvmpipe, skip it
-          ctest -L main -E test-backend-ops --verbose --timeout 900
+          ctest -L main --verbose --timeout 900

  ubuntu-24-webgpu-wasm:
    runs-on: ${{ 'ubuntu-24.04-arm' || 'ubuntu-24.04' }}
@@ -556,6 +555,186 @@ jobs:
            -DGGML_MUSA=ON
          time cmake --build build --config Release -j $(nproc)

+  ubuntu-22-sycl:
+    runs-on: ubuntu-22.04
+
+    continue-on-error: true
+
+    steps:
+      - uses: actions/checkout@v6
+
+      - name: add oneAPI to apt
+        shell: bash
+        run: |
+          cd /tmp
+          wget https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
+          sudo apt-key add GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
+          rm GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
+          sudo add-apt-repository "deb https://apt.repos.intel.com/oneapi all main"
+
+      - name: install oneAPI dpcpp compiler
+        shell: bash
+        run: |
+          sudo apt update
+          sudo apt install intel-oneapi-compiler-dpcpp-cpp libssl-dev
+
+      - name: install oneAPI MKL library
+        shell: bash
+        run: |
+          sudo apt install intel-oneapi-mkl-devel
+
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v6
+
+      - name: ccache
+        uses: ggml-org/ccache-action@v1.2.21
+        with:
+          key: ubuntu-22-sycl
+          evict-old-files: 1d
+          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
+
+      - name: Build
+        id: cmake_build
+        run: |
+          source /opt/intel/oneapi/setvars.sh
+          cmake -B build \
+            -DGGML_SYCL=ON \
+            -DCMAKE_C_COMPILER=icx \
+            -DCMAKE_CXX_COMPILER=icpx
+          time cmake --build build --config Release -j $(nproc)
+
+  ubuntu-22-sycl-fp16:
+    runs-on: ubuntu-22.04
+
+    continue-on-error: true
+
+    steps:
+      - uses: actions/checkout@v6
+
+      - name: add oneAPI to apt
+        shell: bash
+        run: |
+          cd /tmp
+          wget https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
+          sudo apt-key add GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
+          rm GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
+          sudo add-apt-repository "deb https://apt.repos.intel.com/oneapi all main"
+
+      - name: install oneAPI dpcpp compiler
+        shell: bash
+        run: |
+          sudo apt update
+          sudo apt install intel-oneapi-compiler-dpcpp-cpp libssl-dev ninja-build
+
+      - name: install oneAPI MKL library
+        shell: bash
+        run: |
+          sudo apt install intel-oneapi-mkl-devel
+
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v6
+
+      - name: ccache
+        uses: ggml-org/ccache-action@v1.2.21
+        with:
+          key: ubuntu-22-sycl-fp16
+          evict-old-files: 1d
+          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
+
+      - name: Build
+        id: cmake_build
+        run: |
+          source /opt/intel/oneapi/setvars.sh
+          cmake -B build \
+            -G "Ninja" \
+            -DCMAKE_BUILD_TYPE=Release \
+            -DGGML_SYCL=ON \
+            -DCMAKE_C_COMPILER=icx \
+            -DCMAKE_CXX_COMPILER=icpx \
+            -DGGML_SYCL_F16=ON
+          time cmake --build build --config Release -j $(nproc)
+
+  ubuntu-24-openvino:
+      name: ubuntu-24-openvino-${{ matrix.openvino_device }}
+      strategy:
+        matrix:
+          include:
+            - variant: cpu
+              runner: '"ubuntu-24.04"'
+              openvino_device: "CPU"
+            - variant: gpu
+              runner: '["self-hosted","Linux","X64","Intel"]'
+              openvino_device: "GPU"
+
+      runs-on: ${{ fromJSON(matrix.runner) }}
+
+      env:
+        # Sync versions in build.yml, build-self-hosted.yml, release.yml, build-cache.yml, .devops/openvino.Dockerfile
+        OPENVINO_VERSION_MAJOR: "2026.0"
+        OPENVINO_VERSION_FULL: "2026.0.0.20965.c6d6a13a886"
+
+      steps:
+        - name: Clone
+          id: checkout
+          uses: actions/checkout@v6
+
+        - name: ccache
+          if: runner.environment == 'github-hosted'
+          uses: ggml-org/ccache-action@v1.2.21
+          with:
+            key: ubuntu-24-openvino-${{ matrix.variant }}-no-preset-v1
+            evict-old-files: 1d
+            save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
+
+        - name: Dependencies
+          id: depends
+          run: |
+            sudo apt-get update
+            sudo apt-get install -y build-essential libssl-dev libtbb12 cmake ninja-build python3-pip
+            sudo apt-get install -y ocl-icd-opencl-dev opencl-headers opencl-clhpp-headers intel-opencl-icd
+
+        - name: Use OpenVINO Toolkit Cache
+          if: runner.environment == 'github-hosted'
+          uses: actions/cache@v5
+          id: cache-openvino
+          with:
+            path: ./openvino_toolkit
+            key: openvino-toolkit-v${{ env.OPENVINO_VERSION_FULL }}-${{ runner.os }}
+
+        - name: Setup OpenVINO Toolkit
+          if: steps.cache-openvino.outputs.cache-hit != 'true'
+          uses: ./.github/actions/linux-setup-openvino
+          with:
+            path: ./openvino_toolkit
+            version_major: ${{ env.OPENVINO_VERSION_MAJOR }}
+            version_full: ${{ env.OPENVINO_VERSION_FULL }}
+
+        - name: Install OpenVINO dependencies
+          run: |
+            cd ./openvino_toolkit
+            chmod +x ./install_dependencies/install_openvino_dependencies.sh
+            echo "Y" | sudo -E ./install_dependencies/install_openvino_dependencies.sh
+
+        - name: Build
+          id: cmake_build
+          run: |
+            source ./openvino_toolkit/setupvars.sh
+            cmake -B build/ReleaseOV -G Ninja \
+              -DCMAKE_BUILD_TYPE=Release \
+              -DGGML_OPENVINO=ON
+            time cmake --build build/ReleaseOV --config Release -j $(nproc)
+
+        - name: Test
+          id: cmake_test
+          # TODO: fix and re-enable the `test-llama-archs` test below
+          run: |
+            cd ${{ github.workspace }}
+            if [ "${{ matrix.openvino_device }}" = "GPU" ]; then
+              export GGML_OPENVINO_DEVICE=GPU
+            fi
+            ctest --test-dir build/ReleaseOV -L main -E "test-llama-archs" --verbose --timeout 2000

  windows-latest:
    runs-on: windows-2025
@@ -764,6 +943,39 @@ jobs:
          cmake --build build --config Release -j %NINJA_JOBS% -t ggml
          cmake --build build --config Release

+  windows-latest-sycl:
+    runs-on: windows-2022
+
+    defaults:
+      run:
+        shell: bash
+
+    env:
+      WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/24751ead-ddc5-4479-b9e6-f9fe2ff8b9f2/intel-deep-learning-essentials-2025.2.1.25_offline.exe
+      WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel:intel.oneapi.win.dnnl:intel.oneapi.win.tbb.devel
+      ONEAPI_ROOT: "C:/Program Files (x86)/Intel/oneAPI"
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v6
+
+      - name: ccache
+        uses: ggml-org/ccache-action@v1.2.21
+        with:
+          key: windows-latest-sycl
+          variant: ccache
+          evict-old-files: 1d
+          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
+
+      - name: Install
+        run:  |
+          scripts/install-oneapi.bat $WINDOWS_BASEKIT_URL $WINDOWS_DPCPP_MKL
+
+      # TODO: add ssl support ; we will also need to modify win-build-sycl.bat to accept user-specified args
+
+      - name: Build
+        id: cmake_build
+        run:  examples/sycl/win-build-sycl.bat

  windows-latest-hip:
    runs-on: windows-2022
--- a/.github/workflows/code-style.yml
+++ b/.github/workflows/code-style.yml
@@ -1,51 +0,0 @@
-name: Code Style Checker
-
-on:
-  workflow_dispatch: # allows manual triggering
-  push:
-    branches:
-      - master
-  pull_request:
-    branches:
-      - master
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
-  cancel-in-progress: true
-
-jobs:
-  model-naming:
-    runs-on: ubuntu-slim
-    steps:
-      - uses: actions/checkout@v6
-      - name: Check model naming conventions
-        run: |
-          python3 - << 'EOF'
-          import re, os, sys
-
-          pairs = re.findall(
-              r'case\s+(LLM_ARCH_\w+)\s*:\s*\n\s+return new (llama_model_\w+)\s*\(',
-              open("src/llama-model.cpp").read())
-
-          errors = []
-          for arch, cls in pairs:
-              suffix  = arch[len("LLM_ARCH_"):]
-              csuffix = cls[len("llama_model_"):]
-              fname   = csuffix.replace("_", "-") + ".cpp"
-
-              if not re.fullmatch(r'[A-Z][A-Z0-9_]*',   suffix):
-                  errors.append(f"{arch}: suffix not upper snake case, example: LLM_ARCH_MY_MODEL")
-
-              if not re.fullmatch(r'[a-z][a-z0-9_]*', csuffix):
-                  errors.append(f"{arch}: class suffix not lower snake case, example: llama_model_my_model")
-
-              elif suffix.lower() != csuffix:
-                  errors.append(f"{arch}: arch/class name mismatch, expected class 'llama_model_{suffix.lower()}' but got '{cls}'")
-
-              elif not os.path.isfile(f"src/models/{fname}"):
-                  errors.append(f"{arch}: expects model file name to be src/models/{fname}, but not found")
-
-          if errors:
-              print('\n'.join(f"  - {e}" for e in errors)); sys.exit(1)
-          print(f"OK: {len(pairs)} mappings validated.")
-          EOF
--- a/.github/workflows/docker.yml
+++ b/.github/workflows/docker.yml
@@ -11,11 +11,6 @@ name: Publish Docker image

 on:
  workflow_dispatch: # allows manual triggering
-    inputs:
-      skip_s390x:
-        description: "Skip the s390x build target (useful for fast test runs that do not need the IBM Z runner)"
-        type: boolean
-        default: false
  schedule:
    # Rebuild daily rather than on every push because it is expensive
    - cron: '12 4 * * *'
@@ -69,8 +64,6 @@ jobs:
      - name: Generate build and merge matrices
        id: matrices
        shell: bash
-        env:
-          SKIP_S390X: ${{ inputs.skip_s390x || 'false' }}
        run: |
          set -euo pipefail

@@ -93,11 +86,6 @@ jobs:
          ]
          JSON

-          if [ "${SKIP_S390X}" = "true" ]; then
-            jq 'map(select(.platforms != "linux/s390x"))' build-matrix.json > build-matrix.json.tmp
-            mv build-matrix.json.tmp build-matrix.json
-          fi
-
          BUILD_MATRIX="$(jq -c . build-matrix.json)"
          MERGE_MATRIX="$(jq -c '
            reduce .[] as $entry ({}; .[$entry.tag] |= (
@@ -144,7 +132,6 @@ jobs:
        config: ${{ fromJSON(needs.prepare_matrices.outputs.build_matrix) }}
    steps:
      - name: Check out the repo
-        id: checkout
        uses: actions/checkout@v6
        with:
          fetch-depth: 0
@@ -200,10 +187,6 @@ jobs:
        env:
          GITHUB_REPOSITORY_OWNER: '${{ github.repository_owner }}'

-      - name: Get build date
-        id: build_date
-        run: echo "date=$(date -u +"%Y-%m-%dT%H:%M:%SZ")" >> $GITHUB_OUTPUT
-
      - name: Free Disk Space (Ubuntu)
        if: ${{ matrix.config.free_disk_space == true }}
        uses: ggml-org/free-disk-space@v1.3.1
@@ -228,26 +211,13 @@ jobs:
        with:
          context: .
          platforms: ${{ matrix.config.platforms }}
-          outputs: type=image,name=${{ steps.meta.outputs.image_repo }},push-by-digest=true,name-canonical=true,push=true,oci-mediatypes=true
+          outputs: type=image,name=${{ steps.meta.outputs.image_repo }},push-by-digest=true,name-canonical=true,push=true
          file: ${{ matrix.config.dockerfile }}
          target: full
          provenance: false
          build-args: |
-            BUILD_DATE=${{ steps.build_date.outputs.date }}
-            APP_VERSION=${{ needs.create_tag.outputs.source_tag }}
-            APP_REVISION=${{ steps.checkout.outputs.commit }}
-            IMAGE_URL=${{ github.server_url }}/${{ github.repository }}
-            IMAGE_SOURCE=${{ github.server_url }}/${{ github.repository }}
            ${{ matrix.config.ubuntu_version && format('UBUNTU_VERSION={0}', matrix.config.ubuntu_version) || '' }}
            ${{ matrix.config.cuda_version && format('CUDA_VERSION={0}', matrix.config.cuda_version) || '' }}
-          annotations: |
-            manifest:org.opencontainers.image.created=${{ steps.build_date.outputs.date }}
-            manifest:org.opencontainers.image.version=${{ needs.create_tag.outputs.source_tag }}
-            manifest:org.opencontainers.image.revision=${{ steps.checkout.outputs.commit }}
-            manifest:org.opencontainers.image.title=llama.cpp
-            manifest:org.opencontainers.image.description=LLM inference in C/C++
-            manifest:org.opencontainers.image.url=${{ github.server_url }}/${{ github.repository }}
-            manifest:org.opencontainers.image.source=${{ github.server_url }}/${{ github.repository }}
          # using github experimental cache
          #cache-from: type=gha
          #cache-to: type=gha,mode=max
@@ -265,26 +235,13 @@ jobs:
        with:
          context: .
          platforms: ${{ matrix.config.platforms }}
-          outputs: type=image,name=${{ steps.meta.outputs.image_repo }},push-by-digest=true,name-canonical=true,push=true,oci-mediatypes=true
+          outputs: type=image,name=${{ steps.meta.outputs.image_repo }},push-by-digest=true,name-canonical=true,push=true
          file: ${{ matrix.config.dockerfile }}
          target: light
          provenance: false
          build-args: |
-            BUILD_DATE=${{ steps.build_date.outputs.date }}
-            APP_VERSION=${{ needs.create_tag.outputs.source_tag }}
-            APP_REVISION=${{ steps.checkout.outputs.commit }}
-            IMAGE_URL=${{ github.server_url }}/${{ github.repository }}
-            IMAGE_SOURCE=${{ github.server_url }}/${{ github.repository }}
            ${{ matrix.config.ubuntu_version && format('UBUNTU_VERSION={0}', matrix.config.ubuntu_version) || '' }}
            ${{ matrix.config.cuda_version && format('CUDA_VERSION={0}', matrix.config.cuda_version) || '' }}
-          annotations: |
-            manifest:org.opencontainers.image.created=${{ steps.build_date.outputs.date }}
-            manifest:org.opencontainers.image.version=${{ needs.create_tag.outputs.source_tag }}
-            manifest:org.opencontainers.image.revision=${{ steps.checkout.outputs.commit }}
-            manifest:org.opencontainers.image.title=llama.cpp
-            manifest:org.opencontainers.image.description=LLM inference in C/C++
-            manifest:org.opencontainers.image.url=${{ github.server_url }}/${{ github.repository }}
-            manifest:org.opencontainers.image.source=${{ github.server_url }}/${{ github.repository }}
          # using github experimental cache
          #cache-from: type=gha
          #cache-to: type=gha,mode=max
@@ -302,26 +259,13 @@ jobs:
        with:
          context: .
          platforms: ${{ matrix.config.platforms }}
-          outputs: type=image,name=${{ steps.meta.outputs.image_repo }},push-by-digest=true,name-canonical=true,push=true,oci-mediatypes=true
+          outputs: type=image,name=${{ steps.meta.outputs.image_repo }},push-by-digest=true,name-canonical=true,push=true
          file: ${{ matrix.config.dockerfile }}
          target: server
          provenance: false
          build-args: |
-            BUILD_DATE=${{ steps.build_date.outputs.date }}
-            APP_VERSION=${{ needs.create_tag.outputs.source_tag }}
-            APP_REVISION=${{ steps.checkout.outputs.commit }}
-            IMAGE_URL=${{ github.server_url }}/${{ github.repository }}
-            IMAGE_SOURCE=${{ github.server_url }}/${{ github.repository }}
            ${{ matrix.config.ubuntu_version && format('UBUNTU_VERSION={0}', matrix.config.ubuntu_version) || '' }}
            ${{ matrix.config.cuda_version && format('CUDA_VERSION={0}', matrix.config.cuda_version) || '' }}
-          annotations: |
-            manifest:org.opencontainers.image.created=${{ steps.build_date.outputs.date }}
-            manifest:org.opencontainers.image.version=${{ needs.create_tag.outputs.source_tag }}
-            manifest:org.opencontainers.image.revision=${{ steps.checkout.outputs.commit }}
-            manifest:org.opencontainers.image.title=llama.cpp
-            manifest:org.opencontainers.image.description=LLM inference in C/C++
-            manifest:org.opencontainers.image.url=${{ github.server_url }}/${{ github.repository }}
-            manifest:org.opencontainers.image.source=${{ github.server_url }}/${{ github.repository }}
          # using github experimental cache
          #cache-from: type=gha
          #cache-to: type=gha,mode=max
@@ -386,15 +330,10 @@ jobs:

    steps:
      - name: Check out the repo
-        id: checkout
        uses: actions/checkout@v6
        with:
          fetch-depth: 0

-      - name: Get build date
-        id: build_date
-        run: echo "date=$(date -u +"%Y-%m-%dT%H:%M:%SZ")" >> $GITHUB_OUTPUT
-
      - name: Download digest metadata
        uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # v8
        with:
@@ -422,8 +361,6 @@ jobs:
          IMAGE_REPO="ghcr.io/${REPO_OWNER}/${REPO_NAME}"
          PREFIX="${IMAGE_REPO}:"
          SRC_TAG="${{ needs.create_tag.outputs.source_tag }}"
-          BUILD_DATE="${{ steps.build_date.outputs.date }}"
-          COMMIT_SHA="${{ steps.checkout.outputs.commit }}"
          TAGS="${{ matrix.config.tag }}"
          ARCHES="${{ matrix.config.arches }}"
          DIGEST_GLOB="/tmp/digests/*.tsv"
@@ -475,21 +412,11 @@ jobs:
                  refs+=("${IMAGE_REPO}@${digest}")
              done

-              local annotations=(
-                  --annotation "index:org.opencontainers.image.created=${BUILD_DATE}"
-                  --annotation "index:org.opencontainers.image.version=${SRC_TAG}"
-                  --annotation "index:org.opencontainers.image.revision=${COMMIT_SHA}"
-                  --annotation "index:org.opencontainers.image.title=llama.cpp"
-                  --annotation "index:org.opencontainers.image.description=LLM inference in C/C++"
-                  --annotation "index:org.opencontainers.image.url=${{ github.server_url }}/${{ github.repository }}"
-                  --annotation "index:org.opencontainers.image.source=${{ github.server_url }}/${{ github.repository }}"
-              )
-
              echo "Creating ${merged_tag} from ${refs[*]}"
-              docker buildx imagetools create "${annotations[@]}" --tag "${merged_tag}" "${refs[@]}"
+              docker buildx imagetools create --tag "${merged_tag}" "${refs[@]}"

              echo "Creating ${merged_versioned_tag} from ${refs[*]}"
-              docker buildx imagetools create "${annotations[@]}" --tag "${merged_versioned_tag}" "${refs[@]}"
+              docker buildx imagetools create --tag "${merged_versioned_tag}" "${refs[@]}"
          }

          for tag in $TAGS; do
--- a/.github/workflows/editorconfig.yml
+++ b/.github/workflows/editorconfig.yml
@@ -2,6 +2,11 @@ name: EditorConfig Checker

 on:
  workflow_dispatch: # allows manual triggering
+    inputs:
+      create_release:
+        description: 'Create new release'
+        required: true
+        type: boolean
  push:
    branches:
      - master
--- a/.github/workflows/gguf-publish.yml
+++ b/.github/workflows/gguf-publish.yml
@@ -29,10 +29,10 @@ jobs:
      uses: actions/setup-python@v6
      with:
        python-version: '3.11'
-        pip-install: poetry==2.4.0
    - name: Install dependencies
      run: |
        cd gguf-py
+        python -m pip install poetry==2.3.2
        poetry install

    - name: Build package
--- a/.github/workflows/python-type-check.yml
+++ b/.github/workflows/python-type-check.yml
@@ -31,7 +31,7 @@ jobs:
        uses: actions/setup-python@v6
        with:
          python-version: "3.11"
-          pip-install: -r requirements/requirements-all.txt ty==0.0.35
+          pip-install: -r requirements/requirements-all.txt ty==0.0.26
      # - name: Type-check with Pyright
      #   uses: jakebailey/pyright-action@v2
      #   with:
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -36,9 +36,7 @@ env:
  CMAKE_ARGS: "-DLLAMA_BUILD_EXAMPLES=OFF -DLLAMA_BUILD_TESTS=OFF -DLLAMA_BUILD_TOOLS=ON -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON"

 jobs:
-
  macOS-cpu:
-
    strategy:
      matrix:
        include:
@@ -66,13 +64,6 @@ jobs:
        with:
          fetch-depth: 0

-      - name: Setup Node.js
-        uses: actions/setup-node@v6
-        with:
-          node-version: "24"
-          cache: "npm"
-          cache-dependency-path: "tools/ui/package-lock.json"
-
      - name: ccache
        uses: ggml-org/ccache-action@v1.2.21
        with:
@@ -100,7 +91,7 @@ jobs:
        id: pack_artifacts
        run: |
          cp LICENSE ./build/bin/
-          tar -czvf llama-${{ steps.tag.outputs.name }}-bin-macos-${{ matrix.build }}.tar.gz -s ",^\.,llama-${{ steps.tag.outputs.name }}," -C ./build/bin .
+          tar -czvf llama-${{ steps.tag.outputs.name }}-bin-macos-${{ matrix.build }}.tar.gz -s ",./,llama-${{ steps.tag.outputs.name }}/," -C ./build/bin .

      - name: Upload artifacts
        uses: actions/upload-artifact@v6
@@ -109,7 +100,6 @@ jobs:
          name: llama-bin-macos-${{ matrix.build }}.tar.gz

  ubuntu-cpu:
-
    strategy:
      matrix:
        include:
@@ -129,13 +119,6 @@ jobs:
        with:
          fetch-depth: 0

-      - name: Setup Node.js
-        uses: actions/setup-node@v6
-        with:
-          node-version: "24"
-          cache: "npm"
-          cache-dependency-path: "tools/ui/package-lock.json"
-
      - name: ccache
        if: ${{ matrix.build != 's390x' }}
        uses: ggml-org/ccache-action@v1.2.21
@@ -177,7 +160,7 @@ jobs:
        id: pack_artifacts
        run: |
          cp LICENSE ./build/bin/
-          tar -czvf llama-${{ steps.tag.outputs.name }}-bin-ubuntu-${{ matrix.build }}.tar.gz --transform "s,^\.,llama-${{ steps.tag.outputs.name }}," -C ./build/bin .
+          tar -czvf llama-${{ steps.tag.outputs.name }}-bin-ubuntu-${{ matrix.build }}.tar.gz --transform "s,./,llama-${{ steps.tag.outputs.name }}/," -C ./build/bin .

      - name: Upload artifacts
        uses: actions/upload-artifact@v6
@@ -186,7 +169,6 @@ jobs:
          name: llama-bin-ubuntu-${{ matrix.build }}.tar.gz

  ubuntu-vulkan:
-
    strategy:
      matrix:
        include:
@@ -204,13 +186,6 @@ jobs:
        with:
          fetch-depth: 0

-      - name: Setup Node.js
-        uses: actions/setup-node@v6
-        with:
-          node-version: "24"
-          cache: "npm"
-          cache-dependency-path: "tools/ui/package-lock.json"
-
      - name: ccache
        uses: ggml-org/ccache-action@v1.2.21
        with:
@@ -253,7 +228,7 @@ jobs:
        id: pack_artifacts
        run: |
          cp LICENSE ./build/bin/
-          tar -czvf llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-${{ matrix.build }}.tar.gz --transform "s,^\.,llama-${{ steps.tag.outputs.name }}," -C ./build/bin .
+          tar -czvf llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-${{ matrix.build }}.tar.gz --transform "s,./,llama-${{ steps.tag.outputs.name }}/," -C ./build/bin .

      - name: Upload artifacts
        uses: actions/upload-artifact@v6
@@ -262,7 +237,6 @@ jobs:
          name: llama-bin-ubuntu-vulkan-${{ matrix.build }}.tar.gz

  android-arm64:
-
    runs-on: ubuntu-latest

    env:
@@ -275,13 +249,6 @@ jobs:
        with:
          fetch-depth: 0

-      - name: Setup Node.js
-        uses: actions/setup-node@v6
-        with:
-          node-version: "24"
-          cache: "npm"
-          cache-dependency-path: "tools/ui/package-lock.json"
-
      - name: ccache
        uses: ggml-org/ccache-action@v1.2.21
        with:
@@ -330,7 +297,7 @@ jobs:
        id: pack_artifacts
        run: |
          cp LICENSE ./build/bin/
-          tar -czvf llama-${{ steps.tag.outputs.name }}-bin-android-arm64.tar.gz --transform "s,^\.,llama-${{ steps.tag.outputs.name }}," -C ./build/bin .
+          tar -czvf llama-${{ steps.tag.outputs.name }}-bin-android-arm64.tar.gz --transform "s,./,llama-${{ steps.tag.outputs.name }}/," -C ./build/bin .

      - name: Upload artifacts
        uses: actions/upload-artifact@v6
@@ -339,7 +306,6 @@ jobs:
          name: llama-bin-android-arm64.tar.gz

  ubuntu-24-openvino:
-
    runs-on: ubuntu-24.04

    outputs:
@@ -361,13 +327,6 @@ jobs:
        with:
          fetch-depth: 0

-      - name: Setup Node.js
-        uses: actions/setup-node@v6
-        with:
-          node-version: "24"
-          cache: "npm"
-          cache-dependency-path: "tools/ui/package-lock.json"
-
      - name: ccache
        uses: ggml-org/ccache-action@v1.2.21
        with:
@@ -418,7 +377,7 @@ jobs:
        id: pack_artifacts
        run: |
          cp LICENSE ./build/ReleaseOV/bin/
-          tar -czvf llama-${{ steps.tag.outputs.name }}-bin-ubuntu-openvino-${{ env.OPENVINO_VERSION_MAJOR }}-x64.tar.gz --transform "s,^\.,llama-${{ steps.tag.outputs.name }}," -C ./build/ReleaseOV/bin .
+          tar -czvf llama-${{ steps.tag.outputs.name }}-bin-ubuntu-openvino-${{ env.OPENVINO_VERSION_MAJOR }}-x64.tar.gz --transform "s,./,llama-${{ steps.tag.outputs.name }}/," -C ./build/ReleaseOV/bin .

      - name: Upload artifacts
        uses: actions/upload-artifact@v6
@@ -427,7 +386,6 @@ jobs:
          name: llama-bin-ubuntu-openvino-${{ env.OPENVINO_VERSION_MAJOR }}-x64.tar.gz

  windows-cpu:
-
    runs-on: windows-2025

    strategy:
@@ -442,13 +400,6 @@ jobs:
        with:
          fetch-depth: 0

-      - name: Setup Node.js
-        uses: actions/setup-node@v6
-        with:
-          node-version: "24"
-          cache: "npm"
-          cache-dependency-path: "tools/ui/package-lock.json"
-
      - name: ccache
        uses: ggml-org/ccache-action@v1.2.21
        with:
@@ -487,7 +438,6 @@ jobs:
          name: llama-bin-win-cpu-${{ matrix.arch }}.zip

  windows:
-
    runs-on: windows-2025

    env:
@@ -511,13 +461,6 @@ jobs:
        id: checkout
        uses: actions/checkout@v6

-      - name: Setup Node.js
-        uses: actions/setup-node@v6
-        with:
-          node-version: "24"
-          cache: "npm"
-          cache-dependency-path: "tools/ui/package-lock.json"
-
      - name: ccache
        uses: ggml-org/ccache-action@v1.2.21
        with:
@@ -577,7 +520,6 @@ jobs:
          name: llama-bin-win-${{ matrix.backend }}-${{ matrix.arch }}.zip

  windows-cuda:
-
    runs-on: windows-2022

    strategy:
@@ -589,13 +531,6 @@ jobs:
        id: checkout
        uses: actions/checkout@v6

-      - name: Setup Node.js
-        uses: actions/setup-node@v6
-        with:
-          node-version: "24"
-          cache: "npm"
-          cache-dependency-path: "tools/ui/package-lock.json"
-
      - name: Install ccache
        uses: ggml-org/ccache-action@v1.2.21
        with:
@@ -656,7 +591,6 @@ jobs:
          name: cudart-llama-bin-win-cuda-${{ matrix.cuda }}-x64.zip

  windows-sycl:
-
    runs-on: windows-2022

    defaults:
@@ -664,44 +598,15 @@ jobs:
        shell: bash

    env:
-      WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/b60765d1-2b85-4e85-86b6-cb0e9563a699/intel-deep-learning-essentials-2025.3.3.18_offline.exe
+      WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/24751ead-ddc5-4479-b9e6-f9fe2ff8b9f2/intel-deep-learning-essentials-2025.2.1.25_offline.exe
      WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel:intel.oneapi.win.dnnl:intel.oneapi.win.tbb.devel
-      LEVEL_ZERO_SDK_URL: https://github.com/oneapi-src/level-zero/releases/download/v1.28.2/level-zero-win-sdk-1.28.2.zip
      ONEAPI_ROOT: "C:/Program Files (x86)/Intel/oneAPI"
-      ONEAPI_INSTALLER_VERSION: "2025.3.3"

    steps:
      - name: Clone
        id: checkout
        uses: actions/checkout@v6

-      - name: Use oneAPI Installation Cache
-        uses: actions/cache@v5
-        id: cache-sycl
-        with:
-          path: ${{ env.ONEAPI_ROOT }}
-          key: oneAPI-${{ env.ONEAPI_INSTALLER_VERSION }}-${{ runner.os }}
-
-      - name: Download & Install oneAPI
-        shell: bash
-        if: steps.cache-sycl.outputs.cache-hit != 'true'
-        run: |
-          scripts/install-oneapi.bat $WINDOWS_BASEKIT_URL $WINDOWS_DPCPP_MKL
-
-      - name: Install Level Zero SDK
-        shell: pwsh
-        run: |
-          Invoke-WebRequest -Uri "${{ env.LEVEL_ZERO_SDK_URL }}" -OutFile "level-zero-win-sdk.zip"
-          Expand-Archive -Path "level-zero-win-sdk.zip" -DestinationPath "C:/level-zero-sdk" -Force
-          "LEVEL_ZERO_V1_SDK_PATH=C:/level-zero-sdk" | Out-File -FilePath $env:GITHUB_ENV -Append
-
-      - name: Setup Node.js
-        uses: actions/setup-node@v6
-        with:
-          node-version: "24"
-          cache: "npm"
-          cache-dependency-path: "tools/ui/package-lock.json"
-
      - name: ccache
        uses: ggml-org/ccache-action@v1.2.21
        with:
@@ -709,6 +614,10 @@ jobs:
          variant: ccache
          evict-old-files: 1d

+      - name: Install
+        run:  |
+          scripts/install-oneapi.bat $WINDOWS_BASEKIT_URL $WINDOWS_DPCPP_MKL
+
      - name: Build
        id: cmake_build
        shell: cmd
@@ -736,13 +645,6 @@ jobs:
          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_adapter_opencl.dll" ./build/bin
          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_loader.dll" ./build/bin
          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_win_proxy_loader.dll" ./build/bin
-          ZE_LOADER_DLL=$(find "${{ env.ONEAPI_ROOT }}" "$LEVEL_ZERO_V1_SDK_PATH" -iname ze_loader.dll -print -quit 2>/dev/null || true)
-          if [ -n "$ZE_LOADER_DLL" ]; then
-            echo "Using Level Zero loader: $ZE_LOADER_DLL"
-            cp "$ZE_LOADER_DLL" ./build/bin
-          else
-            echo "Level Zero loader DLL not found in oneAPI or SDK; relying on system driver/runtime"
-          fi

          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/sycl8.dll" ./build/bin
          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/svml_dispmd.dll" ./build/bin
@@ -768,102 +670,7 @@ jobs:
          path: llama-bin-win-sycl-x64.zip
          name: llama-bin-win-sycl-x64.zip

-  ubuntu-24-sycl:
-
-    strategy:
-      matrix:
-        build: [fp32, fp16]
-        include:
-          - build: fp32
-            fp16: OFF
-          - build: fp16
-            fp16: ON
-
-    runs-on: ubuntu-24.04
-
-    env:
-      ONEAPI_ROOT: /opt/intel/oneapi/
-      ONEAPI_INSTALLER_VERSION: "2025.3.3"
-      LEVEL_ZERO_VERSION: "1.28.2"
-      LEVEL_ZERO_UBUNTU_VERSION: "u24.04"
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v6
-        with:
-          fetch-depth: 0
-
-      - name: Use oneAPI Installation Cache
-        uses: actions/cache@v5
-        id: cache-sycl
-        with:
-          path: ${{ env.ONEAPI_ROOT }}
-          key: oneAPI-${{ env.ONEAPI_INSTALLER_VERSION }}-${{ runner.os }}
-
-      - name: Download & Install oneAPI
-        shell: bash
-        if: steps.cache-sycl.outputs.cache-hit != 'true'
-        run: |
-          cd /tmp
-          wget https://registrationcenter-download.intel.com/akdlm/IRC_NAS/56f7923a-adb8-43f3-8b02-2b60fcac8cab/intel-deep-learning-essentials-2025.3.3.16_offline.sh -O intel-deep-learning-essentials_offline.sh
-          sudo bash intel-deep-learning-essentials_offline.sh -s -a --silent --eula accept
-
-      - name: Install Level Zero SDK
-        shell: bash
-        run: |
-          cd /tmp
-          wget -q "https://github.com/oneapi-src/level-zero/releases/download/v${LEVEL_ZERO_VERSION}/level-zero_${LEVEL_ZERO_VERSION}%2B${LEVEL_ZERO_UBUNTU_VERSION}_amd64.deb" -O level-zero.deb
-          wget -q "https://github.com/oneapi-src/level-zero/releases/download/v${LEVEL_ZERO_VERSION}/level-zero-devel_${LEVEL_ZERO_VERSION}%2B${LEVEL_ZERO_UBUNTU_VERSION}_amd64.deb" -O level-zero-devel.deb
-          sudo apt-get install -y ./level-zero.deb ./level-zero-devel.deb
-
-      - name: Setup Node.js
-        uses: actions/setup-node@v6
-        with:
-          node-version: "24"
-          cache: "npm"
-          cache-dependency-path: "tools/ui/package-lock.json"
-
-      - name: ccache
-        uses: ggml-org/ccache-action@v1.2.21
-        with:
-          key: ubuntu-24-sycl-${{ matrix.build }}
-          evict-old-files: 1d
-          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
-
-      - name: Build
-        id: cmake_build
-        run: |
-          source /opt/intel/oneapi/setvars.sh
-          cmake -B build \
-            -G "Ninja" \
-            -DCMAKE_BUILD_TYPE=Release \
-            -DGGML_SYCL=ON \
-            -DCMAKE_C_COMPILER=icx \
-            -DCMAKE_CXX_COMPILER=icpx \
-            -DLLAMA_OPENSSL=OFF \
-            -DGGML_NATIVE=OFF \
-            -DGGML_SYCL_F16=${{ matrix.fp16 }}
-          time cmake --build build --config Release -j $(nproc)
-
-      - name: Determine tag name
-        id: tag
-        uses: ./.github/actions/get-tag-name
-
-      - name: Pack artifacts
-        id: pack_artifacts
-        run: |
-          cp LICENSE ./build/bin/
-          tar -czvf llama-${{ steps.tag.outputs.name }}-bin-ubuntu-sycl-${{ matrix.build }}-x64.tar.gz --transform "s,^\.,llama-${{ steps.tag.outputs.name }}," -C ./build/bin .
-
-      - name: Upload artifacts
-        uses: actions/upload-artifact@v6
-        with:
-          path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-sycl-${{ matrix.build }}-x64.tar.gz
-          name: llama-bin-ubuntu-sycl-${{ matrix.build }}-x64.tar.gz
-
  ubuntu-22-rocm:
-
    runs-on: ubuntu-22.04

    strategy:
@@ -880,13 +687,6 @@ jobs:
        with:
          fetch-depth: 0

-      - name: Setup Node.js
-        uses: actions/setup-node@v6
-        with:
-          node-version: "24"
-          cache: "npm"
-          cache-dependency-path: "tools/ui/package-lock.json"
-
      - name: Free up disk space
        uses: ggml-org/free-disk-space@v1.3.1
        with:
@@ -965,7 +765,7 @@ jobs:
        id: pack_artifacts
        run: |
          cp LICENSE ./build/bin/
-          tar -czvf llama-${{ steps.tag.outputs.name }}-bin-ubuntu-rocm-${{ env.ROCM_VERSION_SHORT }}-${{ matrix.build }}.tar.gz --transform "s,^\.,llama-${{ steps.tag.outputs.name }}," -C ./build/bin .
+          tar -czvf llama-${{ steps.tag.outputs.name }}-bin-ubuntu-rocm-${{ env.ROCM_VERSION_SHORT }}-${{ matrix.build }}.tar.gz --transform "s,./,llama-${{ steps.tag.outputs.name }}/," -C ./build/bin .

      - name: Upload artifacts
        uses: actions/upload-artifact@v6
@@ -974,7 +774,6 @@ jobs:
          name: llama-bin-ubuntu-rocm-${{ env.ROCM_VERSION_SHORT }}-${{ matrix.build }}.tar.gz

  windows-hip:
-
    runs-on: windows-2022

    env:
@@ -991,13 +790,6 @@ jobs:
        id: checkout
        uses: actions/checkout@v6

-      - name: Setup Node.js
-        uses: actions/setup-node@v6
-        with:
-          node-version: "24"
-          cache: "npm"
-          cache-dependency-path: "tools/ui/package-lock.json"
-
      - name: Grab rocWMMA package
        id: grab_rocwmma
        run: |
@@ -1225,7 +1017,7 @@ jobs:
      - name: Pack artifacts
        run: |
          cp LICENSE ./build/bin/
-          tar -czvf llama-${{ steps.tag.outputs.name }}-bin-${{ matrix.chip_type }}-openEuler-${{ matrix.arch }}${{ matrix.use_acl_graph == 'on' && '-aclgraph' || '' }}.tar.gz --transform "s,^\.,llama-${{ steps.tag.outputs.name }}," -C ./build/bin .
+          tar -czvf llama-${{ steps.tag.outputs.name }}-bin-${{ matrix.chip_type }}-openEuler-${{ matrix.arch }}${{ matrix.use_acl_graph == 'on' && '-aclgraph' || '' }}.tar.gz --transform "s,./,llama-${{ steps.tag.outputs.name }}/," -C ./build/bin .

      - name: Upload artifacts
        uses: actions/upload-artifact@v6
@@ -1253,15 +1045,11 @@ jobs:
      - ubuntu-cpu
      - ubuntu-vulkan
      - ubuntu-24-openvino
-      - ubuntu-24-sycl
      - android-arm64
      - macOS-cpu
      - ios-xcode-build
      - openEuler-cann

-    outputs:
-      tag_name: ${{ steps.tag.outputs.name }}
-
    steps:
      - name: Clone
        id: checkout
@@ -1345,8 +1133,6 @@ jobs:
            - [Ubuntu arm64 (Vulkan)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-arm64.tar.gz)
            - [Ubuntu x64 (ROCm 7.2)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-rocm-7.2-x64.tar.gz)
            - [Ubuntu x64 (OpenVINO)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-openvino-${{ needs.ubuntu-24-openvino.outputs.openvino_version }}-x64.tar.gz)
-            - [Ubuntu x64 (SYCL FP32)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-sycl-fp32-x64.tar.gz)
-            - [Ubuntu x64 (SYCL FP16)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-sycl-fp16-x64.tar.gz)

            **Android:**
            - [Android arm64 (CPU)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-android-arm64.tar.gz)
@@ -1387,15 +1173,3 @@ jobs:
                });
              }
            }
-
-  ui-publish:
-    if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
-
-    needs:
-      - release
-
-    uses: ./.github/workflows/ui-publish.yml
-    with:
-      version_tag: ${{ needs.release.outputs.tag_name }}
-    secrets:
-      hf_token: ${{ secrets.HF_TOKEN_UI_STATIC_OUTPUT }}
--- a/.github/workflows/server-sanitize.yml
+++ b/.github/workflows/server-sanitize.yml
@@ -67,13 +67,6 @@ jobs:
          fetch-depth: 0
          ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}

-      - name: Setup Node.js
-        uses: actions/setup-node@v6
-        with:
-          node-version: "24"
-          cache: "npm"
-          cache-dependency-path: "tools/ui/package-lock.json"
-
      - name: Build
        id: cmake_build
        run: |
--- a/.github/workflows/server-self-hosted.yml
+++ b/.github/workflows/server-self-hosted.yml
@@ -67,13 +67,6 @@ jobs:
          fetch-depth: 0
          ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}

-      - name: Setup Node.js
-        uses: actions/setup-node@v6
-        with:
-          node-version: "24"
-          cache: "npm"
-          cache-dependency-path: "tools/ui/package-lock.json"
-
      - name: Build
        id: cmake_build
        run: |
@@ -130,68 +123,3 @@ jobs:
  #          pip install -r requirements.txt
  #          export ${{ matrix.extra_args }}
  #          pytest -v -x -m "not slow"
-
-  server-kleidiai:
-    runs-on: ah-ubuntu_22_04-c8g_8x
-
-    name: server-kleidiai (${{ matrix.wf_name }})
-    strategy:
-      matrix:
-        include:
-          - build_type: Release
-            extra_build_flags: "-DGGML_CPU_KLEIDIAI=ON"
-            extra_args: ""
-            wf_name:    "CPUx1, kleidiai"
-      fail-fast: false
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v6
-        with:
-          fetch-depth: 0
-          ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
-
-      - name: Dependencies
-        id: depends
-        run: |
-          set -euxo pipefail
-          sudo apt-get update
-          sudo DEBIAN_FRONTEND=noninteractive NEEDRESTART_MODE=a \
-          apt-get install -y \
-           build-essential \
-           libssl-dev \
-           python3-venv \
-           gpg \
-           wget \
-           time \
-           git-lfs
-
-          git lfs install
-
-          # install the latest cmake
-          sudo install -d /usr/share/keyrings
-          wget -O - https://apt.kitware.com/keys/kitware-archive-latest.asc \
-           | gpg --dearmor \
-           | sudo tee /usr/share/keyrings/kitware-archive-keyring.gpg >/dev/null
-          echo 'deb [signed-by=/usr/share/keyrings/kitware-archive-keyring.gpg] https://apt.kitware.com/ubuntu/ jammy main' \
-           | sudo tee /etc/apt/sources.list.d/kitware.list
-          sudo apt-get update
-          sudo apt-get install -y cmake
-
-      - name: Build
-        id: cmake_build
-        run: |
-          cmake -B build -DGGML_SCHED_NO_REALLOC=ON ${{ matrix.extra_build_flags }}
-          cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server
-
-      - name: Tests
-        id: server_integration_tests
-        if: ${{ (!matrix.disabled_on_pr || !github.event.pull_request) }}
-        run: |
-          cd tools/server/tests
-          python3 -m venv venv
-          source venv/bin/activate
-          pip install -r requirements.txt
-          export ${{ matrix.extra_args }}
-          pytest -v -x -m "not slow"
--- a/.github/workflows/server-webui.yml
+++ b/.github/workflows/server-webui.yml
@@ -1,7 +1,7 @@
-name: CI (UI)
+name: Server WebUI

 on:
-  workflow_dispatch:
+  workflow_dispatch: # allows manual triggering
    inputs:
      sha:
        description: 'Commit SHA1 to build'
@@ -11,16 +11,18 @@ on:
    branches:
      - master
    paths: [
-      '.github/workflows/ui-ci.yml',
-      'tools/ui/**.*',
-      'tools/server/tests/**.*'
+      '.github/workflows/server-webui.yml',
+      'tools/server/webui/**.*',
+      'tools/server/tests/**.*',
+      'tools/server/public/**'
    ]
  pull_request:
    types: [opened, synchronize, reopened]
    paths: [
-      '.github/workflows/ui-ci.yml',
-      'tools/ui/**.*',
-      'tools/server/tests/**.*'
+      '.github/workflows/server-webui.yml',
+      'tools/server/webui/**.*',
+      'tools/server/tests/**.*',
+      'tools/server/public/**'
    ]

 env:
@@ -34,14 +36,9 @@ concurrency:
  cancel-in-progress: true

 jobs:
-  ui-build:
-    name: Build static output
-    uses: ./.github/workflows/ui-build.yml
-
-  ui-checks:
-    name: UI Checks
-    needs: ui-build
-    runs-on: ubuntu-latest
+  webui-check:
+    name: WebUI Checks
+    runs-on: ${{ 'ubuntu-24.04-arm' || 'ubuntu-24.04' }}
    continue-on-error: true
    steps:
      - name: Checkout code
@@ -54,89 +51,58 @@ jobs:
        id: node
        uses: actions/setup-node@v6
        with:
-          node-version: "24"
+          node-version: "22"
          cache: "npm"
-          cache-dependency-path: "tools/ui/package-lock.json"
+          cache-dependency-path: "tools/server/webui/package-lock.json"

      - name: Install dependencies
        id: setup
        if: ${{ steps.node.conclusion == 'success' }}
        run: npm ci
-        working-directory: tools/ui
+        working-directory: tools/server/webui

      - name: Run type checking
        if: ${{ always() && steps.setup.conclusion == 'success' }}
        run: npm run check
-        working-directory: tools/ui
+        working-directory: tools/server/webui

      - name: Run linting
        if: ${{ always() && steps.setup.conclusion == 'success' }}
        run: npm run lint
-        working-directory: tools/ui
-
-      - name: Install Playwright browsers
-        id: playwright
-        if: ${{ always() && steps.setup.conclusion == 'success' }}
-        run: npx playwright install --with-deps
-        working-directory: tools/ui
-
-      - name: Run Client tests
-        if: ${{ always() && steps.playwright.conclusion == 'success' }}
-        run: npm run test:client
-        working-directory: tools/ui
-
-      - name: Run Unit tests
-        if: ${{ always() && steps.playwright.conclusion == 'success' }}
-        run: npm run test:unit
-        working-directory: tools/ui
-
-  e2e-tests:
-    name: E2E Tests
-    needs: ui-build
-    runs-on: ubuntu-latest
-    steps:
-      - name: Checkout code
-        uses: actions/checkout@v6
-        with:
-          fetch-depth: 0
-          ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
-
-      - name: Setup Node.js
-        id: node
-        uses: actions/setup-node@v6
-        with:
-          node-version: "24"
-          cache: "npm"
-          cache-dependency-path: "tools/ui/package-lock.json"
-
-      - name: Install dependencies
-        id: setup
-        if: ${{ steps.node.conclusion == 'success' }}
-        run: npm ci
-        working-directory: tools/ui
+        working-directory: tools/server/webui

      - name: Build application
        if: ${{ always() && steps.setup.conclusion == 'success' }}
        run: npm run build
-        working-directory: tools/ui
+        working-directory: tools/server/webui

      - name: Install Playwright browsers
        id: playwright
        if: ${{ always() && steps.setup.conclusion == 'success' }}
        run: npx playwright install --with-deps
-        working-directory: tools/ui
+        working-directory: tools/server/webui

      - name: Build Storybook
        if: ${{ always() && steps.playwright.conclusion == 'success' }}
        run: npm run build-storybook
-        working-directory: tools/ui
+        working-directory: tools/server/webui
+
+      - name: Run Client tests
+        if: ${{ always() && steps.playwright.conclusion == 'success' }}
+        run: npm run test:client
+        working-directory: tools/server/webui
+
+      - name: Run Unit tests
+        if: ${{ always() && steps.playwright.conclusion == 'success' }}
+        run: npm run test:unit
+        working-directory: tools/server/webui

      - name: Run UI tests
        if: ${{ always() && steps.playwright.conclusion == 'success' }}
        run: npm run test:ui -- --testTimeout=60000
-        working-directory: tools/ui
+        working-directory: tools/server/webui

      - name: Run E2E tests
        if: ${{ always() && steps.playwright.conclusion == 'success' }}
        run: npm run test:e2e
-        working-directory: tools/ui
+        working-directory: tools/server/webui
--- a/.github/workflows/server.yml
+++ b/.github/workflows/server.yml
@@ -93,13 +93,6 @@ jobs:
          fetch-depth: 0
          ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}

-      - name: Setup Node.js
-        uses: actions/setup-node@v6
-        with:
-          node-version: "24"
-          cache: "npm"
-          cache-dependency-path: "tools/ui/package-lock.json"
-
      - name: Build
        id: cmake_build
        run: |
@@ -142,11 +135,6 @@ jobs:
          fetch-depth: 0
          ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}

-      - name: Setup Node.js
-        uses: actions/setup-node@v6
-        with:
-          node-version: "24"
-
      - name: Build
        id: cmake_build
        run: |
--- a/.github/workflows/ui-build.yml
+++ b/.github/workflows/ui-build.yml
@@ -1,44 +0,0 @@
-name: UI Build
-
-on:
-  workflow_call:
-
-jobs:
-  build:
-    name: Build static output
-    runs-on: ubuntu-slim
-    env:
-      BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
-
-    steps:
-      - name: Checkout code
-        uses: actions/checkout@v6
-
-      - name: Setup Node.js
-        uses: actions/setup-node@v6
-        with:
-          node-version: "24"
-          cache: "npm"
-          cache-dependency-path: "tools/ui/package-lock.json"
-
-      - name: Install dependencies
-        run: npm ci
-        working-directory: tools/ui
-
-      - name: Build application
-        run: npm run build
-        working-directory: tools/ui
-
-      - name: Generate checksums
-        run: |
-          cd build/tools/ui/dist
-          for f in *; do
-            sha256sum "$f" | awk '{print $1, $2}' >> checksums.txt
-          done
-
-      - name: Upload built UI
-        uses: actions/upload-artifact@v6
-        with:
-          name: ui-build
-          path: build/tools/ui/dist/
-          retention-days: 1
--- a/.github/workflows/ui-publish.yml
+++ b/.github/workflows/ui-publish.yml
@@ -1,70 +0,0 @@
-name: UI Publish
-
-on:
-  workflow_call:
-    inputs:
-      version_tag:
-        description: 'Version tag to publish under (e.g., b1234)'
-        required: true
-        type: string
-    secrets:
-      hf_token:
-        description: 'Hugging Face token with write access'
-        required: true
-
-jobs:
-  build:
-    name: Build static output
-    uses: ./.github/workflows/ui-build.yml
-
-  publish:
-    name: Publish UI Static Output
-    needs: build
-    runs-on: ubuntu-24.04-arm
-
-    permissions:
-      contents: read
-
-    env:
-      HF_BUCKET_NAME: ${{ vars.HF_BUCKET_UI_STATIC_OUTPUT }}
-
-    steps:
-      - name: Checkout code
-        uses: actions/checkout@v6
-        with:
-          fetch-depth: 1
-
-      - name: Download UI build artifact
-        uses: actions/download-artifact@v7
-        with:
-          name: ui-build
-          path: build/tools/ui/dist/
-
-      - name: Install Hugging Face Hub CLI
-        run: pip install -U huggingface_hub
-
-      - name: Authenticate with Hugging Face
-        run: hf auth login --token ${{ secrets.hf_token }}
-
-      - name: Sync built files to Hugging Face bucket (version tag)
-        run: |
-          # Upload the built files to the Hugging Face bucket under the release version
-          hf buckets sync build/tools/ui/dist hf://buckets/ggml-org/${{ env.HF_BUCKET_NAME }}/${{ inputs.version_tag }} --delete --quiet
-
-      - name: Sync built files to Hugging Face bucket (latest)
-        run: |
-          # Also upload to the 'latest' directory for fallback downloads
-          hf buckets sync build/tools/ui/dist hf://buckets/ggml-org/${{ env.HF_BUCKET_NAME }}/latest --delete --quiet
-
-      - name: Verify upload
-        run: |
-          # List the files in the bucket to verify the upload
-          hf buckets list hf://buckets/ggml-org/${{ env.HF_BUCKET_NAME }}/${{ inputs.version_tag }} -R -h
-
-      - name: Clean up root-level files
-        run: |
-          # Clean up any old root-level files from previous non-versioned deployments
-          hf buckets rm ggml-org/${{ env.HF_BUCKET_NAME }}/index.html --yes 2>/dev/null || true
-          hf buckets rm ggml-org/${{ env.HF_BUCKET_NAME }}/bundle.js --yes 2>/dev/null || true
-          hf buckets rm ggml-org/${{ env.HF_BUCKET_NAME }}/bundle.css --yes 2>/dev/null || true
-          hf buckets rm ggml-org/${{ env.HF_BUCKET_NAME }}/loading.html --yes 2>/dev/null || true
--- a/.gitignore
+++ b/.gitignore
@@ -34,6 +34,7 @@
 /.vscode/
 /nppBackup

+
 # Coverage

 /gcovr-report/
@@ -73,7 +74,6 @@
 !/models/templates

 # Zig
-
 /zig-out/
 /zig-cache/

@@ -92,12 +92,11 @@
 !/examples/sycl/*.bat
 !/examples/sycl/*.sh

-# Server Web UI temporary files (+ legacy directory)
-
+# Server Web UI temporary files
 /tools/server/webui/node_modules
 /tools/server/webui/dist
-/tools/ui/node_modules
-/tools/ui/dist
+# we no longer use gz for index.html
+/tools/server/public/index.html.gz

 # Python

@@ -105,16 +104,11 @@
 __pycache__/
 */poetry.lock
 poetry.toml
-poetry.lock
-uv.lock

 # Nix
-
-flake.lock
 /result

 # Test binaries
-
 /tests/test-backend-ops
 /tests/test-double-float
 /tests/test-grad0
@@ -130,7 +124,6 @@ flake.lock
 /tests/test-tokenizer-1-spm

 # Scripts
-
 !/scripts/install-oneapi.bat

 # Generated by scripts
@@ -139,24 +132,16 @@ flake.lock
 /wikitext-2-raw/

 # Test models for lora adapters
-
 /lora-tests

 # Local scripts
-
 /run-vim.sh
 /run-chat.sh
 /run-spec.sh
 /.ccache/

 # IDE
-
 /*.code-workspace
 /.windsurf/
 # emscripten
 a.out.*
-
-# AGENTS
-
-AGENTS.local.md
-.pi/SYSTEM.md
--- a/.pi/gg/SYSTEM.md
+++ b/.pi/gg/SYSTEM.md
@@ -1,36 +0,0 @@
-You are a coding agent. Here are some very important rules that you must follow:
-
-General:
- By very precise and concise when writing code, comments, explanations, etc.
- PR and commit titles format: `<module> : <title>`. Lookup recents for examples
- Don't try to build or run the code unless you are explicitly asked to do so
- Use the `gh` CLI tool when querying PRs, issues, or other GitHub resources
-
-Coding:
- When in doubt, always refer to the CONTRIBUTING.md file of the project
- When referencing issues or PRs in comments, use the format:
-  - C/C++ code: `// ref: <url>`
-  - Other (CMake, etc.): `# ref: <url>`
-
-Pull requests (PRs):
- New branch names are prefixed with "gg/"
- Before opening a pull request, ask the user to confirm the description
- When creating a pull request, look for the repository's PR template and follow it
- For the AI usage disclosure section, write "YES. llama.cpp + pi"
- Always create the pull requests in draft mode
-
-Commits:
- On every commit that you make, include a "Assisted-by: llama.cpp:local pi" tag
- Do not explicitly set the git author in commits - rely on the default git config
- Always use `--no-gpg-sign` when committing
- Never `git push` without explicit confirmation from the user
-
-Resources (read on demand):
- [CONTRIBUTING.md](CONTRIBUTING.md)
- [Build documentation](docs/build.md)
- [Server usage documentation](tools/server/README.md)
- [Server development documentation](tools/server/README-dev.md)
- [PEG parser](docs/development/parsing.md)
- [Auto parser](docs/autoparser.md)
- [Jinja engine](common/jinja/README.md)
- [PR template](.github/pull_request_template.md)
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -104,24 +104,13 @@ option(LLAMA_SANITIZE_UNDEFINED "llama: enable undefined sanitizer" OFF)
 option(LLAMA_BUILD_COMMON "llama: build common utils library" ${LLAMA_STANDALONE})

 # extra artifacts
-option(LLAMA_BUILD_TESTS            "llama: build tests"                                                                            ${LLAMA_STANDALONE})
-option(LLAMA_BUILD_TOOLS            "llama: build tools"                                                                            ${LLAMA_STANDALONE})
-option(LLAMA_BUILD_EXAMPLES         "llama: build examples"                                                                         ${LLAMA_STANDALONE})
-option(LLAMA_BUILD_SERVER           "llama: build server example"                                                                   ${LLAMA_STANDALONE})
-option(LLAMA_BUILD_UI                "llama: build the embedded Web UI for server"                                                   ON)
-option(LLAMA_USE_PREBUILT_UI         "llama: use prebuilt UI from HF Bucket when available (requires LLAMA_BUILD_UI=ON)"             ON)
-
-# Backward compat: when old var is set but new one isn't, forward the value
-if(DEFINED LLAMA_BUILD_WEBUI)
-    set(LLAMA_BUILD_UI ${LLAMA_BUILD_WEBUI})
-    message(DEPRECATION "LLAMA_BUILD_WEBUI is deprecated, use LLAMA_BUILD_UI instead")
-endif()
-if(DEFINED LLAMA_USE_PREBUILT_WEBUI)
-    set(LLAMA_USE_PREBUILT_UI ${LLAMA_USE_PREBUILT_WEBUI})
-    message(DEPRECATION "LLAMA_USE_PREBUILT_WEBUI is deprecated, use LLAMA_USE_PREBUILT_UI instead")
-endif()
-option(LLAMA_TOOLS_INSTALL          "llama: install tools"                                                                          ${LLAMA_TOOLS_INSTALL_DEFAULT})
-option(LLAMA_TESTS_INSTALL          "llama: install tests"                                                                          ON)
+option(LLAMA_BUILD_TESTS    "llama: build tests"          ${LLAMA_STANDALONE})
+option(LLAMA_BUILD_TOOLS    "llama: build tools"          ${LLAMA_STANDALONE})
+option(LLAMA_BUILD_EXAMPLES "llama: build examples"       ${LLAMA_STANDALONE})
+option(LLAMA_BUILD_SERVER   "llama: build server example" ${LLAMA_STANDALONE})
+option(LLAMA_BUILD_WEBUI    "llama: build the embedded Web UI for server"  ON)
+option(LLAMA_TOOLS_INSTALL  "llama: install tools"        ${LLAMA_TOOLS_INSTALL_DEFAULT})
+option(LLAMA_TESTS_INSTALL  "llama: install tests"        ON)

 # 3rd party libs
 option(LLAMA_OPENSSL    "llama: use openssl to support HTTPS" ON)
@@ -281,6 +270,18 @@ install(FILES ${CMAKE_CURRENT_BINARY_DIR}/llama-config.cmake
              ${CMAKE_CURRENT_BINARY_DIR}/llama-version.cmake
        DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/llama)

+install(
+    FILES convert_hf_to_gguf.py
+    PERMISSIONS
+        OWNER_READ
+        OWNER_WRITE
+        OWNER_EXECUTE
+        GROUP_READ
+        GROUP_EXECUTE
+        WORLD_READ
+        WORLD_EXECUTE
+    DESTINATION ${CMAKE_INSTALL_BINDIR})
+
 configure_file(cmake/llama.pc.in
        "${CMAKE_CURRENT_BINARY_DIR}/llama.pc"
        @ONLY)
--- a/16
+++ b/16
@@ -15,7 +15,7 @@
 # ggml-org/llama-common     : ggerganov, aldehir, angt, danbev, ngxson, pwilkin
 # ggml-org/llama-mtmd       : ngxson
 # ggml-org/llama-server     : ggerganov, ngxson, allozaur, angt, ServeurpersoCom
-# ggml-org/llama-ui           : allozaur
+# ggml-org/llama-webui      : allozaur

 /.devops/*.Dockerfile                   @ngxson
 /.github/actions/                       @ggml-org/ci
@@ -23,10 +23,8 @@
 /ci/                                    @ggerganov
 /cmake/                                 @ggerganov
 /common/                                @ggml-org/llama-common
-/common/fit.*                           @JohannesGaessler
 /common/jinja/                          @CISC
 /common/ngram-map.*                     @srogmann
-/conversion/                            @CISC
 /convert_*.py                           @CISC
 /docs/backend/snapdragon/               @ggml-org/ggml-hexagon
 /examples/batched.swift/                @ggerganov
@@ -54,30 +52,28 @@
 /examples/speculative/                  @ggerganov
 /ggml/cmake/                            @ggerganov
 /ggml/include/                          @ggerganov
-/ggml/src/ggml-backend-meta.cpp         @JohannesGaessler
 /ggml/src/ggml-cann/                    @ggml-org/ggml-cann
 /ggml/src/ggml-common.h                 @ggerganov
 /ggml/src/ggml-cpu/                     @ggerganov
 /ggml/src/ggml-cpu/spacemit/            @alex-spacemit
 /ggml/src/ggml-cuda/                    @ggml-org/ggml-cuda
-/ggml/src/ggml-cuda/vendors/hip.h       @IMbackK
 /ggml/src/ggml-cuda/fattn-wmma*         @IMbackK
-/ggml/src/ggml-hexagon/                 @ggml-org/ggml-hexagon
 /ggml/src/ggml-hip/                     @IMbackK
+/ggml/src/ggml-cuda/vendors/hip.h       @IMbackK
 /ggml/src/ggml-impl.h                   @ggerganov
 /ggml/src/ggml-metal/                   @ggml-org/ggml-metal
 /ggml/src/ggml-opencl/                  @ggml-org/ggml-opencl
-/ggml/src/ggml-openvino/                @cavusmustafa @wine99
+/ggml/src/ggml-hexagon/                 @ggml-org/ggml-hexagon
 /ggml/src/ggml-opt.cpp                  @JohannesGaessler
 /ggml/src/ggml-quants.*                 @ggerganov
 /ggml/src/ggml-rpc/                     @ggml-org/ggml-rpc
 /ggml/src/ggml-sycl/                    @ggml-org/ggml-sycl
 /ggml/src/ggml-threading.*              @ggerganov
-/ggml/src/ggml-virtgpu/                 @kpouget
 /ggml/src/ggml-vulkan/                  @ggml-org/ggml-vulkan
+/ggml/src/ggml-virtgpu/                 @kpouget
 /ggml/src/ggml-webgpu/                  @ggml-org/ggml-webgpu
 /ggml/src/ggml-zdnn/                    @ggml-org/ggml-zdnn @Andreas-Krebbel @AlekseiNikiforovIBM
-/ggml/src/ggml-zendnn/                  @avinashcpandey @Jiten1parmar @z-vishal
+/ggml/src/ggml-openvino/                @cavusmustafa @wine99
 /ggml/src/ggml.c                        @ggerganov
 /ggml/src/ggml.cpp                      @ggerganov
 /ggml/src/gguf.cpp                      @JohannesGaessler @Green-Sky
@@ -108,7 +104,7 @@
 /tools/rpc/                             @ggml-org/ggml-rpc
 /tools/server/*                         @ggml-org/llama-server # no subdir
 /tools/server/tests/                    @ggml-org/llama-server
-/tools/ui/                              @ggml-org/llama-ui
+/tools/server/webui/                    @ggml-org/llama-webui
 /tools/tokenize/                        @ggerganov
 /tools/tts/                             @ggerganov
 /vendor/                                @ggerganov
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -46,9 +46,7 @@ Before submitting your PR:
    - provide KL divergence data calculated vs. the FP16/BF16 (whichever is the native precision) version for both the new type as well as types of similar size
    - provide [performance data](https://github.com/ggml-org/llama.cpp/tree/master/tools/llama-bench) for the new type in comparison to types of similar size on pure CPU
 - Consider allowing write access to your branch for faster reviews, as reviewers can push commits directly
- If you are a new contributor
-    - Limit your open PRs to 1
-    - Do not submit trivial fixes (e.g. typos, formatting changes)
+- If you are a new contributor, limit your open PRs to 1.

 After submitting your PR:
 - Expect requests for modifications to ensure the code meets llama.cpp's standards for quality and long-term maintainability
--- a/README.md
+++ b/README.md
@@ -172,7 +172,6 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
 - JavaScript/Wasm (works in browser): [tangledgroup/llama-cpp-wasm](https://github.com/tangledgroup/llama-cpp-wasm)
 - Typescript/Wasm (nicer API, available on npm): [ngxson/wllama](https://github.com/ngxson/wllama)
 - Ruby: [yoshoku/llama_cpp.rb](https://github.com/yoshoku/llama_cpp.rb)
- Ruby: [docusealco/rllama](https://github.com/docusealco/rllama)
 - Rust (more features): [edgenai/llama_cpp-rs](https://github.com/edgenai/llama_cpp-rs)
 - Rust (nicer API): [mdrokz/rust-llama.cpp](https://github.com/mdrokz/rust-llama.cpp)
 - Rust (more direct bindings): [utilityai/llama-cpp-rs](https://github.com/utilityai/llama-cpp-rs)
@@ -280,7 +279,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
 | [Metal](docs/build.md#metal-build) | Apple Silicon |
 | [BLAS](docs/build.md#blas-build) | All |
 | [BLIS](docs/backend/BLIS.md) | All |
-| [SYCL](docs/backend/SYCL.md) | Intel GPU |
+| [SYCL](docs/backend/SYCL.md) | Intel and Nvidia GPU |
 | [OpenVINO [In Progress]](docs/backend/OPENVINO.md) | Intel CPUs, GPUs, and NPUs |
 | [MUSA](docs/build.md#musa) | Moore Threads GPU |
 | [CUDA](docs/build.md#cuda) | Nvidia GPU |
@@ -530,7 +529,6 @@ To learn more about model quantization, [read this documentation](tools/quantize
 - [How to build](docs/build.md)
 - [Running on Docker](docs/docker.md)
 - [Build on Android](docs/android.md)
- [Multi-GPU usage](docs/multi-gpu.md)
 - [Performance troubleshooting](docs/development/token_generation_performance_tips.md)
 - [GGML tips & tricks](https://github.com/ggml-org/llama.cpp/wiki/GGML-Tips-&-Tricks)

--- a/ci/run.sh
+++ b/ci/run.sh
@@ -117,12 +117,6 @@ if [ ! -z ${GG_BUILD_VULKAN} ]; then
    # if on Mac, disable METAL
    if [[ "$OSTYPE" == "darwin"* ]]; then
        CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_METAL=OFF -DGGML_BLAS=OFF"
-
-        MACOS_RUNNER_CUSTOM_VULKAN_CMAKE_LOCATION="/usr/local/lib/cmake/vulkan"
-        MACOS_RUNNER_CUSTOM_SPIRV_HEADERS_LOCATION="${MACOS_RUNNER_CUSTOM_VULKAN_CMAKE_LOCATION}/SPIRV-Headers/SPIRV-HeadersConfig.cmake"
-        if [[ -f "${MACOS_RUNNER_CUSTOM_SPIRV_HEADERS_LOCATION}" || -h "${MACOS_RUNNER_CUSTOM_SPIRV_HEADERS_LOCATION}" ]]; then
-            CMAKE_EXTRA="${CMAKE_EXTRA} -DSPIRV-Headers_DIR=${MACOS_RUNNER_CUSTOM_VULKAN_CMAKE_LOCATION}/SPIRV-Headers"
-        fi
    fi

    # Build shared libs on Windows
--- a/cmake/llama-config.cmake.in
+++ b/cmake/llama-config.cmake.in
@@ -7,7 +7,7 @@ set(LLAMA_SHARED_LIB   @BUILD_SHARED_LIBS@)

 set_and_check(LLAMA_INCLUDE_DIR "@PACKAGE_LLAMA_INCLUDE_INSTALL_DIR@")
 set_and_check(LLAMA_LIB_DIR     "@PACKAGE_LLAMA_LIB_INSTALL_DIR@")
-set(LLAMA_BIN_DIR "@PACKAGE_LLAMA_BIN_INSTALL_DIR@")
+set_and_check(LLAMA_BIN_DIR     "@PACKAGE_LLAMA_BIN_INSTALL_DIR@")

 find_package(ggml REQUIRED HINTS ${LLAMA_LIB_DIR}/cmake)

--- a/cmake/riscv64-spacemit-linux-gnu-gcc.cmake
+++ b/cmake/riscv64-spacemit-linux-gnu-gcc.cmake
@@ -24,6 +24,6 @@ set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER)
 set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY)
 set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY)
 set(CMAKE_FIND_ROOT_PATH_MODE_PACKAGE ONLY)
-set(CMAKE_C_FLAGS "-march=rv64gcv_zfh_zvfh_zba_zicbop -mabi=lp64d -fno-tree-vectorize -fno-tree-loop-vectorize ${CMAKE_C_FLAGS}")
-set(CMAKE_CXX_FLAGS "-march=rv64gcv_zfh_zvfh_zba_zicbop -mabi=lp64d -fno-tree-vectorize -fno-tree-loop-vectorize ${CMAKE_CXX_FLAGS}")
+set(CMAKE_C_FLAGS "-march=rv64gcv_zfh_zba_zicbop -mabi=lp64d ${CMAKE_C_FLAGS}")
+set(CMAKE_CXX_FLAGS "-march=rv64gcv_zfh_zba_zicbop -mabi=lp64d ${CXX_FLAGS}")
 set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -latomic")
--- a/common/CMakeLists.txt
+++ b/common/CMakeLists.txt
@@ -73,8 +73,6 @@ add_library(${TARGET}
    debug.h
    download.cpp
    download.h
-    fit.cpp
-    fit.h
    hf-cache.cpp
    hf-cache.h
    http.h
--- a/common/arg.cpp
+++ b/common/arg.cpp
--- a/common/arg.h
+++ b/common/arg.h
@@ -25,8 +25,7 @@ struct common_arg {
    const char * value_hint_2 = nullptr; // for second arg value
    const char * env          = nullptr;
    std::string help;
-    bool is_sampling = false; // is current arg a sampling param?
-    bool is_spec = false; // is current arg a speculative decoding param?
+    bool is_sparam = false; // is current arg a sampling param?
    bool is_preset_only = false; // is current arg preset-only (not treated as CLI arg)
    void (*handler_void)   (common_params & params) = nullptr;
    void (*handler_string) (common_params & params, const std::string &) = nullptr;
@@ -75,8 +74,7 @@ struct common_arg {
    common_arg & set_examples(std::initializer_list<enum llama_example> examples);
    common_arg & set_excludes(std::initializer_list<enum llama_example> excludes);
    common_arg & set_env(const char * env);
-    common_arg & set_sampling();
-    common_arg & set_spec();
+    common_arg & set_sparam();
    common_arg & set_preset_only();
    bool in_example(enum llama_example ex);
    bool is_exclude(enum llama_example ex);
@@ -129,8 +127,5 @@ bool common_params_to_map(int argc, char ** argv, llama_example ex, std::map<com
 // see: https://github.com/ggml-org/llama.cpp/issues/18163
 void common_params_add_preset_options(std::vector<common_arg> & args);

-// Populate model paths (main model, mmproj, etc) from -hf if necessary
-void common_params_handle_models(common_params & params, llama_example curr_ex);
-
 // initialize argument parser context - used by test-arg-parser and preset
 common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **) = nullptr);
--- a/common/chat-auto-parser-generator.cpp
+++ b/common/chat-auto-parser-generator.cpp
@@ -43,33 +43,11 @@ common_chat_params peg_generator::generate_parser(const common_chat_template &
                                                  const autoparser &              autoparser) {
    // Create the result structure
    common_chat_params data;
-    data.prompt            = common_chat_template_direct_apply(tmpl, inputs);
-    data.generation_prompt = common_chat_template_generation_prompt(tmpl, inputs);
-    data.format            = COMMON_CHAT_FORMAT_PEG_NATIVE;
-    data.preserved_tokens  = autoparser.preserved_tokens;
+    data.prompt           = common_chat_template_direct_apply(tmpl, inputs);
+    data.format           = COMMON_CHAT_FORMAT_PEG_NATIVE;
+    data.preserved_tokens = autoparser.preserved_tokens;

-    std::string parser_generation_prompt = data.generation_prompt;
-
-    if (inputs.continue_final_message != COMMON_CHAT_CONTINUATION_NONE && !inputs.continue_msg.empty()) {
-        // Build up generation prompt manually
-        const auto & msg = inputs.continue_msg;
-
-        if (!autoparser.reasoning.start.empty()) {
-            data.generation_prompt = data.generation_prompt.substr(0, data.generation_prompt.find(autoparser.reasoning.start));
-            data.generation_prompt += autoparser.reasoning.start + msg.reasoning_content;
-            if (inputs.continue_final_message == COMMON_CHAT_CONTINUATION_CONTENT) {
-                data.generation_prompt += autoparser.reasoning.end;
-            }
-        }
-
-        if (inputs.continue_final_message == COMMON_CHAT_CONTINUATION_CONTENT) {
-            data.generation_prompt += msg.render_content();
-        }
-
-        data.prompt += data.generation_prompt;
-    }
-
-    auto parser = autoparser.build_parser(inputs, parser_generation_prompt);
+    auto parser = autoparser.build_parser(inputs);
    data.parser = parser.save();

    // Build grammar if tools are present
@@ -109,7 +87,7 @@ common_chat_params peg_generator::generate_parser(const common_chat_template &
    return data;
 }

-common_peg_arena autoparser::build_parser(const generation_params & inputs, const std::string & generation_prompt) const {
+common_peg_arena autoparser::build_parser(const generation_params & inputs) const {
    if (!analysis_complete) {
        throw std::invalid_argument("Cannot call build_parser on autoparser without performing analysis first, call analyze_template(...)");
    }
@@ -143,7 +121,7 @@ common_peg_arena autoparser::build_parser(const generation_params & inputs, cons
        } else {
            parser = content.build_parser(ctx);
        }
-        return pure_content ? p.prefix(generation_prompt, reasoning.start) + parser : p.prefix(generation_prompt, reasoning.start) << parser;
+        return pure_content ? p.prefix(inputs.generation_prompt, reasoning.start) + parser : p.prefix(inputs.generation_prompt, reasoning.start) << parser;
    });
 }

@@ -158,10 +136,10 @@ common_peg_parser analyze_reasoning::build_parser(parser_build_context & ctx) co
        if (!end.empty()) {
            if (!start.empty()) {
                // Standard tag-based: optional(<think>reasoning</think>)
-                return p.optional(p.optspace(start) + p.reasoning(p.until(trim_whitespace(end))) + p.optspace(end));
+                return p.optional(start + p.reasoning(p.until(end)) + end + p.space());
            }
            // Delimiter-style (empty start)
-            return p.optional(p.reasoning(p.until(trim_whitespace(end))) + p.optspace(end));
+            return p.optional(p.reasoning(p.until(end)) + end + p.space());
        }
    }

@@ -208,6 +186,7 @@ common_peg_parser analyze_tools::build_parser(parser_build_context & ctx) const
 common_peg_parser analyze_tools::build_tool_parser_json_native(parser_build_context & ctx) const {
    auto &       p           = ctx.p;
    const auto & inputs      = ctx.inputs;
+    bool         force_tools = inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_REQUIRED;

    // Build effective field names with dot notation if function_field is set
    std::string name_field = format.name_field;
@@ -246,7 +225,8 @@ common_peg_parser analyze_tools::build_tool_parser_json_native(parser_build_cont
        tool_start = format.per_call_start;
    }

-    return ctx.reasoning_parser + p.optional(p.content(p.until(tool_start))) + tools_parser + p.end();
+    return ctx.reasoning_parser + (force_tools ? p.eps() : p.optional(p.content(p.until(tool_start)))) + tools_parser +
+           p.end();
 }

 common_peg_parser analyze_tools::build_func_parser(common_chat_peg_builder & p, const std::string & name,
@@ -290,6 +270,7 @@ common_peg_parser analyze_tools::build_func_parser(common_chat_peg_builder & p,
 common_peg_parser analyze_tools::build_tool_parser_tag_json(parser_build_context & ctx) const {
    auto &       p           = ctx.p;
    const auto & inputs      = ctx.inputs;
+    bool         force_tools = inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_REQUIRED;

    common_peg_parser tool_choice = p.choice();

@@ -355,12 +336,14 @@ common_peg_parser analyze_tools::build_tool_parser_tag_json(parser_build_context

    std::string trigger_marker       = !format.section_start.empty() ? format.section_start : format.per_call_start;
    auto        content_before_tools = trigger_marker.empty() ? p.eps() : p.until(trigger_marker);
-    return ctx.reasoning_parser + p.optional(p.content(content_before_tools)) + tool_calls + p.end();
+    return ctx.reasoning_parser + (force_tools ? p.eps() : p.optional(p.content(content_before_tools))) + tool_calls +
+           p.end();
 }

 common_peg_parser analyze_tools::build_tool_parser_tag_tagged(parser_build_context & ctx) const {
    auto &       p           = ctx.p;
    const auto & inputs      = ctx.inputs;
+    bool         force_tools = inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_REQUIRED;

    auto until_suffix = p.rule("until-suffix", p.until(arguments.value_suffix));

@@ -391,7 +374,9 @@ common_peg_parser analyze_tools::build_tool_parser_tag_tagged(parser_build_conte
                                           arguments.name_suffix) +
                           arguments.value_prefix +
                           (schema_info.resolves_to_string(param_schema) ?
-                                p.tool_arg_string_value(until_suffix) :
+                                p.tool_arg_string_value(p.schema(until_suffix,
+                                                                 "tool-" + name + "-arg-" + param_name + "-schema",
+                                                                 param_schema, true)) :
                                p.tool_arg_json_value(p.schema(
                                    p.json(), "tool-" + name + "-arg-" + param_name + "-schema", param_schema, false)) +
                                    p.space()) +
@@ -486,7 +471,8 @@ common_peg_parser analyze_tools::build_tool_parser_tag_tagged(parser_build_conte

    std::string trigger_marker       = !format.section_start.empty() ? format.section_start : format.per_call_start;
    auto        content_before_tools = trigger_marker.empty() ? p.eps() : p.until(trigger_marker);
-    return ctx.reasoning_parser + p.optional(p.content(content_before_tools)) + tool_calls + p.end();
+    return ctx.reasoning_parser + (force_tools ? p.eps() : p.optional(p.content(content_before_tools))) + tool_calls +
+           p.end();
 }

 }  // namespace autoparser
--- a/common/chat-auto-parser.h
+++ b/common/chat-auto-parser.h
@@ -60,21 +60,16 @@ struct generation_params {
    common_reasoning_format               reasoning_format    = COMMON_REASONING_FORMAT_AUTO;
    bool                                  stream              = true;
    std::string                           grammar;
-    bool                                  add_generation_prompt  = false;
-    common_chat_continuation              continue_final_message = COMMON_CHAT_CONTINUATION_NONE;
-    common_chat_msg                       continue_msg;
-    bool                                  enable_thinking        = true;
-    std::chrono::system_clock::time_point now                    = std::chrono::system_clock::now();
+    bool                                  add_generation_prompt = false;
+    bool                                  enable_thinking       = true;
+    std::chrono::system_clock::time_point now                   = std::chrono::system_clock::now();
+    std::string                           generation_prompt;
    json                                  extra_context;
    bool                                  add_bos       = false;
    bool                                  add_eos       = false;
    bool                                  is_inference  = true;
    bool                                  add_inference = false;
    bool                                  mark_input    = true;  // whether to mark input strings in the jinja context
-
-    bool has_continuation() const {
-        return continue_final_message != COMMON_CHAT_CONTINUATION_NONE && !continue_msg.empty();
-    }
 };

 // ============================================================================
@@ -391,7 +386,7 @@ struct autoparser {
    void analyze_template(const common_chat_template & tmpl);

    // Build the PEG parser for this template
-    common_peg_arena build_parser(const generation_params & inputs, const std::string & generation_prompt) const;
+    common_peg_arena build_parser(const generation_params & inputs) const;

  private:
    // Collect tokens from entire analysis to preserve
--- a/common/chat-diff-analyzer.cpp
+++ b/common/chat-diff-analyzer.cpp
@@ -296,7 +296,7 @@ void analyze_reasoning::compare_reasoning_presence() {
            return p.literal(reasoning_content) + p.space() + p.optional(p.tag("post", (p.marker() + p.space())) + p.rest());
        });
        auto parser_wrapped = build_tagged_peg_parser([&](common_peg_parser_builder &p) {
-            return p.tag("pre", p.marker() + p.space()) + p.literal(reasoning_content) + p.tag("post", (p.space() + p.marker() + p.space())) + p.rest();
+            return p.tag("pre", p.marker() + p.space()) + p.literal(reasoning_content) + p.space() + p.tag("post", (p.marker() + p.space())) + p.rest();
        });
        // try the more aggressive parse first, if it fails, fall back to the delimiter one
        auto result = parser_wrapped.parse_anywhere_and_extract(comparison->output_B);
@@ -306,11 +306,11 @@ void analyze_reasoning::compare_reasoning_presence() {
        if (result.result.success()) {
            if (!result.tags["pre"].empty() && !result.tags["post"].empty()) {
                mode = reasoning_mode::TAG_BASED;
-                start = result.tags["pre"];
-                end   = result.tags["post"];
+                start = trim_leading_whitespace(result.tags["pre"]);
+                end   = trim_trailing_whitespace(result.tags["post"]);
            } else if (!result.tags["post"].empty()) {
                mode = reasoning_mode::TAG_BASED;
-                end = result.tags["post"];
+                end = trim_trailing_whitespace(result.tags["post"]);
            }
        }
    }
@@ -342,7 +342,7 @@ void analyze_reasoning::compare_thinking_enabled() {
    if (left_trimmed.empty() && !diff.right.empty()) {
        if (!right_trimmed.empty() && string_ends_with(comparison->output_B, right_trimmed)) {
            if (start.empty()) {
-                start = diff.right;
+                start = trim_leading_whitespace(diff.right);
                mode  = reasoning_mode::TAG_BASED;
            }
        }
@@ -353,7 +353,7 @@ void analyze_reasoning::compare_thinking_enabled() {
                if (seg.size() >= 2 && seg[seg.size() - 1].value == left_trimmed && seg[seg.size() - 2].type == segment_type::MARKER) {
                    start = seg[seg.size() - 2].value;
                }
-                end = diff.left;
+                end = trim_trailing_whitespace(diff.left);
                mode = reasoning_mode::TAG_BASED;
            }
        }
@@ -445,14 +445,14 @@ void analyze_reasoning::compare_reasoning_scope() {
        auto result = parser_wrapped.parse_anywhere_and_extract(comparison->output_B);
        if (result.result.success()) {
            start = result.tags["pre"];
-            end = result.tags["post"];
+            end = trim_trailing_whitespace(result.tags["post"]);
        } else {
            auto parser_delimiter = build_tagged_peg_parser([&](common_peg_parser_builder &p) {
                return p.literal(reasoning_content) + p.space() + p.optional(p.tag("post", (p.marker() + p.space())));
            });
            result = parser_delimiter.parse_anywhere_and_extract(comparison->output_B);
            if (result.result.success()) {
-                end = result.tags["post"];
+                end = trim_trailing_whitespace(result.tags["post"]);
            } else {
                LOG_DBG(ANSI_ORANGE "%s: Unable to extract reasoning markers, falling back to reasoning = NONE\n" ANSI_RESET, __func__);
                mode = reasoning_mode::NONE;
--- a/common/chat-peg-parser.cpp
+++ b/common/chat-peg-parser.cpp
@@ -358,7 +358,35 @@ void common_chat_peg_mapper::map(const common_peg_ast_node & node) {
            if (is_potential_container) {
                value_content = normalize_container_value(value_content);
            }
-            value_to_add += value_content;
+
+            // Try to parse as JSON value (number, bool, null, object, array)
+            try {
+                ordered_json parsed = ordered_json::parse(value_content);
+                if (parsed.is_string()) {
+                    // Don't add closing quote yet (added by arg_close) for monotonic streaming
+                    std::string escaped = parsed.dump();
+                    if (!escaped.empty() && escaped.back() == '"') {
+                        escaped.pop_back();
+                    }
+                    value_to_add          = escaped;
+                    closing_quote_pending = true;
+                } else {
+                    // Non-string values: use raw content to preserve whitespace for monotonicity
+                    value_to_add = value_content;
+                }
+            } catch (...) {
+                if (node.is_partial && is_potential_container) {
+                    // Partial container: pass through the already-normalized content
+                    value_to_add = value_content;
+                } else {
+                    // Not valid JSON - treat as string value
+                    if (!closing_quote_pending) {
+                        value_to_add          = "\"";
+                        closing_quote_pending = true;
+                    }
+                    value_to_add += escape_json_string_inner(value_content);
+                }
+            }
        }

        args_target() += value_to_add;
@@ -785,33 +813,7 @@ common_peg_parser common_chat_peg_builder::prefix(const std::string & s, const s
    if (delimiter.empty()) {
        return literal(s);
    }
-    return literal(s.substr(0, s.find(delimiter)));
-}
-
-common_peg_parser common_chat_peg_builder::optspace(const std::string & tag) {
-    auto parser = eps();
-    size_t end_of_prefix_space = tag.size();
-    size_t start_of_suffix_space = tag.size();
-    for (size_t i = 0; i < tag.size(); i++) {
-        if (!std::isspace(tag[i])) {
-            end_of_prefix_space = i;
-            break;
-        }
-    }
-    for (size_t i = tag.size(); i > 0; i--) {
-        if (!std::isspace(tag[i - 1])) {
-            start_of_suffix_space = i;
-            break;
-        }
-    }
-    for (size_t i = 0; i < end_of_prefix_space; i++) {
-        parser += optional(literal(std::string(1, tag[i])));
-    }
-    parser += literal(tag.substr(end_of_prefix_space, start_of_suffix_space - end_of_prefix_space));
-    for (size_t i = start_of_suffix_space; i < tag.size(); i++) {
-        parser += optional(literal(std::string(1, tag[i])));
-    }
-    return parser;
+    return literal(s.substr(0, s.rfind(delimiter)));
 }

 common_peg_parser common_chat_peg_builder::standard_json_tools(
--- a/common/chat-peg-parser.h
+++ b/common/chat-peg-parser.h
@@ -90,15 +90,12 @@ class common_chat_peg_builder : public common_peg_parser_builder {

    // Use for schema-declared string types - won't be treated as potential JSON container
    common_peg_parser tool_arg_string_value(const common_peg_parser & p) { return tag(TOOL_ARG_STRING_VALUE, p); }
-    common_peg_parser tool_arg_json_value(const common_peg_parser & p) { return tag(TOOL_ARG_VALUE, p); }
+    common_peg_parser tool_arg_json_value(const common_peg_parser & p) { return atomic(tag(TOOL_ARG_VALUE, p)); }


    // Return a parser that parses the prefix of a string, up to a given delimiter.
    common_peg_parser prefix(const std::string & s, const std::string & delimiter = {});

-    // Return a parser that parses all elements of tag, but leading and trailing spaces are optional
-    common_peg_parser optspace(const std::string & tag);
-
    // Legacy-compatible helper for building standard JSON tool calls
    // Used by tests and manual parsers
    // name_key/args_key: JSON key names for function name and arguments
--- a/common/chat.cpp
+++ b/common/chat.cpp
@@ -70,26 +70,6 @@ static bool has_content_or_tool_calls(const common_chat_msg & msg) {
    return !msg.content.empty() || !msg.tool_calls.empty();
 }

-std::string common_chat_msg::render_content(const std::string & delimiter) const {
-    if (!content.empty() && !content_parts.empty()) {
-        throw std::runtime_error("Cannot specify both content and content_parts");
-    }
-    if (!content.empty()) {
-        return content;
-    }
-
-    std::string text;
-    for (const auto & part : content_parts) {
-        if (part.type == "text") {
-            if (!text.empty()) {
-                text += delimiter;
-            }
-            text += part.text;
-        }
-    }
-    return text;
-}
-
 json common_chat_msg::to_json_oaicompat(bool concat_typed_text) const {
    if (!content.empty() && !content_parts.empty()) {
        throw std::runtime_error("Cannot specify both content and content_parts");
@@ -100,7 +80,7 @@ json common_chat_msg::to_json_oaicompat(bool concat_typed_text) const {
    if (!content.empty()) {
        jmsg["content"] = content;
    } else if (!content_parts.empty()) {
-        if (concat_typed_text || contains_media()) {
+        if (concat_typed_text) {
            std::string text;
            bool last_was_media_marker = false;
            // join parts with newline, do not add newline before or after media markers
@@ -417,25 +397,6 @@ json common_chat_msgs_to_json_oaicompat(const std::vector<common_chat_msg> & msg
    return render_message_to_json(msgs, c);
 }

-json common_chat_tools_to_json_oaicompat(const std::vector<common_chat_tool> & tools) {
-    if (tools.empty()) {
-        return json();
-    }
-
-    auto result = json::array();
-    for (const auto & tool : tools) {
-        result.push_back({
-            { "type",     "function" },
-            { "function", {
-                { "name", tool.name },
-                { "description", tool.description },
-                { "parameters", json::parse(tool.parameters) },
-            }},
-        });
-    }
-    return result;
-}
-
 std::vector<common_chat_tool> common_chat_tools_parse_oaicompat(const json & tools) {
    std::vector<common_chat_tool> result;

@@ -471,20 +432,54 @@ std::vector<common_chat_tool> common_chat_tools_parse_oaicompat(const json & too
    return result;
 }

-common_chat_continuation common_chat_continuation_parse(const nlohmann::ordered_json & value) {
-    if (value.is_boolean() && value.get<bool>()) {
-        return COMMON_CHAT_CONTINUATION_AUTO;
+json common_chat_tools_to_json_oaicompat(const std::vector<common_chat_tool> & tools) {
+    if (tools.empty()) {
+        return json();
    }
-    if (value.is_string()) {
-        auto value_str = value.get<std::string>();
-        if (value_str == "reasoning_content") {
-            return COMMON_CHAT_CONTINUATION_REASONING;
-        }
-        if (value_str == "content") {
-            return COMMON_CHAT_CONTINUATION_CONTENT;
-        }
+
+    auto result = json::array();
+    for (const auto & tool : tools) {
+        result.push_back({
+            { "type",     "function" },
+            { "function",
+             {
+                  { "name", tool.name },
+                  { "description", tool.description },
+                  { "parameters", json::parse(tool.parameters) },
+              }                      },
+        });
    }
-    return COMMON_CHAT_CONTINUATION_NONE;
+    return result;
+}
+
+json common_chat_msg_diff_to_json_oaicompat(const common_chat_msg_diff & diff) {
+    json delta = json::object();
+    if (!diff.reasoning_content_delta.empty()) {
+        delta["reasoning_content"] = diff.reasoning_content_delta;
+    }
+    if (!diff.content_delta.empty()) {
+        delta["content"] = diff.content_delta;
+    }
+    if (diff.tool_call_index != std::string::npos) {
+        json tool_call;
+        tool_call["index"] = diff.tool_call_index;
+        if (!diff.tool_call_delta.id.empty()) {
+            tool_call["id"]   = diff.tool_call_delta.id;
+            tool_call["type"] = "function";
+        }
+        if (!diff.tool_call_delta.name.empty() || !diff.tool_call_delta.arguments.empty()) {
+            json function = json::object();
+            if (!diff.tool_call_delta.name.empty()) {
+                function["name"] = diff.tool_call_delta.name;
+            }
+            if (!diff.tool_call_delta.arguments.empty()) {
+                function["arguments"] = diff.tool_call_delta.arguments;
+            }
+            tool_call["function"] = function;
+        }
+        delta["tool_calls"] = json::array({ tool_call });
+    }
+    return delta;
 }

 bool common_chat_verify_template(const std::string & tmpl, bool use_jinja) {
@@ -580,26 +575,6 @@ bool common_chat_templates_was_explicit(const struct common_chat_templates * tmp
    return tmpls->has_explicit_template;
 }

-// LFM2 format detection: template uses <|tool_list_start|>[...]<|tool_list_end|> around the tool list
-// and <|tool_call_start|>[...]<|tool_call_end|> around each tool call
-static bool is_lfm2_template(const std::string & src) {
-    return src.find("<|tool_list_start|>") != std::string::npos &&
-           src.find("<|tool_list_end|>")   != std::string::npos;
-}
-
-common_chat_prompt_preset common_chat_get_asr_prompt(const common_chat_templates * chat_templates) {
-    common_chat_prompt_preset asr_preset;
-    asr_preset.system = "";
-    asr_preset.user   = "Transcribe audio to text";
-
-    if (chat_templates && chat_templates->template_default && is_lfm2_template(chat_templates->template_default->source())) {
-        asr_preset.system = "Perform ASR.";
-        asr_preset.user   = "";
-    }
-
-    return asr_preset;
-}
-
 std::string common_chat_templates_source(const struct common_chat_templates * tmpls, const std::string & variant) {
    if (!variant.empty()) {
        if (variant == "tool_use") {
@@ -847,36 +822,6 @@ std::string common_chat_template_direct_apply(
    return common_chat_template_direct_apply_impl(tmpl, inputs, std::nullopt, std::nullopt, std::nullopt);
 }

-static std::string common_chat_template_generation_prompt_impl(
-    const common_chat_template & tmpl,
-    const autoparser::generation_params & inputs,
-    const std::optional<json> & messages_override = std::nullopt,
-    const std::optional<json> & tools_override = std::nullopt,
-    const std::optional<json> & additional_context = std::nullopt) {
-
-    auto adjusted_messages = messages_override ? *messages_override : inputs.messages;
-
-    autoparser::generation_params params = inputs;
-    params.add_generation_prompt = false;
-    params.continue_final_message = COMMON_CHAT_CONTINUATION_NONE;
-    std::string no_gen_prompt    = common_chat_template_direct_apply_impl(tmpl, params, adjusted_messages, tools_override, additional_context);
-    params.add_generation_prompt = true;
-    std::string gen_prompt       = common_chat_template_direct_apply_impl(tmpl, params, adjusted_messages, tools_override, additional_context);
-
-    size_t prefix_len = 0;
-    size_t min_size = std::min(no_gen_prompt.size(), gen_prompt.size());
-    while (prefix_len < min_size && no_gen_prompt[prefix_len] == gen_prompt[prefix_len]) {
-        prefix_len++;
-    }
-    return gen_prompt.substr(prefix_len);
-}
-
-std::string common_chat_template_generation_prompt(
-    const common_chat_template & tmpl,
-    const autoparser::generation_params & inputs) {
-    return common_chat_template_generation_prompt_impl(tmpl, inputs, std::nullopt, std::nullopt, std::nullopt);
-}
-
 static common_chat_params common_chat_params_init_ministral_3(const common_chat_template &    tmpl,
                                                              const autoparser::generation_params & inputs) {
    common_chat_params data;
@@ -929,7 +874,6 @@ static common_chat_params common_chat_params_init_ministral_3(const common_chat_
    data.thinking_start_tag = "[THINK]";
    data.thinking_end_tag   = "[/THINK]";
    data.prompt            = common_chat_template_direct_apply_impl(tmpl, inputs, /* messages_override = */ adjusted_messages);
-    data.generation_prompt = common_chat_template_generation_prompt_impl(tmpl, inputs, /* messages_override = */ adjusted_messages);
    data.format            = COMMON_CHAT_FORMAT_PEG_NATIVE;
    data.preserved_tokens  = {
        "[THINK]",
@@ -938,19 +882,8 @@ static common_chat_params common_chat_params_init_ministral_3(const common_chat_
        "[ARGS]",
    };

-    if (inputs.has_continuation()) {
-        const auto & msg = inputs.continue_msg;
-
-        data.generation_prompt = "[THINK]" + msg.reasoning_content;
-        if (inputs.continue_final_message == COMMON_CHAT_CONTINUATION_CONTENT) {
-            data.generation_prompt += "[/THINK]" + msg.render_content();
-        }
-
-        data.prompt += data.generation_prompt;
-    }
-
    auto parser = build_chat_peg_parser([&](common_chat_peg_builder & p) {
-        auto generation_prompt = p.eps();
+        auto generation_prompt = p.prefix(inputs.generation_prompt, "[THINK]");
        auto reasoning =
            extract_reasoning ? p.optional("[THINK]" + p.reasoning(p.until("[/THINK]")) + "[/THINK]") : p.eps();

@@ -1041,7 +974,6 @@ static common_chat_params common_chat_params_init_gpt_oss(const common_chat_temp
    }

    data.prompt            = prompt;
-    data.generation_prompt = common_chat_template_generation_prompt_impl(tmpl, inputs, /* messages_override= */ adjusted_messages);
    data.format            = COMMON_CHAT_FORMAT_PEG_NATIVE;
    data.supports_thinking = true;

@@ -1051,18 +983,6 @@ static common_chat_params common_chat_params_init_gpt_oss(const common_chat_temp
        "<|channel|>", "<|constrain|>", "<|message|>", "<|start|>", "<|end|>",
    };

-    // Adjust prompt for continuation
-    if (inputs.has_continuation()) {
-        const auto & msg = inputs.continue_msg;
-
-        data.generation_prompt = "<|start|>assistant<|channel|>analysis<|message|>" + msg.reasoning_content;
-        if (inputs.continue_final_message == COMMON_CHAT_CONTINUATION_CONTENT) {
-            data.generation_prompt += "<|end|><|start|>assistant<|channel|>final<|message|>" + msg.render_content();
-        }
-
-        data.prompt += data.generation_prompt;
-    }
-
    auto has_tools           = inputs.tools.is_array() && !inputs.tools.empty();
    auto has_response_format = !inputs.json_schema.is_null() && inputs.json_schema.is_object();
    auto include_grammar     = has_response_format || (has_tools && inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_NONE);
@@ -1171,14 +1091,12 @@ static common_chat_params common_chat_params_init_gemma4(const common_chat_templ
    common_chat_params data;

    data.prompt            = common_chat_template_direct_apply_impl(tmpl, inputs);
-    data.generation_prompt = common_chat_template_generation_prompt_impl(tmpl, inputs);

    if (inputs.add_generation_prompt && string_ends_with(data.prompt, "<turn|>\n")) {
        // This may happen if the model generates content + tool_call, the
        // template does not add the model's next turn and confuses the model
        // from emitting its proper reasoning token sequence.
-        data.generation_prompt = "<|turn>model\n";
-        data.prompt += data.generation_prompt;
+        data.prompt += "<|turn>model\n";
    }

    data.format            = COMMON_CHAT_FORMAT_PEG_GEMMA4;
@@ -1194,25 +1112,13 @@ static common_chat_params common_chat_params_init_gemma4(const common_chat_templ
        "<|turn>",
    };

-    if (inputs.has_continuation()) {
-        const auto & msg = inputs.continue_msg;
-
-        data.generation_prompt = string_ends_with(data.prompt, "<turn|>\n") ? "<|turn>model\n" : "";
-        data.generation_prompt += "<|channel>thought\n" + msg.reasoning_content;
-        if (inputs.continue_final_message == COMMON_CHAT_CONTINUATION_CONTENT) {
-            data.generation_prompt += "<channel|>" + msg.render_content();
-        }
-
-        data.prompt += data.generation_prompt;
-    }
-
    auto has_tools           = inputs.tools.is_array() && !inputs.tools.empty();
    auto has_response_format = !inputs.json_schema.is_null() && inputs.json_schema.is_object();
    auto include_grammar     = has_response_format || (has_tools && inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_NONE);
    auto extract_reasoning   = inputs.reasoning_format != COMMON_REASONING_FORMAT_NONE;

    auto parser = build_chat_peg_parser([&](common_chat_peg_builder & p) {
-        auto start = p.rule("start", p.optional(p.literal("<|turn>model\n")));
+        auto start = p.rule("start", p.prefix(inputs.generation_prompt, "<|channel>"));

        if (extract_reasoning) {
            p.rule("thought", p.literal("<|channel>thought") + p.space() + p.reasoning(p.until("<channel|>")) + p.literal("<channel|>"));
@@ -1329,22 +1235,15 @@ static common_chat_params common_chat_params_init_functionary_v3_2(const common_
                                                                   const autoparser::generation_params & inputs) {
    common_chat_params data;

-    data.prompt            = common_chat_template_direct_apply_impl(tmpl, inputs);
-    data.generation_prompt = common_chat_template_generation_prompt_impl(tmpl, inputs);
-    data.format            = COMMON_CHAT_FORMAT_PEG_NATIVE;
-    data.preserved_tokens  = {
+    data.prompt           = common_chat_template_direct_apply_impl(tmpl, inputs);
+    data.format           = COMMON_CHAT_FORMAT_PEG_NATIVE;
+    data.preserved_tokens = {
        ">>>all",
    };

    auto has_tools         = inputs.tools.is_array() && !inputs.tools.empty();
    auto include_grammar   = has_tools && inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_NONE;

-    if (inputs.has_continuation()) {
-        const auto & msg = inputs.continue_msg;
-        data.generation_prompt = "<|start_header_id|>assistant<|end_header_id|>\n\n>>>all\n" + msg.render_content();
-        data.prompt += data.generation_prompt;
-    }
-
    auto parser = build_chat_peg_parser([&](common_chat_peg_builder & p) {
        // Functionary v3.2 format:
        // - Normal content: >>>all\n{content}
@@ -1356,7 +1255,7 @@ static common_chat_params common_chat_params_init_functionary_v3_2(const common_
        // When no tools, content goes until end
        auto content_until_tool = p.literal("all\n") + p.content(p.until(">>>"));
        auto content_until_end  = p.literal("all\n") + p.content(p.rest());
-        auto generation_prompt  = p.literal("<|start_header_id|>assistant<|end_header_id|>\n\n>>>");
+        auto generation_prompt  = p.literal(inputs.generation_prompt);

        // If no tools or tool_choice is NONE, just parse content
        if (!has_tools || inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_NONE) {
@@ -1430,10 +1329,9 @@ static common_chat_params common_chat_params_init_kimi_k2(const common_chat_temp
                                                          const autoparser::generation_params & inputs) {
    common_chat_params data;

-    data.prompt            = common_chat_template_direct_apply_impl(tmpl, inputs);
-    data.generation_prompt = common_chat_template_generation_prompt_impl(tmpl, inputs);
-    data.format            = COMMON_CHAT_FORMAT_PEG_NATIVE;
-    data.supports_thinking = true;
+    data.prompt             = common_chat_template_direct_apply_impl(tmpl, inputs);
+    data.format             = COMMON_CHAT_FORMAT_PEG_NATIVE;
+    data.supports_thinking  = true;
    data.preserved_tokens  = {
        "<|tool_calls_section_begin|>",
        "<|tool_calls_section_end|>",
@@ -1456,22 +1354,10 @@ static common_chat_params common_chat_params_init_kimi_k2(const common_chat_temp

    const std::string THINK_START = "<think>";
    const std::string THINK_END   = "</think>";
-    const std::string GEN_PROMPT  = "<|im_assistant|>assistant<|im_middle|>";

    data.thinking_start_tag = THINK_START;
    data.thinking_end_tag   = THINK_END;

-    if (inputs.has_continuation()) {
-        const auto & msg = inputs.continue_msg;
-
-        data.generation_prompt = GEN_PROMPT + THINK_START + msg.reasoning_content;
-        if (inputs.continue_final_message == COMMON_CHAT_CONTINUATION_CONTENT) {
-            data.generation_prompt += THINK_END + msg.render_content();
-        }
-
-        data.prompt += data.generation_prompt;
-    }
-
    auto parser = build_chat_peg_parser([&](common_chat_peg_builder & p) {
        // Kimi K2 Thinking format:
        // - Reasoning: <think>{reasoning}</think>
@@ -1491,7 +1377,7 @@ static common_chat_params common_chat_params_init_kimi_k2(const common_chat_temp
        auto reasoning = extract_reasoning ? p.optional(THINK_START + p.reasoning(
            p.until_one_of({ THINK_END, "<|tool_calls_section_begin|>", "<|tool_call_begin|>" })) +
            p.optional(p.literal(THINK_END))) : p.eps();
-        auto generation_prompt = p.literal(GEN_PROMPT);
+        auto generation_prompt = p.prefix(inputs.generation_prompt, THINK_START);


        // Content only parser (no tools)
@@ -1567,7 +1453,6 @@ static common_chat_params common_chat_params_init_lfm2(const common_chat_templat
    common_chat_params data;

    data.prompt            = common_chat_template_direct_apply_impl(tmpl, inputs);
-    data.generation_prompt = common_chat_template_generation_prompt_impl(tmpl, inputs);
    data.format            = COMMON_CHAT_FORMAT_PEG_NATIVE;
    data.supports_thinking = true;
    data.preserved_tokens  = {
@@ -1587,24 +1472,12 @@ static common_chat_params common_chat_params_init_lfm2(const common_chat_templat
    const std::string TOOL_CALL_END   = "<|tool_call_end|>";
    const std::string THINK_START     = "<think>";
    const std::string THINK_END       = "</think>";
-    const std::string GEN_PROMPT      = "<|im_start|>assistant\n";

    data.thinking_start_tag = THINK_START;
    data.thinking_end_tag   = THINK_END;

-    if (inputs.has_continuation()) {
-        const auto & msg = inputs.continue_msg;
-
-        data.generation_prompt = GEN_PROMPT + THINK_START + msg.reasoning_content;
-        if (inputs.continue_final_message == COMMON_CHAT_CONTINUATION_CONTENT) {
-            data.generation_prompt += THINK_END + msg.render_content();
-        }
-
-        data.prompt += data.generation_prompt;
-    }
-
    auto parser = build_chat_peg_parser([&](common_chat_peg_builder & p) {
-        auto generation_prompt = p.literal(GEN_PROMPT);
+        auto generation_prompt = p.prefix(inputs.generation_prompt, THINK_START);
        auto end = p.end();

        auto reasoning = p.eps();
@@ -1659,7 +1532,6 @@ static common_chat_params common_chat_params_init_lfm2_5(const common_chat_templ
    common_chat_params data;

    data.prompt            = common_chat_template_direct_apply_impl(tmpl, inputs);
-    data.generation_prompt = common_chat_template_generation_prompt_impl(tmpl, inputs);
    data.format            = COMMON_CHAT_FORMAT_PEG_NATIVE;
    data.supports_thinking = true;
    data.preserved_tokens  = {
@@ -1675,24 +1547,12 @@ static common_chat_params common_chat_params_init_lfm2_5(const common_chat_templ

    const std::string THINK_START     = "<think>";
    const std::string THINK_END       = "</think>";
-    const std::string GEN_PROMPT      = "<|im_start|>assistant\n";

    data.thinking_start_tag = THINK_START;
    data.thinking_end_tag   = THINK_END;

-    if (inputs.has_continuation()) {
-        const auto & msg = inputs.continue_msg;
-
-        data.generation_prompt = GEN_PROMPT + THINK_START + msg.reasoning_content;
-        if (inputs.continue_final_message == COMMON_CHAT_CONTINUATION_CONTENT) {
-            data.generation_prompt += THINK_END + msg.render_content();
-        }
-
-        data.prompt += data.generation_prompt;
-    }
-
    auto parser = build_chat_peg_parser([&](common_chat_peg_builder & p) {
-        auto generation_prompt = p.literal(GEN_PROMPT);
+        auto generation_prompt = p.prefix(inputs.generation_prompt, THINK_START);
        auto end = p.end();

        auto reasoning = p.eps();
@@ -1743,7 +1603,6 @@ static common_chat_params common_chat_params_init_gigachat_v3(
    common_chat_params data;

    data.prompt            = common_chat_template_direct_apply_impl(tmpl, inputs);
-    data.generation_prompt = common_chat_template_generation_prompt_impl(tmpl, inputs);
    data.format            = COMMON_CHAT_FORMAT_PEG_NATIVE;
    data.supports_thinking = false;
    data.preserved_tokens  = {
@@ -1751,12 +1610,6 @@ static common_chat_params common_chat_params_init_gigachat_v3(
        "<|role_sep|>\n",
    };

-    if (inputs.has_continuation()) {
-        const auto & msg = inputs.continue_msg;
-        data.generation_prompt = "assistant<|role_sep|>\n" + msg.render_content();
-        data.prompt += data.generation_prompt;
-    }
-
    auto has_tools         = inputs.tools.is_array() && !inputs.tools.empty();
    auto include_grammar   = has_tools && inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_NONE;
    const auto *tool_call_start_prefix = "<|message_sep|>\n\nfunction call<|role_sep|>\n";
@@ -1792,7 +1645,7 @@ static common_chat_params common_chat_params_init_gigachat_v3(
            ret = p.content(p.rest());
        }

-        return p.literal("assistant<|role_sep|>\n") + ret;
+        return p.literal(inputs.generation_prompt) + ret;
    });

    data.parser = parser.save();
@@ -1820,13 +1673,12 @@ static common_chat_params common_chat_params_init_deepseek_v3_2(const common_cha
                                                                 const autoparser::generation_params & inputs) {
    common_chat_params data;

-    data.prompt             = common_chat_template_direct_apply_impl(tmpl, inputs);
-    data.generation_prompt  = common_chat_template_generation_prompt_impl(tmpl, inputs);
-    data.format             = COMMON_CHAT_FORMAT_PEG_NATIVE;
-    data.supports_thinking  = true;
+    data.prompt            = common_chat_template_direct_apply_impl(tmpl, inputs);
+    data.format            = COMMON_CHAT_FORMAT_PEG_NATIVE;
+    data.supports_thinking = true;
    data.thinking_start_tag = "<think>";
    data.thinking_end_tag   = "</think>";
-    data.preserved_tokens   = {
+    data.preserved_tokens  = {
        "｜DSML｜",
        "<think>",
        "</think>",
@@ -1846,21 +1698,9 @@ static common_chat_params common_chat_params_init_deepseek_v3_2(const common_cha
    const std::string INVOKE_END   = "</" + DSML + "invoke>";
    const std::string PARAM_START  = "<" + DSML + "parameter";
    const std::string PARAM_END    = "</" + DSML + "parameter>";
-    const std::string GEN_PROMPT   = "<｜Assistant｜>";
-
-    if (inputs.has_continuation()) {
-        const auto & msg = inputs.continue_msg;
-
-        data.generation_prompt = GEN_PROMPT + THINK_START + msg.reasoning_content;
-        if (inputs.continue_final_message == COMMON_CHAT_CONTINUATION_CONTENT) {
-            data.generation_prompt += THINK_END + msg.render_content();
-        }
-
-        data.prompt += data.generation_prompt;
-    }

    auto parser = build_chat_peg_parser([&](common_chat_peg_builder & p) {
-        auto generation_prompt = p.literal(GEN_PROMPT);
+        auto generation_prompt = p.prefix(inputs.generation_prompt, THINK_START);
        auto end = p.end();

        auto reasoning = p.eps();
@@ -2244,7 +2084,10 @@ std::optional<common_chat_params> common_chat_try_specialized_template(
        return common_chat_params_init_kimi_k2(tmpl, params);
    }

-    if (is_lfm2_template(src)) {
+    // LFM2 format detection: template uses <|tool_list_start|>[...]<|tool_list_end|> around the tool list
+    // and <|tool_call_start|>[...]<|tool_call_end|> around each tool call
+    if (src.find("<|tool_list_start|>") != std::string::npos &&
+        src.find("<|tool_list_end|>") != std::string::npos) {
        LOG_DBG("Using specialized template: LFM2\n");
        return common_chat_params_init_lfm2(tmpl, params);
    }
@@ -2293,38 +2136,16 @@ static common_chat_params common_chat_templates_apply_jinja(const struct common_
    params.tools = common_chat_tools_to_json_oaicompat(inputs.tools);
    const auto & tmpl =
        params.tools.is_array() && tmpls->template_tool_use ? *tmpls->template_tool_use : *tmpls->template_default;
-    const auto & src             = tmpl.source();
-    const auto & caps            = tmpl.original_caps();
-    params.messages              = render_message_to_json(inputs.messages, tmpl.original_caps());
-    params.tool_choice           = inputs.tool_choice;
-    params.reasoning_format      = inputs.reasoning_format;
-    params.enable_thinking       = inputs.enable_thinking;
-    params.grammar               = inputs.grammar;
-    params.now                   = inputs.now;
-    params.add_generation_prompt = inputs.add_generation_prompt;
-    params.add_bos               = tmpls->add_bos;
-    params.add_eos               = tmpls->add_eos;
-
-    params.continue_final_message = inputs.continue_final_message;
-    if (params.continue_final_message != COMMON_CHAT_CONTINUATION_NONE) {
-        params.add_generation_prompt = false;
-
-        if (!inputs.messages.empty()) {
-            // Render messages[:-1] and store continuation message separately
-            params.continue_msg = inputs.messages.back();
-            params.messages.erase(params.messages.size() - 1);
-        }
-
-        if (params.continue_final_message == COMMON_CHAT_CONTINUATION_AUTO && !inputs.messages.empty()) {
-            // Resolve based on message content
-            params.continue_final_message = COMMON_CHAT_CONTINUATION_CONTENT;
-            if (!params.continue_msg.reasoning_content.empty() &&
-                params.continue_msg.content.empty() &&
-                params.continue_msg.content_parts.empty()) {
-                params.continue_final_message = COMMON_CHAT_CONTINUATION_REASONING;
-            }
-        }
-    }
+    const auto & src        = tmpl.source();
+    const auto & caps       = tmpl.original_caps();
+    params.messages         = render_message_to_json(inputs.messages, tmpl.original_caps());
+    params.tool_choice      = inputs.tool_choice;
+    params.reasoning_format = inputs.reasoning_format;
+    params.enable_thinking  = inputs.enable_thinking;
+    params.grammar          = inputs.grammar;
+    params.now              = inputs.now;
+    params.add_bos          = tmpls->add_bos;
+    params.add_eos          = tmpls->add_eos;

    if (src.find("<|channel|>") == std::string::npos) {
        // map developer to system for all models except for GPT-OSS
@@ -2346,6 +2167,15 @@ static common_chat_params common_chat_templates_apply_jinja(const struct common_
        workaround::func_args_not_string(params.messages);
    }

+    params.add_generation_prompt = false;
+    std::string no_gen_prompt    = common_chat_template_direct_apply_impl(tmpl, params);
+    params.add_generation_prompt = true;
+    std::string gen_prompt       = common_chat_template_direct_apply_impl(tmpl, params);
+    auto        diff             = calculate_diff_split(no_gen_prompt, gen_prompt);
+    params.generation_prompt     = diff.right + diff.suffix;
+
+    params.add_generation_prompt = inputs.add_generation_prompt;
+
    params.extra_context = common_chat_extra_context();
    for (auto el : inputs.chat_template_kwargs) {
        params.extra_context[el.first] = json::parse(el.second);
@@ -2375,16 +2205,17 @@ static common_chat_params common_chat_templates_apply_jinja(const struct common_
        auto params_copy               = params;
        params_copy.reasoning_format   = COMMON_REASONING_FORMAT_NONE;
        data.prompt                    = common_chat_template_direct_apply_impl(tmpl, params_copy);
-        data.generation_prompt         = common_chat_template_generation_prompt_impl(tmpl, params);
        data.format                    = COMMON_CHAT_FORMAT_PEG_NATIVE;
-        auto parser                    = build_chat_peg_parser([&data](common_chat_peg_builder &p) {
-            return p.literal(data.generation_prompt) << p.content(p.rest());
+        data.generation_prompt         = params.generation_prompt;
+        auto parser                    = build_chat_peg_parser([&params](common_chat_peg_builder &p) {
+            return p.prefix(params.generation_prompt) << p.content(p.rest());
        });
        data.parser                    = parser.save();
        return data;
    }

    if (auto result = common_chat_try_specialized_template(tmpl, src, params)) {
+        result->generation_prompt = params.generation_prompt;
        return *result;
    }

@@ -2395,9 +2226,10 @@ static common_chat_params common_chat_templates_apply_jinja(const struct common_
        auto auto_params = autoparser::peg_generator::generate_parser(tmpl, params, autoparser);
        auto_params.supports_thinking = autoparser.reasoning.mode != autoparser::reasoning_mode::NONE;
        if (auto_params.supports_thinking) {
-            auto_params.thinking_start_tag = trim_whitespace(autoparser.reasoning.start);
-            auto_params.thinking_end_tag   = trim_whitespace(autoparser.reasoning.end);
+            auto_params.thinking_start_tag = autoparser.reasoning.start;
+            auto_params.thinking_end_tag   = autoparser.reasoning.end;
        }
+        auto_params.generation_prompt = params.generation_prompt;
        common_peg_arena arena;
        arena.load(auto_params.parser);
        LOG_DBG("%s: generated parser:\n%s\n\nparser generation prompt: %s\n", __func__, arena.dump(arena.root()).c_str(), auto_params.generation_prompt.c_str());
@@ -2564,3 +2396,4 @@ std::map<std::string, bool> common_chat_templates_get_caps(const common_chat_tem
    GGML_ASSERT(chat_templates->template_default != nullptr);
    return chat_templates->template_default->caps.to_map();
 }
+
--- a/common/chat.h
+++ b/common/chat.h
@@ -89,22 +89,11 @@ struct common_chat_msg {

    nlohmann::ordered_json to_json_oaicompat(bool concat_typed_text = false) const;

-    std::string render_content(const std::string & delimiter = "\n\n") const;
-
    bool empty() const {
        return content.empty() && content_parts.empty() && tool_calls.empty() && reasoning_content.empty() &&
               tool_name.empty() && tool_call_id.empty();
    }

-    bool contains_media() const {
-        for (const auto & part : content_parts) {
-            if (part.type == "media_marker") {
-                return true;
-            }
-        }
-        return false;
-    }
-
    void set_tool_call_ids(std::vector<std::string> &           ids_cache,
                           const std::function<std::string()> & gen_tool_call_id) {
        for (auto i = 0u; i < tool_calls.size(); i++) {
@@ -166,22 +155,12 @@ enum common_chat_format {
    COMMON_CHAT_FORMAT_COUNT,  // Not a format, just the # formats
 };

-
-// Continuation method provided via `continue_final_message`
-enum common_chat_continuation {
-    COMMON_CHAT_CONTINUATION_NONE,
-    COMMON_CHAT_CONTINUATION_AUTO,
-    COMMON_CHAT_CONTINUATION_REASONING,
-    COMMON_CHAT_CONTINUATION_CONTENT,
-};
-
 struct common_chat_templates_inputs {
    std::vector<common_chat_msg>          messages;
    std::string                           grammar;
    std::string                           json_schema;
-    bool                                  add_generation_prompt  = true;
-    common_chat_continuation              continue_final_message = COMMON_CHAT_CONTINUATION_NONE;
-    bool                                  use_jinja              = true;
+    bool                                  add_generation_prompt = true;
+    bool                                  use_jinja             = true;
    // Parameters below only supported when use_jinja is true
    std::vector<common_chat_tool>         tools;
    common_chat_tool_choice               tool_choice         = COMMON_CHAT_TOOL_CHOICE_AUTO;
@@ -219,7 +198,6 @@ struct common_chat_parser_params {
    bool                    reasoning_in_content = false;
    std::string             generation_prompt;
    bool                    parse_tool_calls     = true;
-    bool                    echo                 = false;  // Include assistant prefilled msg in output
    bool                    debug                = false;  // Enable debug output for PEG parser
    common_peg_arena        parser               = {};
    common_chat_parser_params() = default;
@@ -278,15 +256,14 @@ bool common_chat_templates_support_enable_thinking(const common_chat_templates *
 // Parses a JSON array of messages in OpenAI's chat completion API format.
 std::vector<common_chat_msg> common_chat_msgs_parse_oaicompat(const nlohmann::ordered_json & messages);

-std::vector<common_chat_tool> common_chat_tools_parse_oaicompat(const nlohmann::ordered_json & tools);
-
-common_chat_continuation common_chat_continuation_parse(const nlohmann::ordered_json & value);
-
 // DEPRECATED: only used in tests
 nlohmann::ordered_json common_chat_msgs_to_json_oaicompat(const std::vector<common_chat_msg> & msgs, bool concat_typed_text = false);

+std::vector<common_chat_tool> common_chat_tools_parse_oaicompat(const nlohmann::ordered_json & tools);
 nlohmann::ordered_json common_chat_tools_to_json_oaicompat(const std::vector<common_chat_tool> & tools);

+nlohmann::ordered_json common_chat_msg_diff_to_json_oaicompat(const common_chat_msg_diff & diff);
+
 // get template caps, useful for reporting to server /props endpoint
 std::map<std::string, bool> common_chat_templates_get_caps(const common_chat_templates * chat_templates);

@@ -294,19 +271,7 @@ std::string common_chat_template_direct_apply(
    const common_chat_template & tmpl,
    const autoparser::generation_params & inputs);

-std::string common_chat_template_generation_prompt(
-    const common_chat_template &          tmpl,
-    const autoparser::generation_params & inputs);
-
 std::optional<common_chat_params> common_chat_try_specialized_template(
        const common_chat_template &          tmpl,
        const std::string &                   src,
        autoparser::generation_params & params);
-
-// specialized per-task preset
-struct common_chat_prompt_preset {
-    std::string system;
-    std::string user;
-};
-
-common_chat_prompt_preset common_chat_get_asr_prompt(const common_chat_templates * chat_templates);
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -3,11 +3,9 @@

 #include "build-info.h"
 #include "common.h"
-#include "fit.h"
 #include "log.h"
 #include "llama.h"
 #include "sampling.h"
-#include "speculative.h"
 #include "unicode.h"

 #include <algorithm>
@@ -71,7 +69,7 @@ common_time_meas::~common_time_meas() {
 // CPU utils
 //

-int32_t common_cpu_get_num_physical_cores() {
+int32_t cpu_get_num_physical_cores() {
 #ifdef __linux__
    // enumerate the set of thread siblings, num entries is num cores
    std::unordered_set<std::string> siblings;
@@ -186,11 +184,11 @@ static int cpu_count_math_cpus(int n_cpu) {
 /**
 * Returns number of CPUs on system that are useful for math.
 */
-int32_t common_cpu_get_num_math() {
+int32_t cpu_get_num_math() {
 #if defined(__x86_64__) && defined(__linux__) && !defined(__ANDROID__)
    int n_cpu = sysconf(_SC_NPROCESSORS_ONLN);
    if (n_cpu < 1) {
-        return common_cpu_get_num_physical_cores();
+        return cpu_get_num_physical_cores();
    }
    if (is_hybrid_cpu()) {
        cpu_set_t affinity;
@@ -203,7 +201,7 @@ int32_t common_cpu_get_num_math() {
        }
    }
 #endif
-    return common_cpu_get_num_physical_cores();
+    return cpu_get_num_physical_cores();
 }

 // Helper for setting process priority
@@ -264,7 +262,7 @@ bool set_process_priority(enum ggml_sched_priority prio) {
 //


-void postprocess_cpu_params(common_cpu_params & cpuparams, const common_cpu_params * role_model) {
+void postprocess_cpu_params(cpu_params& cpuparams, const cpu_params* role_model) {
    int32_t n_set = 0;

    if (cpuparams.n_threads < 0) {
@@ -272,7 +270,7 @@ void postprocess_cpu_params(common_cpu_params & cpuparams, const common_cpu_para
        if (role_model != nullptr) {
            cpuparams = *role_model;
        } else {
-            cpuparams.n_threads = common_cpu_get_num_math();
+            cpuparams.n_threads = cpu_get_num_math();
        }
    }

@@ -367,33 +365,15 @@ void common_init() {
    SetConsoleCP(CP_UTF8);
 #endif

-    common_log_set_prefix(common_log_main(), true);
-    common_log_set_timestamps(common_log_main(), true);
-
    llama_log_set(common_log_default_callback, NULL);
-}

-void common_params_print_info(const common_params & params, bool print_devices) {
 #ifdef NDEBUG
    const char * build_type = "";
 #else
    const char * build_type = " (debug)";
 #endif
-    LOG_TRC("%s: build %d (%s) with %s for %s%s\n", __func__, llama_build_number(), llama_commit(), llama_compiler(), llama_build_target(), build_type);

-    LOG_INF("log_info: verbosity = %d (adjust with the `-lv N` CLI arg)\n", common_log_get_verbosity_thold());
-
-    // device enumeration creates a primary context on CUDA backends, skip it when the caller does not own any device
-    if (print_devices) {
-        LOG_INF("device_info:\n");
-        for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
-            auto * dev = ggml_backend_dev_get(i);
-            size_t free, total;
-            ggml_backend_dev_memory(dev, &free, &total);
-            LOG_INF("  - %-8s: %s (%zu MiB, %zu MiB free)\n", ggml_backend_dev_name(dev), ggml_backend_dev_description(dev), total / 1024 / 1024, free / 1024 / 1024);
-        }
-    }
-    LOG_INF("%s\n", common_params_get_system_info(params).c_str());
+    LOG_DBG("build: %d (%s) with %s for %s%s\n", llama_build_number(), llama_commit(), llama_compiler(), llama_build_target(), build_type);
 }

 std::string common_params_get_system_info(const common_params & params) {
@@ -1160,20 +1140,19 @@ struct common_init_result::impl {
    std::vector<llama_sampler_seq_config> samplers_seq_config;
 };

-common_init_result::common_init_result(common_params & params, bool model_only) :
+common_init_result::common_init_result(common_params & params) :
    pimpl(new impl{}) {
    auto mparams = common_model_params_to_llama(params);
    auto cparams = common_context_params_to_llama(params);

    if (params.fit_params) {
-        LOG_INF("%s: fitting params to device memory ...\n", __func__);
-        LOG_INF("%s: (for bugs during this step try to reproduce them with -fit off, or provide --verbose logs if the bug only occurs with -fit on)\n", __func__);
-        common_fit_params(params.model.path.c_str(), &mparams, &cparams,
+        LOG_INF("%s: fitting params to device memory, for bugs during this step try to reproduce them with -fit off, or provide --verbose logs if the bug only occurs with -fit on\n", __func__);
+        llama_params_fit(params.model.path.c_str(), &mparams, &cparams,
            params.tensor_split,
            params.tensor_buft_overrides.data(),
            params.fit_params_target.data(),
            params.fit_params_min_ctx,
-            params.verbosity >= LOG_LEVEL_DEBUG ? GGML_LOG_LEVEL_DEBUG : GGML_LOG_LEVEL_ERROR);
+            params.verbosity >= 4 ? GGML_LOG_LEVEL_DEBUG : GGML_LOG_LEVEL_ERROR);
    }

    llama_model * model = llama_model_load_from_file(params.model.path.c_str(), mparams);
@@ -1183,10 +1162,6 @@ common_init_result::common_init_result(common_params & params, bool model_only)

    pimpl->model.reset(model);

-    if (model_only) {
-        return;
-    }
-
    const llama_vocab * vocab = llama_model_get_vocab(model);

    // load and optionally apply lora adapters
@@ -1220,7 +1195,7 @@ common_init_result::common_init_result(common_params & params, bool model_only)
    // initialize once
    for (llama_token i = 0; i < llama_vocab_n_tokens(vocab); i++) {
        if (llama_vocab_is_eog(vocab, i)) {
-            LOG_TRC("%s: added %s logit bias = %f\n", __func__, common_token_to_piece(vocab, i).c_str(), -INFINITY);
+            LOG_INF("%s: added %s logit bias = %f\n", __func__, common_token_to_piece(vocab, i).c_str(), -INFINITY);
            params.sampling.logit_bias_eog.push_back({i, -INFINITY});
        }
    }
@@ -1233,12 +1208,12 @@ common_init_result::common_init_result(common_params & params, bool model_only)
    }

    //if (params.sampling.penalty_last_n == -1) {
-    //    LOG_TRC("%s: setting penalty_last_n to ctx_size = %d\n", __func__, llama_n_ctx(lctx));
+    //    LOG_INF("%s: setting penalty_last_n to ctx_size = %d\n", __func__, llama_n_ctx(lctx));
    //    params.sampling.penalty_last_n = llama_n_ctx(lctx);
    //}

    //if (params.sampling.dry_penalty_last_n == -1) {
-    //    LOG_TRC("%s: setting dry_penalty_last_n to ctx_size = %d\n", __func__, llama_n_ctx(lctx));
+    //    LOG_INF("%s: setting dry_penalty_last_n to ctx_size = %d\n", __func__, llama_n_ctx(lctx));
    //    params.sampling.dry_penalty_last_n = llama_n_ctx(lctx);
    //}

@@ -1290,8 +1265,8 @@ std::vector<llama_adapter_lora_ptr> & common_init_result::lora() {
    return pimpl->lora;
 }

-common_init_result_ptr common_init_from_params(common_params & params, bool model_only) {
-    common_init_result_ptr res(new common_init_result(params, model_only));
+common_init_result_ptr common_init_from_params(common_params & params) {
+    common_init_result_ptr res(new common_init_result(params));

    llama_model * model = res->model();
    if (model == NULL) {
@@ -1299,10 +1274,6 @@ common_init_result_ptr common_init_from_params(common_params & params, bool mode
        return res;
    }

-    if (model_only) {
-        return res;
-    }
-
    llama_context * lctx = res->context();
    if (lctx == NULL) {
        LOG_ERR("%s: failed to create context with model '%s'\n", __func__, params.model.path.c_str());
@@ -1366,7 +1337,7 @@ common_init_result_ptr common_init_from_params(common_params & params, bool mode
    }

    if (params.warmup) {
-        LOG_INF("%s: warming up the model with an empty run - please wait ... (--no-warmup to disable)\n", __func__);
+        LOG_WRN("%s: warming up the model with an empty run - please wait ... (--no-warmup to disable)\n", __func__);

        llama_set_warmup(lctx, true);

@@ -1448,15 +1419,9 @@ common_context_seq_rm_type common_context_can_seq_rm(llama_context * ctx) {
        goto done;
    }

-    if (llama_n_rs_seq(ctx) > 0) {
-        LOG_INF("%s: the context supports bounded partial sequence removal\n", __func__);
-        res = COMMON_CONTEXT_SEQ_RM_TYPE_RS;
-        goto done;
-    }
-
    // try to remove the last tokens
    if (!llama_memory_seq_rm(mem, 0, 1, -1)) {
-        LOG_TRC("%s: the context does not support partial sequence removal\n", __func__);
+        LOG_WRN("%s: the target context does not support partial sequence removal\n", __func__);
        res = COMMON_CONTEXT_SEQ_RM_TYPE_FULL;
        goto done;
    }
@@ -1468,23 +1433,6 @@ done:
    return res;
 }

-void common_context_seq_rm(llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1) {
-    auto * mem = llama_get_memory(ctx);
-    if (!llama_memory_seq_rm(mem, seq_id, p0, p1)) {
-        GGML_ABORT("%s", string_format("failed to remove sequence %d with p0=%d, p1=%d\n", seq_id, p0, p1).c_str());
-    }
-}
-
-void common_context_seq_cp(llama_context * ctx, llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) {
-    auto * mem = llama_get_memory(ctx);
-    llama_memory_seq_cp(mem, seq_id_src, seq_id_dst, p0, p1);
-}
-
-void common_context_seq_add(llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos delta) {
-    auto * mem = llama_get_memory(ctx);
-    llama_memory_seq_add(mem, seq_id, p0, p1, delta);
-}
-
 void common_set_adapter_lora(struct llama_context * ctx, std::vector<common_adapter_lora_info> & lora) {
    std::vector<llama_adapter_lora *> loras;
    std::vector<float> scales;
@@ -1541,7 +1489,6 @@ struct llama_context_params common_context_params_to_llama(const common_params &

    cparams.n_ctx             = params.n_ctx;
    cparams.n_seq_max         = params.n_parallel;
-    cparams.n_rs_seq          = params.speculative.need_n_rs_seq();
    cparams.n_batch           = params.n_batch;
    cparams.n_ubatch          = params.n_ubatch;
    cparams.n_threads         = params.cpuparams.n_threads;
@@ -1573,7 +1520,7 @@ struct llama_context_params common_context_params_to_llama(const common_params &
    return cparams;
 }

-struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const common_cpu_params & params) {
+struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_params & params) {
    struct ggml_threadpool_params tpp;

    ggml_threadpool_params_init(&tpp, params.n_threads); // setup the defaults
@@ -2012,110 +1959,3 @@ bool common_prompt_batch_decode(

    return true;
 }
-
-size_t common_prompt_checkpoint::size() const {
-    return data_tgt.size() + data_dft.size();
-}
-
-bool common_prompt_checkpoint::empty() const {
-    return data_tgt.empty();
-}
-
-void common_prompt_checkpoint::clear() {
-    n_tokens = 0;
-
-    pos_min = 0;
-    pos_max = 0;
-
-    data_tgt.clear();
-    data_dft.clear();
-}
-
-void common_prompt_checkpoint::update_pos(
-        int64_t n_tokens,
-        llama_pos pos_min,
-        llama_pos pos_max) {
-    this->n_tokens = n_tokens;
-    this->pos_min  = pos_min;
-    this->pos_max  = pos_max;
-}
-
-void common_prompt_checkpoint::update_tgt(
-        llama_context * ctx,
-        llama_seq_id seq_id,
-        llama_state_seq_flags flags) {
-    if (ctx == nullptr) {
-        return;
-    }
-
-    const size_t ckpt_size = llama_state_seq_get_size_ext(ctx, seq_id, flags);
-
-    data_tgt.resize(ckpt_size);
-
-    const size_t n = llama_state_seq_get_data_ext(ctx, data_tgt.data(), ckpt_size, seq_id, flags);
-    if (n != ckpt_size) {
-        GGML_ABORT("checkpoint size mismatch: expected %zu, got %zu\n", ckpt_size, n);
-    }
-}
-
-void common_prompt_checkpoint::update_dft(
-        llama_context * ctx,
-        llama_seq_id seq_id,
-        llama_state_seq_flags flags) {
-    if (ctx == nullptr) {
-        return;
-    }
-
-    const size_t ckpt_size = llama_state_seq_get_size_ext(ctx, seq_id, flags);
-
-    data_dft.resize(ckpt_size);
-
-    const size_t n = llama_state_seq_get_data_ext(ctx, data_dft.data(), ckpt_size, seq_id, flags);
-    if (n != ckpt_size) {
-        GGML_ABORT("checkpoint size mismatch: expected %zu, got %zu\n", ckpt_size, n);
-    }
-}
-
-void common_prompt_checkpoint::load_tgt(
-        llama_context * ctx,
-        llama_seq_id seq_id,
-        llama_state_seq_flags flags) const {
-    if (ctx == nullptr) {
-        return;
-    }
-
-    if (data_tgt.empty()) {
-        return;
-    }
-
-    const size_t n = llama_state_seq_set_data_ext(ctx, data_tgt.data(), data_tgt.size(), seq_id, flags);
-    if (n != data_tgt.size()) {
-        GGML_ABORT("checkpoint size mismatch: expected %zu, got %zu\n", data_tgt.size(), n);
-    }
-}
-
-void common_prompt_checkpoint::load_dft(
-        llama_context * ctx,
-        llama_seq_id seq_id,
-        llama_state_seq_flags flags) const {
-    if (ctx == nullptr) {
-        return;
-    }
-
-    if (data_dft.empty()) {
-        return;
-    }
-
-    const size_t n = llama_state_seq_set_data_ext(ctx, data_dft.data(), data_dft.size(), seq_id, flags);
-    if (n != data_dft.size()) {
-        GGML_ABORT("checkpoint size mismatch: expected %zu, got %zu\n", data_dft.size(), n);
-    }
-}
-
-void common_prompt_checkpoint::clear_tgt() {
-    data_tgt.clear();
-}
-
-void common_prompt_checkpoint::clear_dft() {
-    data_dft.clear();
-}
--- a/common/common.h
+++ b/common/common.h
@@ -13,7 +13,6 @@
 #include <string_view>
 #include <vector>
 #include <map>
-#include <algorithm>

 #if defined(_WIN32) && !defined(_WIN32_WINNT)
 #define _WIN32_WINNT 0x0A00
@@ -55,7 +54,7 @@ struct common_control_vector_load_info;
 // CPU utils
 //

-struct common_cpu_params {
+struct cpu_params {
    int      n_threads                   = -1;
    bool     cpumask[GGML_MAX_N_THREADS] = {false}; // CPU affinity mask.
    bool     mask_valid                  = false;   // Default: any CPU
@@ -64,8 +63,8 @@ struct common_cpu_params {
    uint32_t poll                        = 50;      // Polling (busywait) level (0 - no polling, 100 - mostly polling)
 };

-int32_t common_cpu_get_num_physical_cores();
-int32_t common_cpu_get_num_math();
+int32_t cpu_get_num_physical_cores();
+int32_t cpu_get_num_math();

 //
 // Common params
@@ -158,10 +157,9 @@ enum common_params_sampling_config : uint64_t {

 enum common_speculative_type {
    COMMON_SPECULATIVE_TYPE_NONE,          // no speculative decoding
-    COMMON_SPECULATIVE_TYPE_DRAFT_SIMPLE,  // standalone draft model speculative decoding
-    COMMON_SPECULATIVE_TYPE_DRAFT_EAGLE3,  // Eagle3 speculative decoding
-    COMMON_SPECULATIVE_TYPE_DRAFT_MTP,     // Multi-token prediction
-    COMMON_SPECULATIVE_TYPE_NGRAM_SIMPLE,  // simple self-speculative decoding based on n-grams
+    COMMON_SPECULATIVE_TYPE_DRAFT,         // draft model
+    COMMON_SPECULATIVE_TYPE_EAGLE3,        // eagle draft model
+    COMMON_SPECULATIVE_TYPE_NGRAM_SIMPLE,  // simple self-speculative decoding
    COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K,   // self-speculative decoding with n-gram keys only
    COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K4V, // self-speculative decoding with n-gram keys and 4 m-gram values
    COMMON_SPECULATIVE_TYPE_NGRAM_MOD,
@@ -276,7 +274,6 @@ struct common_params_sampling {
    std::vector<llama_token> reasoning_budget_start;           // start tag token sequence
    std::vector<llama_token> reasoning_budget_end;             // end tag token sequence
    std::vector<llama_token> reasoning_budget_forced;          // forced sequence (message + end tag)
-    std::string              reasoning_budget_message;         // message injected before end tag when budget exhausted

    bool backend_sampling = false;

@@ -297,82 +294,62 @@ struct common_params_model {
    std::string name        = ""; // in format <user>/<model>[:<tag>] (tag is optional)     // NOLINT
 };

-// draft-model-based speculative decoding parameters
-struct common_params_speculative_draft {
-    int32_t n_max = 3; // maximum number of tokens to draft during speculative decoding
-    int32_t n_min = 0; // minimum number of draft tokens to use for speculative decoding
+struct common_ngram_mod;

-    float p_split = 0.1f; // speculative decoding split probability
-    float p_min   = 0.0f; // minimum speculative decoding probability (greedy)
+struct common_params_speculative {
+    common_speculative_type type = COMMON_SPECULATIVE_TYPE_NONE; // type of speculative decoding

-    common_params_model mparams;
+    // general-purpose speculative decoding parameters

-    llama_context * ctx_tgt = nullptr;
-    llama_context * ctx_dft = nullptr;
+    int32_t n_max   = 16; // maximum number of tokens to draft during speculative decoding
+    int32_t n_min   = 0;  // minimum number of draft tokens to use for speculative decoding
+    float   p_split = 0.1f; // speculative decoding split probability
+    float   p_min   = 0.75f; // minimum speculative decoding probability (greedy)

+    // ngram-based speculative decoding
+
+    uint16_t ngram_size_n   = 12; // ngram size for lookup
+    uint16_t ngram_size_m   = 48; // mgram size for speculative tokens
+    uint16_t ngram_min_hits = 1; // minimum hits at ngram/mgram lookup for mgram to be proposed
+
+    std::shared_ptr<common_ngram_mod> ngram_mod;
+
+    std::string lookup_cache_static;  // path of static ngram cache file for lookup decoding           // NOLINT
+    std::string lookup_cache_dynamic; // path of dynamic ngram cache file for lookup decoding          // NOLINT
+
+    // draft-model speculative decoding
+
+    struct common_params_model mparams_dft;
+
+    llama_model * model_dft = nullptr; // a llama_model that can be shared by multiple speculative contexts
+
+    llama_context_params cparams_dft; // these are the parameters for the draft llama_context
+
+    int32_t n_ctx        = 0;  // draft context size
    int32_t n_gpu_layers = -1; // number of layers to store in VRAM for the draft model (-1 - use default)

    ggml_type cache_type_k = GGML_TYPE_F16; // KV cache data type for the K
    ggml_type cache_type_v = GGML_TYPE_F16; // KV cache data type for the V

-    common_cpu_params cpuparams;
-    common_cpu_params cpuparams_batch;
+    struct cpu_params cpuparams;
+    struct cpu_params cpuparams_batch;

    std::vector<ggml_backend_dev_t> devices; // devices to use for offloading

+    std::vector<std::pair<std::string, std::string>> replacements; // main to speculative model replacements
    std::vector<llama_model_tensor_buft_override> tensor_buft_overrides;
-};
-
-struct common_params_speculative_ngram_mod {
-    int32_t n_match = 24;
-
-    int32_t n_max = 64;
-    int32_t n_min = 48;
-};
-
-struct common_params_speculative_ngram_map {
-    uint16_t size_n   = 12; // ngram size for lookup
-    uint16_t size_m   = 48; // mgram size for speculative tokens
-    uint16_t min_hits = 1;  // minimum hits at ngram/mgram lookup for mgram to be proposed
-};
-
-struct common_params_speculative_ngram_cache {
-    std::string lookup_cache_static;  // path of static ngram cache file for lookup decoding
-    std::string lookup_cache_dynamic; // path of dynamic ngram cache file for lookup decoding
-};
-
-struct common_params_speculative {
-    std::vector<enum common_speculative_type> types = { COMMON_SPECULATIVE_TYPE_NONE };
-
-    // used by Simple, MTP, Eagle3, etc. - all methods that require some kind of draft model
-    common_params_speculative_draft draft;
-
-    common_params_speculative_ngram_mod ngram_mod;
-    common_params_speculative_ngram_map ngram_simple;
-    common_params_speculative_ngram_map ngram_map_k;
-    common_params_speculative_ngram_map ngram_map_k4v;
-
-    common_params_speculative_ngram_cache ngram_cache;

    bool has_dft() const {
-        return !draft.mparams.path.empty() || !draft.mparams.hf_repo.empty();
-    }
-
-    uint32_t need_n_rs_seq() const {
-        bool needs_rs_seq = std::any_of(types.begin(), types.end(), [&](auto t) {
-            return t == COMMON_SPECULATIVE_TYPE_DRAFT_MTP;
-        });
-
-        return needs_rs_seq ? draft.n_max : 0u;
+        return !mparams_dft.path.empty() || !mparams_dft.hf_repo.empty();
    }
 };

 struct common_params_vocoder {
    struct common_params_model model;

-    std::string speaker_file; // speaker file path
+    std::string speaker_file = ""; // speaker file path                                      // NOLINT

-    bool use_guide_tokens = false; // enable guide tokens to improve TTS accuracy
+    bool use_guide_tokens = false; // enable guide tokens to improve TTS accuracy            // NOLINT
 };

 struct common_params_diffusion {
@@ -443,20 +420,19 @@ struct common_params {
    // offload params
    std::vector<ggml_backend_dev_t> devices; // devices to use for offloading

-    int32_t n_gpu_layers       = -1;    // number of layers to store in VRAM, -1 is auto, <= -2 is all
-    int32_t main_gpu           = 0;     // the GPU that is used for scratch and small tensors
-    float   tensor_split[128]  = {0};   // how split tensors should be distributed across GPUs
-    bool    fit_params         = true;  // whether to fit unset model/context parameters to free device memory
-    bool    fit_params_print   = false; // print the estimated required memory to run the model
-    int32_t fit_params_min_ctx = 4096;  // minimum context size to set when trying to reduce memory use
+    int32_t n_gpu_layers       = -1;   // number of layers to store in VRAM, -1 is auto, <= -2 is all
+    int32_t main_gpu           = 0;    // the GPU that is used for scratch and small tensors
+    float   tensor_split[128]  = {0};  // how split tensors should be distributed across GPUs
+    bool    fit_params         = true; // whether to fit unset model/context parameters to free device memory
+    int32_t fit_params_min_ctx = 4096; // minimum context size to set when trying to reduce memory use

    // margin per device in bytes for fitting parameters to free memory:
    std::vector<size_t> fit_params_target = std::vector<size_t>(llama_max_devices(), 1024 * 1024*1024);

    enum llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs

-    common_cpu_params cpuparams;
-    common_cpu_params cpuparams_batch;
+    struct cpu_params cpuparams;
+    struct cpu_params cpuparams_batch;

    ggml_backend_sched_eval_callback cb_eval = nullptr;
    void * cb_eval_user_data                 = nullptr;
@@ -604,6 +580,8 @@ struct common_params {
    bool force_pure_content_parser = false;
    common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
    int enable_reasoning = -1; // -1 = auto, 0 = disable, 1 = enable
+    int reasoning_budget = -1;
+    std::string reasoning_budget_message; // message injected before end tag when budget exhausted
    bool prefill_assistant = true; // if true, any trailing assistant message will be prefilled into the response
    int sleep_idle_seconds = -1;   // if >0, server will sleep after this many seconds of idle time

@@ -614,21 +592,11 @@ struct common_params {

    std::map<std::string, std::string> default_template_kwargs;

-    // UI configs
-#ifdef LLAMA_UI_DEFAULT_ENABLED
-    bool ui = LLAMA_UI_DEFAULT_ENABLED != 0;
-#else
-    bool ui = true; // default to enabled when not set
-#endif
-
-    // Deprecated: use ui, ui_mcp_proxy, ui_config_json instead
-    bool webui = ui;
+    // webui configs
+    bool webui = true;
    bool webui_mcp_proxy = false;
    std::string webui_config_json;

-    bool ui_mcp_proxy = false;
-    std::string ui_config_json;
-
    // "advanced" endpoints are disabled by default for better security
    bool endpoint_slots   = true;
    bool endpoint_props   = false; // only control POST requests, not GET
@@ -706,12 +674,11 @@ struct common_params {
 // initializes the logging system and prints info about the build
 void common_init();

-void common_params_print_info(const common_params & params, bool print_devices = true);
 std::string common_params_get_system_info(const common_params & params);

 bool parse_cpu_range(const std::string & range, bool(&boolmask)[GGML_MAX_N_THREADS]);
 bool parse_cpu_mask(const std::string & mask, bool(&boolmask)[GGML_MAX_N_THREADS]);
-void postprocess_cpu_params(common_cpu_params & cpuparams, const common_cpu_params * role_model = nullptr);
+void postprocess_cpu_params(cpu_params & cpuparams, const cpu_params * role_model = nullptr);
 bool set_process_priority(enum ggml_sched_priority prio);

 //
@@ -779,11 +746,6 @@ inline bool string_starts_with(std::string_view str, std::string_view prefix) {
           str.compare(0, prefix.size(), prefix) == 0;
 }

-// remove when moving to c++20
-inline bool string_starts_with(std::string_view str, char prefix) {
-    return !str.empty() && str.front() == prefix;
-}
-
 // remove when moving to c++20
 inline bool string_ends_with(std::string_view str, std::string_view suffix) {
    return str.size() >= suffix.size() &&
@@ -857,7 +819,7 @@ struct common_sampler;

 // note: defines the model, context, samplers, ets. lifetimes
 struct common_init_result {
-    common_init_result(common_params & params, bool model_only = false);
+    common_init_result(common_params & params);
    ~common_init_result();

    llama_model * model();
@@ -875,11 +837,11 @@ private:

 using common_init_result_ptr = std::unique_ptr<common_init_result>;

-common_init_result_ptr common_init_from_params(common_params & params, bool model_only = false);
+common_init_result_ptr common_init_from_params(common_params & params);

 struct llama_model_params     common_model_params_to_llama  (      common_params & params);
 struct llama_context_params   common_context_params_to_llama(const common_params & params);
-struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const common_cpu_params & params);
+struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_params & params);

 // clear LoRA adapters from context, then apply new list of adapters
 void common_set_adapter_lora(struct llama_context * ctx, std::vector<common_adapter_lora_info> & lora);
@@ -892,20 +854,15 @@ std::string common_get_model_endpoint();
 //

 enum common_context_seq_rm_type {
-    COMMON_CONTEXT_SEQ_RM_TYPE_NO           = 0, // seq_rm not supported (e.g. no memory module)
-    COMMON_CONTEXT_SEQ_RM_TYPE_PART         = 1, // can seq_rm partial sequences
-    COMMON_CONTEXT_SEQ_RM_TYPE_FULL         = 2, // can seq_rm full sequences only
-    COMMON_CONTEXT_SEQ_RM_TYPE_RS = 3, // can seq_rm partial sequences, bounded by n_rs_seq
+    COMMON_CONTEXT_SEQ_RM_TYPE_NO   = 0, // seq_rm not supported (e.g. no memory module)
+    COMMON_CONTEXT_SEQ_RM_TYPE_PART = 1, // can seq_rm partial sequences
+    COMMON_CONTEXT_SEQ_RM_TYPE_FULL = 2, // can seq_rm full sequences only
 };

 // check if the llama_context can remove sequences
 // note: clears the memory of the context
 common_context_seq_rm_type common_context_can_seq_rm(llama_context * ctx);

-// aborts execution on failure
-void common_context_seq_rm (llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1);
-void common_context_seq_add(llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos delta);
-void common_context_seq_cp (llama_context * ctx, llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1);

 //
 // Batch utils
@@ -1044,50 +1001,3 @@ ggml_opt_dataset_t common_opt_dataset_init(struct llama_context * ctx, const std

 // "adamw" or "sgd" (case insensitive)
 enum ggml_opt_optimizer_type common_opt_get_optimizer(const char *);
-
-//
-// prompt utils
-//
-
-struct common_prompt_checkpoint {
-    int64_t n_tokens;
-
-    llama_pos pos_min;
-    llama_pos pos_max;
-
-    std::vector<uint8_t> data_tgt;
-    std::vector<uint8_t> data_dft;
-
-    size_t size() const;
-
-    bool empty() const;
-    void clear();
-
-    void update_pos(
-            int64_t n_tokens,
-            llama_pos pos_min,
-            llama_pos pos_max);
-
-    void update_tgt(
-            llama_context * ctx,
-            llama_seq_id seq_id,
-            llama_state_seq_flags flags);
-
-    void update_dft(
-            llama_context * ctx,
-            llama_seq_id seq_id,
-            llama_state_seq_flags flags);
-
-    void load_tgt(
-            llama_context * ctx,
-            llama_seq_id seq_id,
-            llama_state_seq_flags flags) const;
-
-    void load_dft(
-            llama_context * ctx,
-            llama_seq_id seq_id,
-            llama_state_seq_flags flags) const;
-
-    void clear_tgt();
-    void clear_dft();
-};
--- a/common/debug.cpp
+++ b/common/debug.cpp
@@ -1,38 +1,9 @@
 #include "debug.h"

-#include "common.h"
 #include "log.h"

 #include <cmath>
-#include <regex>
 #include <string>
-#include <vector>
-
-struct common_debug_cb_user_data::impl {
-    std::vector<uint8_t>    data;
-    std::vector<std::regex> tensor_filters;
-    bool                    abort_on_nan{false};
-};
-
-common_debug_cb_user_data::common_debug_cb_user_data() : pimpl(std::make_unique<impl>()) {}
-common_debug_cb_user_data::~common_debug_cb_user_data() = default;
-
-common_debug_cb_user_data::common_debug_cb_user_data(common_params & params, const std::vector<std::string> & filter_patterns, bool abort_on_nan)
-    : pimpl(std::make_unique<impl>())
-{
-    for (const auto & pattern : filter_patterns) {
-        try {
-            std::string anchored_pattern = "^" + pattern;
-            pimpl->tensor_filters.emplace_back(anchored_pattern, std::regex::optimize);
-        } catch (const std::regex_error & e) {
-            throw std::runtime_error("Invalid regex pattern '" + pattern + "': " + e.what());
-        }
-    }
-    pimpl->abort_on_nan = abort_on_nan;
-
-    params.cb_eval           = common_debug_cb_eval;
-    params.cb_eval_user_data = this;
-}

 static std::string common_ggml_ne_string(const ggml_tensor * t) {
    std::string str;
@@ -76,7 +47,8 @@ static float common_ggml_get_float_value(const uint8_t * data,

 #define INDENT "    "

-static void common_debug_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne, const size_t * nb, int64_t n, bool abort_on_nan) {
+template <bool abort>
+void common_debug_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne, const size_t * nb, int64_t n) {
    GGML_ASSERT(n > 0);
    float sum = 0;
    for (int64_t i3 = 0; i3 < ne[3]; i3++) {
@@ -122,7 +94,7 @@ static void common_debug_print_tensor(uint8_t * data, ggml_type type, const int6
        LOG(INDENT "sum = %f\n", sum);
    }

-    if (abort_on_nan) {
+    if constexpr (abort) {
        if (std::isnan(sum)) {
            LOG("encountered NaN - aborting\n");
            exit(0);
@@ -140,9 +112,8 @@ static void common_debug_print_tensor(uint8_t * data, ggml_type type, const int6
 * @param user_data user data to pass at each call back
 * @return true to receive data or continue the graph, false otherwise
 */
-bool common_debug_cb_eval(struct ggml_tensor * t, bool ask, void * user_data) {
-    auto * cb_data = (common_debug_cb_user_data *) user_data;
-    auto * pimpl = cb_data->pimpl.get();
+template <bool abort_on_nan> bool common_debug_cb_eval(struct ggml_tensor * t, bool ask, void * user_data) {
+    auto * cb_data = (base_callback_data *) user_data;

    const struct ggml_tensor * src0 = t->src[0];
    const struct ggml_tensor * src1 = t->src[1];
@@ -151,10 +122,10 @@ bool common_debug_cb_eval(struct ggml_tensor * t, bool ask, void * user_data) {
        return true;  // Always retrieve data
    }

-    bool matches_filter = pimpl->tensor_filters.empty();
+    bool matches_filter = cb_data->tensor_filters.empty();

    if (!matches_filter) {
-        for (const auto & filter : pimpl->tensor_filters) {
+        for (const auto & filter : cb_data->tensor_filters) {
            if (std::regex_search(t->name, filter)) {
                matches_filter = true;
                break;
@@ -177,14 +148,20 @@ bool common_debug_cb_eval(struct ggml_tensor * t, bool ask, void * user_data) {

    if (!is_host) {
        auto n_bytes = ggml_nbytes(t);
-        pimpl->data.resize(n_bytes);
-        ggml_backend_tensor_get(t, pimpl->data.data(), 0, n_bytes);
+        cb_data->data.resize(n_bytes);
+        ggml_backend_tensor_get(t, cb_data->data.data(), 0, n_bytes);
    }

    if (!ggml_is_quantized(t->type) && matches_filter) {
-        uint8_t * data = is_host ? (uint8_t *) t->data : pimpl->data.data();
-        common_debug_print_tensor(data, t->type, t->ne, t->nb, 3, pimpl->abort_on_nan);
+        uint8_t * data = is_host ? (uint8_t *) t->data : cb_data->data.data();
+        common_debug_print_tensor<abort_on_nan>(data, t->type, t->ne, t->nb, 3);
    }

    return true;
 }
+
+// Explicit template instantiations
+template bool common_debug_cb_eval<false>(ggml_tensor *, bool, void *);
+template bool common_debug_cb_eval<true>(ggml_tensor *, bool, void *);
+template void common_debug_print_tensor<false>(uint8_t *, ggml_type, const int64_t *, const size_t *, int64_t);
+template void common_debug_print_tensor<true>(uint8_t *, ggml_type, const int64_t *, const size_t *, int64_t);
--- a/common/debug.h
+++ b/common/debug.h
@@ -1,31 +1,43 @@
 #pragma once
-
-#include <memory>
+#include "common.h"
 #include <string>
 #include <vector>
+#include <regex>

 // common debug functions and structs

-struct common_params;
+// Print a tensor's detailed data
+// data - the tensor's data in byte format
+// type - the tensor's quantization type
+// ne   - the tensor dimensions array
+// nb   - the tensor strides array
+// n    - the number of rows/columns to fully print
+template <bool abort_on_nan> void common_debug_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne, const size_t * nb, int64_t n);

 // Intended to use as callback for ggml_backend_sched_eval_callback
 // prints tensors that are processed in the computation graph
-// by default prints all tensors, but can be configured by creating a `common_debug_cb_user_data` instance with
-// non-empty filter_patterns. See examples/debug.cpp for possible usage patterns
-// `common_debug_cb_user_data` contains `abort_on_nan` flag that determines whether an error should be thrown whenever a NaN is encountered
+// by default prints all tensors, but can be configured by creating a `base_callback_data` instance with
+// non-empty filter_patterns. See examples/debug.ccp for possible usage patterns
+// The template parameter determines whether an error should be thrown whenever a NaN is encountered
 // in a tensor (useful for stopping debug sessions on first erroneous tensor)
 // The callback data will be passed as the third parameter (user_data)
-bool common_debug_cb_eval(struct ggml_tensor * t, bool ask, void * user_data);
+template <bool abort_on_nan> bool common_debug_cb_eval(struct ggml_tensor * t, bool ask, void * user_data);
+struct base_callback_data {
+    std::vector<uint8_t>    data;
+    std::vector<std::regex> tensor_filters;

-struct common_debug_cb_user_data {
-    struct impl;
-    std::unique_ptr<impl> pimpl;
+    base_callback_data() = default;

-    common_debug_cb_user_data();
-    ~common_debug_cb_user_data();
-
-    common_debug_cb_user_data(const common_debug_cb_user_data &) = delete;
-    common_debug_cb_user_data & operator=(const common_debug_cb_user_data &) = delete;
-
-    common_debug_cb_user_data(common_params & params, const std::vector<std::string> & filter_patterns, bool abort_on_nan = false);
+    base_callback_data(common_params & params, const std::vector<std::string> & filter_patterns) {
+        for (const auto & pattern : filter_patterns) {
+            try {
+                std::string anchored_pattern = "^" + pattern;
+                tensor_filters.emplace_back(anchored_pattern, std::regex::optimize);
+            } catch (const std::regex_error & e) {
+                throw std::runtime_error("Invalid regex pattern '" + pattern + "': " + e.what());
+            }
+        }
+        params.cb_eval           = common_debug_cb_eval<false>;
+        params.cb_eval_user_data = this;
+    }
 };
--- a/common/download.cpp
+++ b/common/download.cpp
@@ -320,9 +320,9 @@ static int common_download_file_single_online(const std::string & url,

    auto head = cli.Head(parts.path);
    if (!head || head->status < 200 || head->status >= 300) {
-        LOG_TRC("%s: HEAD failed, status: %d\n", __func__, head ? head->status : -1);
+        LOG_WRN("%s: HEAD failed, status: %d\n", __func__, head ? head->status : -1);
        if (file_exists) {
-            LOG_TRC("%s: using cached file (HEAD failed): %s\n", __func__, path.c_str());
+            LOG_INF("%s: using cached file (HEAD failed): %s\n", __func__, path.c_str());
            return 304; // 304 Not Modified - fake cached response
        }
        return head ? head->status : -1;
@@ -566,11 +566,8 @@ static hf_cache::hf_files get_split_files(const hf_cache::hf_files & files,
    return result;
 }

-// pick the best sibling GGUF whose filename contains `keyword` (e.g. "mmproj" / "mtp"),
-// preferring deeper shared directory prefix with the model, then closest quantization
-static hf_cache::hf_file find_best_sibling(const hf_cache::hf_files & files,
-                                           const std::string        & model,
-                                           const std::string        & keyword) {
+static hf_cache::hf_file find_best_mmproj(const hf_cache::hf_files & files,
+                                          const std::string        & model) {
    hf_cache::hf_file best;
    size_t best_depth = 0;
    int best_diff = 0;
@@ -582,20 +579,20 @@ static hf_cache::hf_file find_best_sibling(const hf_cache::hf_files & files,

    for (const auto & f : files) {
        if (!string_ends_with(f.path, ".gguf") ||
-            f.path.find(keyword) == std::string::npos) {
+            f.path.find("mmproj") == std::string::npos) {
            continue;
        }

-        auto sib_parts = string_split<std::string>(f.path, '/');
-        auto sib_dir = sib_parts.end() - 1;
+        auto mmproj_parts = string_split<std::string>(f.path, '/');
+        auto mmproj_dir = mmproj_parts.end() - 1;

        auto [_, dir] = std::mismatch(model_parts.begin(), model_dir,
-                                      sib_parts.begin(), sib_dir);
-        if (dir != sib_dir) {
+                                      mmproj_parts.begin(), mmproj_dir);
+        if (dir != mmproj_dir) {
            continue;
        }

-        size_t depth = dir - sib_parts.begin();
+        size_t depth = dir - mmproj_parts.begin();
        auto bits = extract_quant_bits(f.path);
        auto diff = std::abs(bits - model_bits);

@@ -609,16 +606,6 @@ static hf_cache::hf_file find_best_sibling(const hf_cache::hf_files & files,
    return best;
 }

-static hf_cache::hf_file find_best_mmproj(const hf_cache::hf_files & files,
-                                          const std::string        & model) {
-    return find_best_sibling(files, model, "mmproj");
-}
-
-static hf_cache::hf_file find_best_mtp(const hf_cache::hf_files & files,
-                                       const std::string        & model) {
-    return find_best_sibling(files, model, "mtp-");
-}
-
 static bool gguf_filename_is_model(const std::string & filepath) {
    if (!string_ends_with(filepath, ".gguf")) {
        return false;
@@ -630,8 +617,7 @@ static bool gguf_filename_is_model(const std::string & filepath) {
    }

    return filename.find("mmproj")  == std::string::npos &&
-           filename.find("imatrix") == std::string::npos &&
-           filename.find("mtp-")    == std::string::npos;
+           filename.find("imatrix") == std::string::npos;
 }

 static hf_cache::hf_file find_best_model(const hf_cache::hf_files & files,
@@ -641,7 +627,7 @@ static hf_cache::hf_file find_best_model(const hf_cache::hf_files & files,
    if (!tag.empty()) {
        tags.push_back(tag);
    } else {
-        tags = {"Q4_K_M", "Q8_0"};
+        tags = {"Q4_K_M", "Q4_0"};
    }

    for (const auto & t : tags) {
@@ -687,13 +673,11 @@ struct hf_plan {
    hf_cache::hf_file primary;
    hf_cache::hf_files model_files;
    hf_cache::hf_file mmproj;
-    hf_cache::hf_file mtp;
 };

 static hf_plan get_hf_plan(const common_params_model  & model,
                           const common_download_opts & opts,
-                           bool download_mmproj,
-                           bool download_mtp) {
+                           bool download_mmproj) {
    hf_plan plan;
    hf_cache::hf_files all;

@@ -739,10 +723,6 @@ static hf_plan get_hf_plan(const common_params_model  & model,
        plan.mmproj = find_best_mmproj(all, primary.path);
    }

-    if (download_mtp) {
-        plan.mtp = find_best_mtp(all, primary.path);
-    }
-
    return plan;
 }

@@ -776,8 +756,7 @@ static std::vector<download_task> get_url_tasks(const common_params_model & mode

 common_download_model_result common_download_model(const common_params_model  & model,
                                                   const common_download_opts & opts,
-                                                   bool download_mmproj,
-                                                   bool download_mtp) {
+                                                   bool download_mmproj) {
    common_download_model_result result;
    std::vector<download_task> tasks;
    hf_plan hf;
@@ -785,16 +764,13 @@ common_download_model_result common_download_model(const common_params_model  &
    bool is_hf = !model.hf_repo.empty();

    if (is_hf) {
-        hf = get_hf_plan(model, opts, download_mmproj, download_mtp);
+        hf = get_hf_plan(model, opts, download_mmproj);
        for (const auto & f : hf.model_files) {
            tasks.push_back({f.url, f.local_path});
        }
        if (!hf.mmproj.path.empty()) {
            tasks.push_back({hf.mmproj.url, hf.mmproj.local_path});
        }
-        if (!hf.mtp.path.empty()) {
-            tasks.push_back({hf.mtp.url, hf.mtp.local_path});
-        }
    } else if (!model.url.empty()) {
        tasks = get_url_tasks(model);
    } else {
@@ -831,10 +807,6 @@ common_download_model_result common_download_model(const common_params_model  &
        if (!hf.mmproj.path.empty()) {
            result.mmproj_path = hf_cache::finalize_file(hf.mmproj);
        }
-
-        if (!hf.mtp.path.empty()) {
-            result.mtp_path = hf_cache::finalize_file(hf.mtp);
-        }
    } else {
        result.model_path = model.path;
    }
@@ -974,8 +946,7 @@ std::vector<common_cached_model_info> common_list_cached_models() {
    for (const auto & f : files) {
        auto split = get_gguf_split_info(f.path);
        if (split.index != 1 || split.tag.empty() ||
-            split.prefix.find("mmproj") != std::string::npos ||
-            split.prefix.find("mtp-")   != std::string::npos) {
+            split.prefix.find("mmproj") != std::string::npos) {
            continue;
        }
        if (seen.insert(f.repo_id + ":" + split.tag).second) {
--- a/common/download.h
+++ b/common/download.h
@@ -59,7 +59,6 @@ struct common_download_opts {
 struct common_download_model_result {
    std::string model_path;
    std::string mmproj_path;
-    std::string mtp_path;
 };

 // Download model from HuggingFace repo or URL
@@ -84,14 +83,12 @@ struct common_download_model_result {
 // when opts.offline=true, no network requests are made
 // when download_mmproj=true, searches for mmproj in same directory as model or any parent directory
 // then with the closest quantization bits
-// when download_mtp=true, applies the same sibling search for an MTP-head GGUF
 //
-// returns result with model_path, mmproj_path and mtp_path (empty when not found / on failure)
+// returns result with model_path and mmproj_path (empty on failure)
 common_download_model_result common_download_model(
    const common_params_model & model,
    const common_download_opts & opts = {},
-    bool download_mmproj = false,
-    bool download_mtp    = false
+    bool download_mmproj = false
 );

 // returns list of cached models
--- a/common/fit.cpp
+++ b/common/fit.cpp
@@ -1,959 +0,0 @@
-#include "fit.h"
-
-#include "log.h"
-
-#include "../src/llama-ext.h"
-
-#include <array>
-#include <cassert>
-#include <stdexcept>
-#include <cinttypes>
-#include <set>
-#include <string>
-#include <vector>
-
-// this enum is only used in llama_params_fit_impl but needs to be defined outside of it to fix a Windows compilation issue
-// enum to identify part of a layer for distributing its tensors:
-enum common_layer_fraction_t {
-    LAYER_FRACTION_NONE = 0, // nothing
-    LAYER_FRACTION_ATTN = 1, // attention
-    LAYER_FRACTION_UP   = 2, // attention + up
-    LAYER_FRACTION_GATE = 3, // attention + up + gate
-    LAYER_FRACTION_MOE  = 4, // everything but sparse MoE weights
-};
-
-class common_params_fit_exception : public std::runtime_error {
-    using std::runtime_error::runtime_error;
-};
-
-static std::vector<llama_device_memory_data> common_get_device_memory_data(
-        const char * path_model,
-        const llama_model_params * mparams,
-        const llama_context_params * cparams,
-        std::vector<ggml_backend_dev_t> & devs,
-        uint32_t & hp_ngl,
-        uint32_t & hp_n_ctx_train,
-        uint32_t & hp_n_expert,
-        ggml_log_level log_level) {
-    struct user_data_t {
-        struct {
-            ggml_log_callback callback;
-            void * user_data;
-        } original_logger;
-        ggml_log_level min_level; // prints below this log level go to debug log
-    };
-    user_data_t ud;
-    llama_log_get(&ud.original_logger.callback, &ud.original_logger.user_data);
-    ud.min_level = log_level;
-
-    llama_log_set([](ggml_log_level level, const char * text, void * user_data) {
-        const user_data_t * ud = (const user_data_t *) user_data;
-        const ggml_log_level level_eff = level >= ud->min_level ? level : GGML_LOG_LEVEL_DEBUG;
-        ud->original_logger.callback(level_eff, text, ud->original_logger.user_data);
-    }, &ud);
-
-    llama_model_params mparams_copy = *mparams;
-    mparams_copy.no_alloc  = true;
-    mparams_copy.use_mmap  = false;
-    mparams_copy.use_mlock = false;
-
-    llama_model * model = llama_model_load_from_file(path_model, mparams_copy);
-    if (model == nullptr) {
-        llama_log_set(ud.original_logger.callback, ud.original_logger.user_data);
-        throw std::runtime_error("failed to load model");
-    }
-
-    llama_context * ctx = llama_init_from_model(model, *cparams);
-    if (ctx == nullptr) {
-        llama_model_free(model);
-        llama_log_set(ud.original_logger.callback, ud.original_logger.user_data);
-        throw std::runtime_error("failed to create llama_context from model");
-    }
-
-    const size_t nd = llama_model_n_devices(model);
-    std::vector<llama_device_memory_data> ret(nd + 1);
-
-    llama_memory_breakdown memory_breakdown = llama_get_memory_breakdown(ctx);
-
-    for (const auto & [buft, mb] : memory_breakdown) {
-        if (ggml_backend_buft_is_host(buft)) {
-            ret.back().mb.model   += mb.model;
-            ret.back().mb.context += mb.context;
-            ret.back().mb.compute += mb.compute;
-            continue;
-        }
-
-        ggml_backend_dev_t dev = ggml_backend_buft_get_device(buft);
-        if (!dev) {
-            continue;
-        }
-        for (size_t i = 0; i < nd; i++) {
-            if (dev == llama_model_get_device(model, i)) {
-                ret[i].mb.model   += mb.model;
-                ret[i].mb.context += mb.context;
-                ret[i].mb.compute += mb.compute;
-                break;
-            }
-        }
-    }
-
-    {
-        ggml_backend_dev_t cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
-        if (cpu_dev == nullptr) {
-            throw std::runtime_error("no CPU backend found");
-        }
-        size_t free;
-        size_t total;
-        ggml_backend_dev_memory(cpu_dev, &free, &total);
-        ret.back().free  = free;
-        ret.back().total = total;
-    }
-    for (size_t i = 0; i < nd; i++) {
-        ggml_backend_dev_t dev = llama_model_get_device(model, i);
-
-        size_t free;
-        size_t total;
-        ggml_backend_dev_memory(dev, &free, &total);
-
-        // Some non-GPU accelerator backends, such as BLAS, report 0/0 and rely on
-        // the host-memory fallback. For GPU-like backends, keep 0/0 so --fit does
-        // not assign anything to a device with an unknown memory budget.
-        if (free == 0 && total == 0) {
-            const enum ggml_backend_dev_type type = ggml_backend_dev_type(dev);
-            if (type == GGML_BACKEND_DEVICE_TYPE_GPU || type == GGML_BACKEND_DEVICE_TYPE_IGPU) {
-                LOG_WRN("%s: device %s did not report memory; --fit will not use it\n",
-                        __func__, ggml_backend_dev_name(dev));
-            } else {
-                free  = ret.back().free;
-                total = ret.back().total;
-            }
-        }
-        ret[i].free  = free;
-        ret[i].total = total;
-    }
-
-    devs.clear();
-    for (int i = 0; i < llama_model_n_devices(model); i++) {
-        devs.push_back(llama_model_get_device(model, i));
-    }
-
-    hp_ngl         = llama_model_n_layer(model);
-    hp_n_ctx_train = llama_model_n_ctx_train(model);
-    hp_n_expert    = llama_model_n_expert(model);
-
-    common_memory_breakdown_print(ctx);
-
-    llama_free(ctx);
-    llama_model_free(model);
-    llama_log_set(ud.original_logger.callback, ud.original_logger.user_data);
-
-    return ret;
-}
-
-static void common_params_fit_impl(
-        const char * path_model, struct llama_model_params * mparams, struct llama_context_params * cparams,
-        float * tensor_split, struct llama_model_tensor_buft_override * tensor_buft_overrides,
-        size_t * margins_s, uint32_t n_ctx_min, enum ggml_log_level log_level) {
-    if (mparams->split_mode == LLAMA_SPLIT_MODE_TENSOR) {
-        throw common_params_fit_exception("llama_params_fit is not implemented for SPLIT_MODE_TENSOR, abort");
-    }
-    constexpr int64_t MiB = 1024*1024;
-    typedef std::vector<llama_device_memory_data> dmds_t;
-    const llama_model_params default_mparams = llama_model_default_params();
-
-    std::vector<ggml_backend_dev_t> devs;
-    uint32_t hp_ngl = 0; // hparams.n_gpu_layers
-    uint32_t hp_nct = 0; // hparams.n_ctx_train
-    uint32_t hp_nex = 0; // hparams.n_expert
-
-    // step 1: get data for default parameters and check whether any changes are necessary in the first place
-
-    LOG_TRC("%s: getting device memory data for initial parameters:\n", __func__);
-    const dmds_t dmds_full = common_get_device_memory_data(path_model, mparams, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level);
-    const size_t nd = devs.size(); // number of devices
-
-    std::vector<int64_t> margins; // this function uses int64_t rather than size_t for memory sizes to more conveniently handle deficits
-    margins.reserve(nd);
-    if (nd == 0) {
-        margins.push_back(margins_s[0]);
-    } else {
-        for (size_t id = 0; id < nd; id++) {
-            margins.push_back(margins_s[id]);
-        }
-    }
-
-    std::vector<std::string> dev_names;
-    {
-        dev_names.reserve(nd);
-        size_t max_length = 0;
-        for (const auto & dev : devs) {
-            std::string name = ggml_backend_dev_name(dev);
-            name += " (";
-            name += ggml_backend_dev_description(dev);
-            name += ")";
-            dev_names.push_back(name);
-            max_length = std::max(max_length, name.length());
-        }
-        for (std::string & dn : dev_names) {
-            dn.insert(dn.end(), max_length - dn.length(), ' ');
-        }
-    }
-
-    int64_t sum_free            = 0;
-    int64_t sum_projected_free  = 0;
-    int64_t sum_projected_used  = 0;
-    int64_t sum_projected_model = 0;
-    std::vector<int64_t> projected_free_per_device;
-    projected_free_per_device.reserve(nd);
-
-    if (nd == 0) {
-        sum_projected_used = dmds_full.back().mb.total();
-        sum_free           = dmds_full.back().total;
-        sum_projected_free = sum_free - sum_projected_used;
-        LOG_INF("%s: projected to use %" PRId64 " MiB of host memory vs. %" PRId64 " MiB of total host memory\n",
-            __func__, sum_projected_used/MiB, sum_free/MiB);
-        if (sum_projected_free >= margins[0]) {
-            LOG_TRC("%s: will leave %" PRId64 " >= %" PRId64 " MiB of system memory, no changes needed\n",
-                __func__, sum_projected_free/MiB, margins[0]/MiB);
-            return;
-        }
-    } else {
-        if (nd > 1) {
-            LOG_TRC("%s: projected memory use with initial parameters [MiB]:\n", __func__);
-        }
-        for (size_t id = 0; id < nd; id++) {
-            const llama_device_memory_data & dmd = dmds_full[id];
-
-            const int64_t projected_used = dmd.mb.total();
-            const int64_t projected_free = dmd.free - projected_used;
-            projected_free_per_device.push_back(projected_free);
-
-            sum_free            += dmd.free;
-            sum_projected_used  += projected_used;
-            sum_projected_free  += projected_free;
-            sum_projected_model += dmd.mb.model;
-
-            if (nd > 1) {
-                LOG_TRC("%s:   - %s: %6" PRId64 " total, %6" PRId64 " used, %6" PRId64 " free vs. target of %6" PRId64 "\n",
-                    __func__, dev_names[id].c_str(), dmd.total/MiB, projected_used/MiB, projected_free/MiB, margins[id]/MiB);
-            }
-        }
-        assert(sum_free >= 0 && sum_projected_used >= 0);
-        LOG_TRC("%s: projected to use %" PRId64 " MiB of device memory vs. %" PRId64 " MiB of free device memory\n",
-            __func__, sum_projected_used/MiB, sum_free/MiB);
-        if (nd == 1) {
-            if (projected_free_per_device[0] >= margins[0]) {
-                LOG_TRC("%s: will leave %" PRId64 " >= %" PRId64 " MiB of free device memory, no changes needed\n",
-                    __func__, projected_free_per_device[0]/MiB, margins[0]/MiB);
-                return;
-            }
-        } else {
-            bool changes_needed = false;
-            for (size_t id = 0; id < nd; id++) {
-                if (projected_free_per_device[id] < margins[id]) {
-                    changes_needed = true;
-                    break;
-                }
-            }
-            if (!changes_needed) {
-                LOG_TRC("%s: targets for free memory can be met on all devices, no changes needed\n", __func__);
-                return;
-            }
-        }
-    }
-
-    // step 2: try reducing memory use by reducing the context size
-
-    {
-        int64_t global_surplus = sum_projected_free;
-        if (nd == 0) {
-            global_surplus -= margins[0];
-        } else {
-            for (size_t id = 0; id < nd; id++) {
-                global_surplus -= margins[id];
-            }
-        }
-        if (global_surplus < 0) {
-            if (nd <= 1) {
-                LOG_TRC("%s: cannot meet free memory target of %" PRId64 " MiB, need to reduce device memory by %" PRId64 " MiB\n",
-                    __func__, margins[0]/MiB, -global_surplus/MiB);
-            } else {
-                LOG_TRC(
-                    "%s: cannot meet free memory targets on all devices, need to use %" PRId64 " MiB less in total\n",
-                    __func__, -global_surplus/MiB);
-            }
-            if (cparams->n_ctx == 0) {
-                if (hp_nct > n_ctx_min) {
-                    int64_t sum_used_target = sum_free;
-                    if (nd == 0) {
-                        sum_used_target -= margins[0];
-                    } else {
-                        for (size_t id = 0; id < nd; id++) {
-                            sum_used_target -= margins[id];
-                        }
-                    }
-                    if (nd > 1) {
-                        // for multiple devices we need to be more conservative in terms of how much context we think can fit:
-                        //   - for dense models only whole layers can be assigned to devices
-                        //   - for MoE models only whole tensors can be assigned to devices, which we estimate to be <= 1/3 of a layer
-                        //   - on average we expect a waste of 0.5 layers/tensors per device
-                        //   - use slightly more than the expected average for nd devices to be safe
-                        const int64_t model_per_layer = sum_projected_model / std::min(uint32_t(mparams->n_gpu_layers), hp_ngl);
-                        sum_used_target -= (nd + 1) * model_per_layer / (hp_nex == 0 ? 2 : 6);
-                    }
-
-                    int64_t sum_projected_used_min_ctx = 0;
-                    cparams->n_ctx = n_ctx_min;
-                    const dmds_t dmds_min_ctx = common_get_device_memory_data(path_model, mparams, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level);
-                    if (nd == 0) {
-                        sum_projected_used_min_ctx = dmds_min_ctx.back().mb.total();
-                    } else {
-                        for (size_t id = 0; id < nd; id++) {
-                            sum_projected_used_min_ctx += dmds_min_ctx[id].mb.total();
-                        }
-                    }
-                    if (sum_used_target > sum_projected_used_min_ctx) {
-                        // linear interpolation between minimum and maximum context size:
-                        cparams->n_ctx += (hp_nct - n_ctx_min) * (sum_used_target - sum_projected_used_min_ctx)
-                            / (sum_projected_used - sum_projected_used_min_ctx);
-                        cparams->n_ctx = std::max(cparams->n_ctx - cparams->n_ctx % 256, n_ctx_min); // round down context for CUDA backend
-
-                        const int64_t bytes_per_ctx = (sum_projected_used - sum_projected_used_min_ctx) / (hp_nct - n_ctx_min);
-                        const int64_t memory_reduction = (hp_nct - cparams->n_ctx) * bytes_per_ctx;
-                        LOG_TRC("%s: context size reduced from %" PRIu32 " to %" PRIu32 " -> need %" PRId64 " MiB less memory in total\n",
-                            __func__, hp_nct, cparams->n_ctx, memory_reduction/MiB);
-                        if (nd <= 1) {
-                            LOG_TRC("%s: entire model can be fit by reducing context\n", __func__);
-                            return;
-                        }
-                        LOG_TRC("%s: entire model should be fit across devices by reducing context\n", __func__);
-                    } else {
-                        const int64_t memory_reduction = sum_projected_used - sum_projected_used_min_ctx;
-                        LOG_TRC("%s: context size reduced from %" PRIu32 " to %" PRIu32 " -> need %" PRId64 " MiB less memory in total\n",
-                            __func__, hp_nct, cparams->n_ctx, memory_reduction/MiB);
-                    }
-                } else {
-                    if (n_ctx_min == UINT32_MAX) {
-                        LOG_TRC("%s: user has requested full context size of %" PRIu32 " -> no change\n", __func__, hp_nct);
-                    } else {
-                        LOG_TRC("%s: default model context size is %" PRIu32 " which is <= the min. context size of %" PRIu32 " -> no change\n",
-                            __func__, hp_nct, n_ctx_min);
-                    }
-                }
-            } else {
-                LOG_TRC("%s: context size set by user to %" PRIu32 " -> no change\n", __func__, cparams->n_ctx);
-            }
-        }
-    }
-    if (nd == 0) {
-        throw common_params_fit_exception("was unable to fit model into system memory by reducing context, abort");
-    }
-
-    if (mparams->n_gpu_layers != default_mparams.n_gpu_layers) {
-        throw common_params_fit_exception("n_gpu_layers already set by user to " + std::to_string(mparams->n_gpu_layers) + ", abort");
-    }
-    if (nd > 1) {
-        if (!tensor_split) {
-            throw common_params_fit_exception("did not provide a buffer to write the tensor_split to, abort");
-        }
-        if (mparams->tensor_split) {
-            for (size_t id = 0; id < nd; id++) {
-                if (mparams->tensor_split[id] != 0.0f) {
-                    throw common_params_fit_exception("model_params::tensor_split already set by user, abort");
-                }
-            }
-        }
-        if (mparams->split_mode == LLAMA_SPLIT_MODE_ROW) {
-            throw common_params_fit_exception("changing weight allocation for LLAMA_SPLIT_MODE_ROW not implemented, abort");
-        }
-    }
-    if (!tensor_buft_overrides) {
-        throw common_params_fit_exception("did not provide buffer to set tensor_buft_overrides, abort");
-    }
-    if (mparams->tensor_buft_overrides && (mparams->tensor_buft_overrides->pattern || mparams->tensor_buft_overrides->buft)) {
-        throw common_params_fit_exception("model_params::tensor_buft_overrides already set by user, abort");
-    }
-
-    // step 3: iteratively fill the back to front with "dense" layers
-    //   - for a dense model simply fill full layers, giving each device a contiguous slice of the model
-    //   - for a MoE model, same as dense model but with all MoE tensors in system memory
-
-    // utility function that returns a static C string matching the tensors for a specific layer index and layer fraction:
-    auto get_overflow_pattern = [&](const size_t il, const common_layer_fraction_t lf) -> const char * {
-        constexpr size_t n_strings = 1000;
-        if (il >= n_strings) {
-            throw std::runtime_error("at most " + std::to_string(n_strings) + " model layers are supported");
-        }
-        switch (lf) {
-            case LAYER_FRACTION_ATTN: {
-                static std::array<std::string, n_strings> patterns;
-                if (patterns[il].empty()) {
-                    patterns[il] = "blk\\." + std::to_string(il) + "\\.ffn_(gate|up|gate_up|down).*";
-                }
-                return patterns[il].c_str();
-            }
-            case LAYER_FRACTION_UP: {
-                static std::array<std::string, n_strings> patterns;
-                if (patterns[il].empty()) {
-                    patterns[il] = "blk\\." + std::to_string(il) + "\\.ffn_(gate|gate_up|down).*";
-                }
-                return patterns[il].c_str();
-            }
-            case LAYER_FRACTION_GATE: {
-                static std::array<std::string, n_strings> patterns;
-                if (patterns[il].empty()) {
-                    patterns[il] = "blk\\." + std::to_string(il) + "\\.ffn_down.*";
-                }
-                return patterns[il].c_str();
-            }
-            case LAYER_FRACTION_MOE: {
-                static std::array<std::string, n_strings> patterns;
-                if (patterns[il].empty()) {
-                    patterns[il] = "blk\\." + std::to_string(il) + "\\.ffn_(up|down|gate_up|gate)_(ch|)exps";
-                }
-                return patterns[il].c_str();
-            }
-            default:
-                GGML_ABORT("fatal error");
-        }
-    };
-
-    struct ngl_t {
-        uint32_t n_layer = 0; // number of total layers
-        uint32_t n_part  = 0; // number of partial layers, <= n_layer
-
-        // for the first partial layer varying parts can overflow, all further layers use LAYER_FRACTION_MOE:
-        common_layer_fraction_t overflow_type = LAYER_FRACTION_MOE;
-
-        uint32_t n_full() const {
-            assert(n_layer >= n_part);
-            return n_layer - n_part;
-        }
-    };
-
-    const size_t ntbo = llama_max_tensor_buft_overrides();
-
-    // utility function to set n_gpu_layers and tensor_split
-    auto set_ngl_tensor_split_tbo = [&](
-            const std::vector<ngl_t> & ngl_per_device,
-            const std::vector<ggml_backend_buffer_type_t> & overflow_bufts,
-            llama_model_params & mparams) {
-        mparams.n_gpu_layers = 0;
-        for (size_t id = 0; id < nd; id++) {
-            mparams.n_gpu_layers += ngl_per_device[id].n_layer;
-            if (nd > 1) {
-                tensor_split[id] = ngl_per_device[id].n_layer;
-            }
-        }
-        assert(uint32_t(mparams.n_gpu_layers) <= hp_ngl + 1);
-        uint32_t il0 = hp_ngl + 1 - mparams.n_gpu_layers; // start index for tensor buft overrides
-
-        mparams.tensor_split = tensor_split;
-
-        size_t itbo = 0;
-        for (size_t id = 0; id < nd; id++) {
-            il0 += ngl_per_device[id].n_full();
-            for (uint32_t il = il0; il < il0 + ngl_per_device[id].n_part; il++) {
-                if (itbo + 1 >= ntbo) {
-                    tensor_buft_overrides[itbo].pattern = nullptr;
-                    tensor_buft_overrides[itbo].buft    = nullptr;
-                    itbo++;
-                    mparams.tensor_buft_overrides = tensor_buft_overrides;
-                    throw common_params_fit_exception("llama_max_tensor_buft_overrides() == "
-                        + std::to_string(ntbo) + " is insufficient for model");
-                }
-                tensor_buft_overrides[itbo].pattern = get_overflow_pattern(il, il == il0 ? ngl_per_device[id].overflow_type : LAYER_FRACTION_MOE);
-                tensor_buft_overrides[itbo].buft = il == il0 ? overflow_bufts[id] : ggml_backend_cpu_buffer_type();
-                itbo++;
-            }
-            il0 += ngl_per_device[id].n_part;
-        }
-        tensor_buft_overrides[itbo].pattern = nullptr;
-        tensor_buft_overrides[itbo].buft    = nullptr;
-        itbo++;
-        mparams.tensor_buft_overrides = tensor_buft_overrides;
-    };
-
-    // utility function that returns the memory use per device for given numbers of layers per device
-    auto get_memory_for_layers = [&](
-            const char * func_name,
-            const std::vector<ngl_t> & ngl_per_device,
-            const std::vector<ggml_backend_buffer_type_t> & overflow_bufts) -> std::vector<int64_t> {
-        llama_model_params mparams_copy = *mparams;
-        set_ngl_tensor_split_tbo(ngl_per_device, overflow_bufts, mparams_copy);
-
-        const dmds_t dmd_nl = common_get_device_memory_data(
-            path_model, &mparams_copy, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level);
-
-        LOG_TRC("%s: memory for test allocation by device:\n", func_name);
-        for (size_t id = 0; id < nd; id++) {
-            const ngl_t & n = ngl_per_device[id];
-            LOG_TRC(
-                "%s: id=%zu, n_layer=%2" PRIu32 ", n_part=%2" PRIu32 ", overflow_type=%d, mem=%6" PRId64 " MiB\n",
-                func_name, id, n.n_layer, n.n_part, int(n.overflow_type), dmd_nl[id].mb.total()/MiB);
-        }
-
-        std::vector<int64_t> ret;
-        ret.reserve(nd);
-        for (size_t id = 0; id < nd; id++) {
-            ret.push_back(dmd_nl[id].mb.total());
-        }
-        return ret;
-    };
-
-    int64_t global_surplus_cpu_moe = 0;
-    if (hp_nex > 0) {
-        const static std::string pattern_moe_all = "blk\\.\\d+\\.ffn_(up|down|gate_up|gate)_(ch|)exps"; // matches all MoE tensors
-        ggml_backend_buffer_type_t cpu_buft = ggml_backend_cpu_buffer_type();
-        tensor_buft_overrides[0] = {pattern_moe_all.c_str(), cpu_buft};
-        tensor_buft_overrides[1] = {nullptr, nullptr};
-        mparams->tensor_buft_overrides = tensor_buft_overrides;
-
-        LOG_TRC("%s: getting device memory data with all MoE tensors moved to system memory:\n", __func__);
-        const dmds_t dmds_cpu_moe = common_get_device_memory_data(
-            path_model, mparams, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level);
-
-        for (size_t id = 0; id < nd; id++) {
-            global_surplus_cpu_moe += dmds_cpu_moe[id].free;
-            global_surplus_cpu_moe -= int64_t(dmds_cpu_moe[id].mb.total()) + margins[id];
-        }
-
-        if (global_surplus_cpu_moe > 0) {
-            LOG_TRC("%s: with only dense weights in device memory there is a total surplus of %" PRId64 " MiB\n",
-                __func__, global_surplus_cpu_moe/MiB);
-        } else {
-            LOG_TRC("%s: with only dense weights in device memory there is still a total deficit of %" PRId64 " MiB\n",
-                __func__, -global_surplus_cpu_moe/MiB);
-        }
-
-        // reset
-        tensor_buft_overrides[0] = {nullptr, nullptr};
-        mparams->tensor_buft_overrides = tensor_buft_overrides;
-    }
-
-    std::vector<int64_t> targets; // maximum acceptable memory use per device
-    targets.reserve(nd);
-    for (size_t id = 0; id < nd; id++) {
-        targets.push_back(dmds_full[id].free - margins[id]);
-        LOG_TRC("%s: id=%zu, target=%" PRId64 " MiB\n", __func__, id, targets[id]/MiB);
-    }
-
-    std::vector<ggml_backend_buffer_type_t> overflow_bufts; // which bufts the first partial layer of a device overflows to:
-    overflow_bufts.reserve(nd);
-    for (size_t id = 0; id < nd; id++) {
-        overflow_bufts.push_back(ggml_backend_cpu_buffer_type());
-    }
-
-    std::vector<ngl_t> ngl_per_device(nd);
-    std::vector<int64_t> mem = get_memory_for_layers(__func__, ngl_per_device, overflow_bufts);
-
-    // optimize the number of layers per device using the method of false position:
-    //   - ngl_per_device has 0 layers for each device, lower bound
-    //   - try a "high" configuration where a device is given all unassigned layers
-    //   - interpolate the memory use / layer between low and high linearly to get a guess where it meets our target
-    //   - check memory use of our guess, replace either the low or high bound
-    //   - once we only have a difference of a single layer, stop and return the lower bound that just barely still fits
-    //   - the last device has the output layer, which cannot be a partial layer
-    if (hp_nex == 0) {
-        LOG_TRC("%s: filling dense layers back-to-front:\n", __func__);
-    } else {
-        LOG_TRC("%s: filling dense-only layers back-to-front:\n", __func__);
-    }
-    for (int id = nd - 1; id >= 0; id--) {
-        uint32_t n_unassigned = hp_ngl + 1;
-        for (size_t jd = id + 1; jd < nd; ++jd) {
-            assert(n_unassigned >= ngl_per_device[jd].n_layer);
-            n_unassigned -= ngl_per_device[jd].n_layer;
-        }
-
-        std::vector<ngl_t> ngl_per_device_high = ngl_per_device;
-        ngl_per_device_high[id].n_layer = n_unassigned;
-        if (hp_nex > 0) {
-            ngl_per_device_high[id].n_part = size_t(id) < nd - 1 ? ngl_per_device_high[id].n_layer : ngl_per_device_high[id].n_layer - 1;
-        }
-        if (ngl_per_device_high[id].n_layer > 0) {
-            std::vector<int64_t> mem_high = get_memory_for_layers(__func__, ngl_per_device_high, overflow_bufts);
-            if (mem_high[id] > targets[id]) {
-                assert(ngl_per_device_high[id].n_layer > ngl_per_device[id].n_layer);
-                uint32_t delta = ngl_per_device_high[id].n_layer - ngl_per_device[id].n_layer;
-                LOG_TRC("%s: start filling device %" PRIu32 ", delta=%" PRIu32 "\n", __func__, id, delta);
-                while (delta > 1) {
-                    uint32_t step_size = int64_t(delta) * (targets[id] - mem[id]) / (mem_high[id] - mem[id]);
-                    step_size = std::max(step_size, uint32_t(1));
-                    step_size = std::min(step_size, delta - 1);
-
-                    std::vector<ngl_t> ngl_per_device_test = ngl_per_device;
-                    ngl_per_device_test[id].n_layer += step_size;
-                    if (hp_nex) {
-                        ngl_per_device_test[id].n_part += size_t(id) == nd - 1 && ngl_per_device_test[id].n_part == 0 ?
-                            step_size - 1 : step_size; // the first layer is the output layer which must always be full
-                    }
-                    const std::vector<int64_t> mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts);
-
-                    if (mem_test[id] <= targets[id]) {
-                        ngl_per_device = ngl_per_device_test;
-                        mem            = mem_test;
-                        LOG_TRC("%s: set ngl_per_device[%d].n_layer=%" PRIu32 "\n", __func__, id, ngl_per_device[id].n_layer);
-                    } else {
-                        ngl_per_device_high = ngl_per_device_test;
-                        mem_high            = mem_test;
-                        LOG_TRC("%s: set ngl_per_device_high[%d].n_layer=%" PRIu32 "\n", __func__, id, ngl_per_device_high[id].n_layer);
-                    }
-                    delta = ngl_per_device_high[id].n_layer - ngl_per_device[id].n_layer;
-                }
-            } else {
-                assert(ngl_per_device_high[id].n_layer == n_unassigned);
-                ngl_per_device = ngl_per_device_high;
-                mem            = mem_high;
-                LOG_TRC("%s: set ngl_per_device[%d].n_layer=%" PRIu32 "\n", __func__, id, ngl_per_device[id].n_layer);
-            }
-        }
-
-        const int64_t projected_margin = dmds_full[id].free - mem[id];
-        LOG_TRC(
-            "%s:   - %s: %2" PRIu32 " layers, %6" PRId64 " MiB used, %6" PRId64 " MiB free\n",
-            __func__, dev_names[id].c_str(), ngl_per_device[id].n_layer, mem[id]/MiB, projected_margin/MiB);
-    }
-    if (hp_nex == 0 || global_surplus_cpu_moe <= 0) {
-        set_ngl_tensor_split_tbo(ngl_per_device, overflow_bufts, *mparams);
-        return;
-    }
-
-    // step 4: for a MoE model where all dense tensors fit,
-    //     convert the dense-only layers in the back to full layers in the front until all devices are full
-    // essentially the same procedure as for the dense-only layers except front-to-back
-    // also, try fitting at least part of one more layer to reduce waste for "small" GPUs with e.g. 24 GiB VRAM
-
-    size_t id_dense_start = nd;
-    for (int id = nd - 1; id >= 0; id--) {
-        if (ngl_per_device[id].n_layer > 0) {
-            id_dense_start = id;
-            continue;
-        }
-        break;
-    }
-    assert(id_dense_start < nd);
-
-    LOG_TRC("%s: converting dense-only layers to full layers and filling them front-to-back with overflow to next device/system memory:\n", __func__);
-    for (size_t id = 0; id <= id_dense_start && id_dense_start < nd; id++) {
-        std::vector<ngl_t> ngl_per_device_high = ngl_per_device;
-        for (size_t jd = id_dense_start; jd < nd; jd++) {
-            const uint32_t n_layer_move = jd < nd - 1 ? ngl_per_device_high[jd].n_layer : ngl_per_device_high[jd].n_layer - 1;
-            ngl_per_device_high[id].n_layer += n_layer_move;
-            ngl_per_device_high[jd].n_layer -= n_layer_move;
-            ngl_per_device_high[jd].n_part = 0;
-        }
-        size_t id_dense_start_high = nd - 1;
-        std::vector<int64_t> mem_high = get_memory_for_layers(__func__, ngl_per_device_high, overflow_bufts);
-
-        if (mem_high[id] > targets[id]) {
-            assert(ngl_per_device_high[id].n_full() >= ngl_per_device[id].n_full());
-            uint32_t delta = ngl_per_device_high[id].n_full() - ngl_per_device[id].n_full();
-            while (delta > 1) {
-                uint32_t step_size = int64_t(delta) * (targets[id] - mem[id]) / (mem_high[id] - mem[id]);
-                step_size = std::max(step_size, uint32_t(1));
-                step_size = std::min(step_size, delta - 1);
-
-                std::vector<ngl_t> ngl_per_device_test = ngl_per_device;
-                size_t id_dense_start_test = id_dense_start;
-                uint32_t n_converted_test = 0;
-                for (;id_dense_start_test < nd; id_dense_start_test++) {
-                    const uint32_t n_convert_jd = std::min(step_size - n_converted_test, ngl_per_device_test[id_dense_start_test].n_part);
-                    ngl_per_device_test[id_dense_start_test].n_layer -= n_convert_jd;
-                    ngl_per_device_test[id_dense_start_test].n_part -= n_convert_jd;
-                    ngl_per_device_test[id].n_layer += n_convert_jd;
-                    n_converted_test += n_convert_jd;
-
-                    if (ngl_per_device_test[id_dense_start_test].n_part > 0) {
-                        break;
-                    }
-                }
-                const std::vector<int64_t> mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts);
-
-                if (mem_test[id] <= targets[id]) {
-                    ngl_per_device = ngl_per_device_test;
-                    mem            = mem_test;
-                    id_dense_start = id_dense_start_test;
-                    LOG_TRC("%s: set ngl_per_device[%zu].(n_layer, n_part)=(%" PRIu32 ", %" PRIu32 "), id_dense_start=%zu\n",
-                        __func__, id, ngl_per_device[id].n_layer, ngl_per_device[id].n_part, id_dense_start);
-                } else {
-                    ngl_per_device_high = ngl_per_device_test;
-                    mem_high            = mem_test;
-                    id_dense_start_high = id_dense_start_test;
-                    LOG_TRC("%s: set ngl_per_device_high[%zu].(n_layer, n_part)=(%" PRIu32 ", %" PRIu32 "), id_dense_start_high=%zu\n",
-                        __func__, id, ngl_per_device_high[id].n_layer, ngl_per_device_high[id].n_part, id_dense_start_high);
-                }
-                assert(ngl_per_device_high[id].n_full() >= ngl_per_device[id].n_full());
-                delta = ngl_per_device_high[id].n_full() - ngl_per_device[id].n_full();
-            }
-        } else {
-            ngl_per_device = ngl_per_device_high;
-            mem            = mem_high;
-            id_dense_start = id_dense_start_high;
-            LOG_TRC("%s: set ngl_per_device[%zu].(n_layer, n_part)=(%" PRIu32 ", %" PRIu32 "), id_dense_start=%zu\n",
-                __func__, id, ngl_per_device[id].n_layer, ngl_per_device[id].n_part, id_dense_start);
-        }
-
-        // try to fit at least part of one more layer
-        if (ngl_per_device[id_dense_start].n_layer > (id < nd - 1 ? 0 : 1)) {
-            std::vector<ngl_t> ngl_per_device_test = ngl_per_device;
-            size_t id_dense_start_test = id_dense_start;
-            ngl_per_device_test[id_dense_start_test].n_layer--;
-            ngl_per_device_test[id_dense_start_test].n_part--;
-            ngl_per_device_test[id].n_layer++;
-            ngl_per_device_test[id].n_part++;
-            if (ngl_per_device_test[id_dense_start_test].n_part == 0) {
-                id_dense_start_test++;
-            }
-            ngl_per_device_test[id].overflow_type = LAYER_FRACTION_UP;
-            std::vector<ggml_backend_buffer_type_t> overflow_bufts_test = overflow_bufts;
-            if (id < nd - 1) {
-                overflow_bufts_test[id] = ggml_backend_dev_buffer_type(devs[id + 1]);
-            }
-            LOG_TRC("%s: trying to fit one extra layer with overflow_type=LAYER_FRACTION_UP\n", __func__);
-            std::vector<int64_t> mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts_test);
-            if (mem_test[id] < targets[id] && (id + 1 == nd || mem_test[id + 1] < targets[id + 1])) {
-                ngl_per_device = ngl_per_device_test;
-                overflow_bufts = overflow_bufts_test;
-                mem            = mem_test;
-                id_dense_start = id_dense_start_test;
-                LOG_TRC("%s: set ngl_per_device[%zu].(n_layer, n_part, overflow_type)=(%" PRIu32 ", %" PRIu32 ", UP), id_dense_start=%zu\n",
-                    __func__, id, ngl_per_device[id].n_layer, ngl_per_device[id].n_part, id_dense_start);
-
-                ngl_per_device_test[id].overflow_type = LAYER_FRACTION_GATE;
-                LOG_TRC("%s: trying to fit one extra layer with overflow_type=LAYER_FRACTION_GATE\n", __func__);
-                mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts_test);
-                if (mem_test[id] < targets[id] && (id + 1 == nd || mem_test[id + 1] < targets[id + 1])) {
-                    ngl_per_device = ngl_per_device_test;
-                    overflow_bufts = overflow_bufts_test;
-                    mem            = mem_test;
-                    id_dense_start = id_dense_start_test;
-                    LOG_TRC("%s: set ngl_per_device[%zu].(n_layer, n_part, overflow_type)=(%" PRIu32 ", %" PRIu32 ", GATE), id_dense_start=%zu\n",
-                        __func__, id, ngl_per_device[id].n_layer, ngl_per_device[id].n_part, id_dense_start);
-                }
-            } else {
-                ngl_per_device_test[id].overflow_type = LAYER_FRACTION_ATTN;
-                LOG_TRC("%s: trying to fit one extra layer with overflow_type=LAYER_FRACTION_ATTN\n", __func__);
-                mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts_test);
-                if (mem_test[id] < targets[id] && (id + 1 == nd || mem_test[id + 1] < targets[id + 1])) {
-                    ngl_per_device = ngl_per_device_test;
-                    overflow_bufts = overflow_bufts_test;
-                    mem            = mem_test;
-                    id_dense_start = id_dense_start_test;
-                    LOG_TRC("%s: set ngl_per_device[%zu].(n_layer, n_part, overflow_type)=(%" PRIu32 ", %" PRIu32 ", ATTN), id_dense_start=%zu\n",
-                        __func__, id, ngl_per_device[id].n_layer, ngl_per_device[id].n_part, id_dense_start);
-                }
-            }
-        }
-
-        const int64_t projected_margin = dmds_full[id].free - mem[id];
-        LOG_TRC(
-            "%s:   - %s: %2" PRIu32 " layers (%2" PRIu32 " overflowing), %6" PRId64 " MiB used, %6" PRId64 " MiB free\n",
-            __func__, dev_names[id].c_str(), ngl_per_device[id].n_layer, ngl_per_device[id].n_part, mem[id]/MiB, projected_margin/MiB);
-    }
-
-    // print info for devices that were not changed during the conversion from dense only to full layers:
-    for (size_t id = id_dense_start + 1; id < nd; id++) {
-        const int64_t projected_margin = dmds_full[id].free - mem[id];
-        LOG_TRC(
-            "%s:   - %s: %2" PRIu32 " layers (%2" PRIu32 " overflowing), %6" PRId64 " MiB used, %6" PRId64 " MiB free\n",
-            __func__, dev_names[id].c_str(), ngl_per_device[id].n_layer, ngl_per_device[id].n_part, mem[id]/MiB, projected_margin/MiB);
-    }
-
-    set_ngl_tensor_split_tbo(ngl_per_device, overflow_bufts, *mparams);
-}
-
-enum common_params_fit_status common_fit_params(
-        const char * path_model,
-        llama_model_params * mparams,
-        llama_context_params * cparams,
-        float * tensor_split,
-        llama_model_tensor_buft_override * tensor_buft_overrides,
-        size_t * margins,
-        uint32_t n_ctx_min,
-        ggml_log_level log_level) {
-    const int64_t t0_us = llama_time_us();
-    common_params_fit_status status = COMMON_PARAMS_FIT_STATUS_SUCCESS;
-    try {
-        common_params_fit_impl(path_model, mparams, cparams, tensor_split, tensor_buft_overrides, margins, n_ctx_min, log_level);
-        LOG_TRC("%s: successfully fit params to free device memory\n", __func__);
-    } catch (const common_params_fit_exception & e) {
-        LOG_WRN("%s: failed to fit params to free device memory: %s\n", __func__, e.what());
-        status = COMMON_PARAMS_FIT_STATUS_FAILURE;
-    } catch (const std::runtime_error & e) {
-        LOG_ERR("%s: encountered an error while trying to fit params to free device memory: %s\n", __func__, e.what());
-        status = COMMON_PARAMS_FIT_STATUS_ERROR;
-    }
-    const int64_t t1_us = llama_time_us();
-    LOG_TRC("%s: fitting params to free memory took %.2f seconds\n", __func__, (t1_us - t0_us) * 1e-6);
-    return status;
-}
-
-void common_memory_breakdown_print(const struct llama_context * ctx) {
-    //const auto & devices = ctx->get_model().devices;
-    const auto * model = llama_get_model(ctx);
-
-    std::vector<ggml_backend_dev_t> devices;
-    for (int i = 0; i < llama_model_n_devices(model); i++) {
-        devices.push_back(llama_model_get_device(model, i));
-    }
-
-    llama_memory_breakdown memory_breakdown = llama_get_memory_breakdown(ctx);
-
-    std::vector<std::array<std::string, 9>> table_data;
-    table_data.reserve(devices.size());
-    const std::string template_header = "%s: | %s | %s   %s    %s   %s   %s   %s    %s |\n";
-    const std::string template_gpu    = "%s: | %s | %s = %s + (%s = %s + %s + %s) + %s |\n";
-    const std::string template_other  = "%s: | %s | %s   %s    %s = %s + %s + %s    %s |\n";
-
-    table_data.push_back({template_header, "memory breakdown [MiB]", "total", "free", "self", "model", "context", "compute", "unaccounted"});
-
-    constexpr size_t MiB = 1024 * 1024;
-    const std::vector<std::string> desc_prefixes_strip = {"NVIDIA ", "GeForce ", "Tesla ", "AMD ", "Radeon ", "Instinct "};
-
-    // track seen buffer types to avoid double counting:
-    std::set<ggml_backend_buffer_type_t> seen_buffer_types;
-
-    // accumulative memory breakdown for each device and for host:
-    std::vector<llama_memory_breakdown_data> mb_dev(devices.size());
-    llama_memory_breakdown_data              mb_host;
-
-    for (const auto & buft_mb : memory_breakdown) {
-        ggml_backend_buffer_type_t          buft = buft_mb.first;
-        const llama_memory_breakdown_data & mb   = buft_mb.second;
-        if (ggml_backend_buft_is_host(buft)) {
-            mb_host.model   += mb.model;
-            mb_host.context += mb.context;
-            mb_host.compute += mb.compute;
-            seen_buffer_types.insert(buft);
-            continue;
-        }
-        ggml_backend_dev_t dev = ggml_backend_buft_get_device(buft);
-        if (dev) {
-            int i_dev = -1;
-            for (size_t i = 0; i < devices.size(); i++) {
-                if (devices[i] == dev) {
-                    i_dev = i;
-                    break;
-                }
-            }
-            if (i_dev != -1) {
-                mb_dev[i_dev].model   += mb.model;
-                mb_dev[i_dev].context += mb.context;
-                mb_dev[i_dev].compute += mb.compute;
-                seen_buffer_types.insert(buft);
-                continue;
-            }
-        }
-    }
-
-    // print memory breakdown for each device:
-    for (size_t i = 0; i < devices.size(); i++) {
-        ggml_backend_dev_t dev = devices[i];
-        llama_memory_breakdown_data mb = mb_dev[i];
-
-        const std::string name = ggml_backend_dev_name(dev);
-        std::string desc = ggml_backend_dev_description(dev);
-        for (const std::string & prefix : desc_prefixes_strip) {
-            if (desc.length() >= prefix.length() && desc.substr(0, prefix.length()) == prefix) {
-                desc = desc.substr(prefix.length());
-            }
-        }
-
-        size_t free, total;
-        ggml_backend_dev_memory(dev, &free, &total);
-
-        const size_t self = mb.model + mb.context + mb.compute;
-        const int64_t unaccounted = static_cast<int64_t>(total) - static_cast<int64_t>(free) - static_cast<int64_t>(self);
-
-        table_data.push_back({
-            template_gpu,
-            "  - " + name + " (" + desc + ")",
-            std::to_string(total / MiB),
-            std::to_string(free / MiB),
-            std::to_string(self / MiB),
-            std::to_string(mb.model / MiB),
-            std::to_string(mb.context / MiB),
-            std::to_string(mb.compute / MiB),
-            std::to_string(unaccounted / static_cast<int64_t>(MiB))});
-    }
-
-    // print memory breakdown for host:
-    {
-        const size_t self = mb_host.model + mb_host.context + mb_host.compute;
-        table_data.push_back({
-            template_other,
-            "  - Host",
-            "", // total
-            "", // free
-            std::to_string(self / MiB),
-            std::to_string(mb_host.model / MiB),
-            std::to_string(mb_host.context / MiB),
-            std::to_string(mb_host.compute / MiB),
-            ""}); // unaccounted
-    }
-
-    // print memory breakdown for all remaining buffer types:
-    for (const auto & buft_mb : memory_breakdown) {
-        ggml_backend_buffer_type_t          buft = buft_mb.first;
-        const llama_memory_breakdown_data & mb   = buft_mb.second;
-        if (seen_buffer_types.count(buft) == 1) {
-            continue;
-        }
-        const std::string name = ggml_backend_buft_name(buft);
-        const size_t self = mb.model + mb.context + mb.compute;
-        table_data.push_back({
-            template_other,
-            "  - " + name,
-            "", // total
-            "", // free
-            std::to_string(self / MiB),
-            std::to_string(mb.model / MiB),
-            std::to_string(mb.context / MiB),
-            std::to_string(mb.compute / MiB),
-            ""}); // unaccounted
-        seen_buffer_types.insert(buft);
-    }
-
-    for (size_t j = 1; j < table_data[0].size(); j++) {
-        size_t max_len = 0;
-        for (const auto & td : table_data) {
-            max_len = std::max(max_len, td[j].length());
-        }
-        for (auto & td : table_data) {
-            td[j].insert(j == 1 ? td[j].length() : 0, max_len - td[j].length(), ' ');
-        }
-    }
-    for (const auto & td : table_data) {
-        LOG_TRC(td[0].c_str(),
-            __func__, td[1].c_str(), td[2].c_str(), td[3].c_str(), td[4].c_str(), td[5].c_str(),
-            td[6].c_str(), td[7].c_str(), td[8].c_str());
-    }
-}
-
-void common_fit_print(
-        const char * path_model,
-        llama_model_params * mparams,
-        llama_context_params * cparams) {
-    std::vector<ggml_backend_dev_t> devs;
-    uint32_t hp_ngl = 0; // hparams.n_gpu_layers
-    uint32_t hp_nct = 0; // hparams.n_ctx_train
-    uint32_t hp_nex = 0; // hparams.n_expert
-
-    auto dmd = common_get_device_memory_data(path_model, mparams, cparams, devs, hp_ngl, hp_nct, hp_nex, GGML_LOG_LEVEL_ERROR);
-    GGML_ASSERT(dmd.size() == devs.size() + 1);
-
-    for (size_t id = 0; id < devs.size(); id++) {
-        printf("%s ",  ggml_backend_dev_name(devs[id]));
-        printf("%zu ", dmd[id].mb.model/1024/1024);
-        printf("%zu ", dmd[id].mb.context/1024/1024);
-        printf("%zu ", dmd[id].mb.compute/1024/1024);
-        printf("\n");
-    }
-
-    printf("Host ");
-    printf("%zu ", dmd.back().mb.model/1024/1024);
-    printf("%zu ", dmd.back().mb.context/1024/1024);
-    printf("%zu ", dmd.back().mb.compute/1024/1024);
-    printf("\n");
-}
--- a/common/fit.h
+++ b/common/fit.h
@@ -1,32 +0,0 @@
-#pragma once
-
-#include "ggml.h"
-
-enum common_params_fit_status {
-    COMMON_PARAMS_FIT_STATUS_SUCCESS = 0, // found allocations that are projected to fit
-    COMMON_PARAMS_FIT_STATUS_FAILURE = 1, // could not find allocations that are projected to fit
-    COMMON_PARAMS_FIT_STATUS_ERROR   = 2, // a hard error occurred, e.g. because no model could be found at the specified path
-};
-
-// fits mparams and cparams to free device memory (assumes system memory is unlimited)
-//   - returns true if the parameters could be successfully modified to fit device memory
-//   - this function is NOT thread safe because it modifies the global llama logger state
-//   - only parameters that have the same value as in llama_default_model_params are modified
-//     with the exception of the context size which is modified if and only if equal to 0
-enum common_params_fit_status common_fit_params(
-                               const char   * path_model,
-                struct llama_model_params   * mparams,
-                struct llama_context_params * cparams,
-                                      float * tensor_split,          // writable buffer for tensor split, needs at least llama_max_devices elements
-    struct llama_model_tensor_buft_override * tensor_buft_overrides, // writable buffer for overrides, needs at least llama_max_tensor_buft_overrides elements
-                                     size_t * margins,               // margins of memory to leave per device in bytes
-                                   uint32_t   n_ctx_min,             // minimum context size to set when trying to reduce memory use
-                        enum ggml_log_level   log_level);            // minimum log level to print during fitting, lower levels go to debug log
-
-// print estimated memory to stdout
-void common_fit_print(
-                               const char   * path_model,
-                struct llama_model_params   * mparams,
-                struct llama_context_params * cparams);
-
-void common_memory_breakdown_print(const struct llama_context * ctx);
--- a/common/hf-cache.cpp
+++ b/common/hf-cache.cpp
@@ -11,6 +11,7 @@
 #include <filesystem>
 #include <fstream>
 #include <atomic>
+#include <regex> // migration only
 #include <string>
 #include <string_view>
 #include <stdexcept>
@@ -56,7 +57,7 @@ static fs::path get_cache_directory() {
 #ifndef _WIN32
        const struct passwd * pw = getpwuid(getuid());

-        if (pw && pw->pw_dir && *pw->pw_dir) {
+        if (pw->pw_dir && *pw->pw_dir) {
            return fs::path(pw->pw_dir) / ".cache" / "huggingface" / "hub";
        }
 #endif
@@ -335,9 +336,15 @@ hf_files get_repo_files(const std::string & repo_id,
                if (item["lfs"].contains("oid") && item["lfs"]["oid"].is_string()) {
                    file.oid = item["lfs"]["oid"].get<std::string>();
                }
+                if (item["lfs"].contains("size") && item["lfs"]["size"].is_number()) {
+                    file.size = item["lfs"]["size"].get<size_t>();
+                }
            } else if (item.contains("oid") && item["oid"].is_string()) {
                file.oid = item["oid"].get<std::string>();
            }
+            if (file.size == 0 && item.contains("size") && item["size"].is_number()) {
+                file.size = item["size"].get<size_t>();
+            }

            if (!file.oid.empty() && !is_valid_oid(file.oid)) {
                LOG_WRN("%s: skip invalid oid: %s\n", __func__, file.oid.c_str());
@@ -495,4 +502,271 @@ std::string finalize_file(const hf_file & file) {
    return file.final_path;
 }

+// delete everything after this line, one day
+
+// copied from download.cpp without the tag part
+struct gguf_split_info {
+    std::string prefix; // tag included
+    int index;
+    int count;
+};
+
+static gguf_split_info get_gguf_split_info(const std::string & path) {
+    static const std::regex re_split("^(.+)-([0-9]{5})-of-([0-9]{5})$", std::regex::icase);
+    std::smatch m;
+
+    std::string prefix = path;
+    if (!string_remove_suffix(prefix, ".gguf")) {
+        return {};
+    }
+
+    int index = 1;
+    int count = 1;
+
+    if (std::regex_match(prefix, m, re_split)) {
+        index = std::stoi(m[2].str());
+        count = std::stoi(m[3].str());
+        prefix = m[1].str();
+    }
+
+    return {std::move(prefix), index, count};
+}
+
+static std::pair<std::string, std::string> parse_manifest_name(std::string & filename) {
+    static const std::regex re(R"(^manifest=([^=]+)=([^=]+)=.*\.json$)");
+    std::smatch match;
+    if (std::regex_match(filename, match, re)) {
+        return {match[1].str(), match[2].str()};
+    }
+    return {};
+}
+
+static std::string make_old_cache_filename(const std::string & owner,
+                                           const std::string & repo,
+                                           const std::string & filename) {
+    auto result = owner + "_" + repo + "_" + filename;
+    string_replace_all(result, "/", "_");
+    return result;
+}
+
+struct migrate_file {
+    std::string path;
+    std::string sha256;
+    size_t size;
+    fs::path old_path;
+    fs::path etag_path;
+    const hf_file * file;
+};
+
+using migrate_files = std::vector<migrate_file>;
+
+static bool collect_file(const fs::path    & old_cache,
+                         const std::string & owner,
+                         const std::string & repo,
+                         const std::string & path,
+                         const std::string & sha256,
+                         const hf_files    & files,
+                         migrate_files     & to_migrate) {
+
+    const hf_file * file = nullptr;
+
+    for (const auto & f : files) {
+        if (f.path == path) {
+            file = &f;
+            break;
+        }
+    }
+
+    std::string old_filename = make_old_cache_filename(owner, repo, path);
+    fs::path old_path = old_cache / old_filename;
+    fs::path etag_path = old_path.string() + ".etag";
+
+    if (!fs::exists(old_path)) {
+        if (file && fs::exists(file->final_path)) {
+            return true;
+        }
+        LOG_WRN("%s: %s not found in old cache or HF cache\n", __func__, old_filename.c_str());
+        return false;
+    }
+
+    if (!file) {
+        LOG_WRN("%s: %s not found in current repo\n", __func__, old_filename.c_str());
+        return false;
+    }
+
+    if (!sha256.empty() && !file->oid.empty() && sha256 != file->oid) {
+        LOG_WRN("%s: %s is not up to date (sha256 mismatch)\n", __func__, old_filename.c_str());
+        return false;
+    }
+
+    if (file->size > 0) {
+        size_t size = fs::file_size(old_path);
+        if (size != file->size) {
+            LOG_WRN("%s: %s has wrong size %zu (expected %zu)\n", __func__, old_filename.c_str(), size, file->size);
+            return false;
+        }
+    }
+
+    to_migrate.push_back({path, sha256, file->size, old_path, etag_path, file});
+    return true;
+}
+
+static bool collect_files(const fs::path    & old_cache,
+                          const std::string & owner,
+                          const std::string & repo,
+                          const nl::json    & node,
+                          const hf_files    & files,
+                          migrate_files     & to_migrate) {
+
+    if (!node.contains("rfilename") ||
+        !node.contains("lfs")       ||
+        !node["lfs"].contains("sha256")) {
+        return true;
+    }
+
+    std::string path = node["rfilename"];
+    std::string sha256 = node["lfs"]["sha256"];
+
+    auto split = get_gguf_split_info(path);
+
+    if (split.count <= 1) {
+        return collect_file(old_cache, owner, repo, path, sha256, files, to_migrate);
+    }
+
+    std::vector<std::pair<std::string, std::string>> splits;
+
+    for (const auto & f : files) {
+        auto split_f = get_gguf_split_info(f.path);
+        if (split_f.count == split.count && split_f.prefix == split.prefix) {
+            // sadly the manifest only provides the sha256 of the first file (index == 1)
+            // the rest will be verified using the size...
+            std::string f_sha256 = (split_f.index == 1) ? sha256 : "";
+            splits.emplace_back(f.path, f_sha256);
+        }
+    }
+
+    if ((int)splits.size() != split.count) {
+        LOG_WRN("%s: expected %d split files but found %d in repo\n", __func__, split.count, (int)splits.size());
+        return false;
+    }
+
+    for (const auto & [f_path, f_sha256] : splits) {
+        if (!collect_file(old_cache, owner, repo, f_path, f_sha256, files, to_migrate)) {
+            return false;
+        }
+    }
+
+    return true;
+}
+
+static bool migrate_file(const migrate_file & file) {
+    std::error_code ec;
+
+    fs::path new_path(file.file->local_path);
+    fs::create_directories(new_path.parent_path(), ec);
+
+    if (!fs::exists(new_path, ec)) {
+        fs::rename(file.old_path, new_path, ec);
+        if (ec) {
+            fs::copy_file(file.old_path, new_path, ec);
+            if (ec) {
+                LOG_ERR("%s: failed to move/copy %s: %s\n", __func__, file.old_path.string().c_str(), ec.message().c_str());
+                return false;
+            }
+        }
+        fs::remove(file.old_path, ec);
+    }
+    fs::remove(file.etag_path, ec);
+
+    std::string filename = finalize_file(*file.file);
+    LOG_INF("%s: migrated %s -> %s\n", __func__, file.old_path.filename().string().c_str(), filename.c_str());
+    return true;
+}
+
+void migrate_old_cache_to_hf_cache(const std::string & token, bool offline) {
+    fs::path old_cache = fs_get_cache_directory();
+    if (!fs::exists(old_cache)) {
+        return;
+    }
+
+    if (offline) {
+        LOG_WRN("%s: skipping migration in offline mode (will run when online)\n", __func__);
+        return; // -hf is not going to work
+    }
+
+    bool warned = false;
+
+    for (const auto & entry : fs::directory_iterator(old_cache)) {
+        if (!entry.is_regular_file()) {
+            continue;
+        }
+        auto filename = entry.path().filename().string();
+        auto [owner, repo] = parse_manifest_name(filename);
+
+        if (owner.empty() || repo.empty()) {
+            continue;
+        }
+
+        if (!warned) {
+            warned = true;
+            LOG_WRN("================================================================================\n"
+                    "WARNING: Migrating cache to HuggingFace cache directory\n"
+                    "  Old cache: %s\n"
+                    "  New cache: %s\n"
+                    "This one-time migration moves models previously downloaded with -hf\n"
+                    "from the legacy llama.cpp cache to the standard HuggingFace cache.\n"
+                    "Models downloaded with --model-url are not affected.\n"
+                    "================================================================================\n",
+                    old_cache.string().c_str(), get_cache_directory().string().c_str());
+        }
+
+        auto repo_id = owner + "/" + repo;
+        auto files = get_repo_files(repo_id, token);
+
+        if (files.empty()) {
+            LOG_WRN("%s: could not get repo files for %s, skipping\n", __func__, repo_id.c_str());
+            continue;
+        }
+
+        migrate_files to_migrate;
+        bool ok = true;
+
+        try {
+            std::ifstream manifest(entry.path());
+            auto json = nl::json::parse(manifest);
+            for (const char * key : {"ggufFile", "mmprojFile"}) {
+                if (json.contains(key)) {
+                    if (!collect_files(old_cache, owner, repo, json[key], files, to_migrate)) {
+                        ok = false;
+                        break;
+                    }
+                }
+            }
+        } catch (const std::exception & e) {
+            LOG_WRN("%s: failed to parse manifest %s: %s\n", __func__, filename.c_str(), e.what());
+            continue;
+        }
+
+        if (!ok) {
+            LOG_WRN("%s: migration skipped: one or more files failed validation\n", __func__);
+            continue;
+        }
+
+        for (const auto & file : to_migrate) {
+            if (!migrate_file(file)) {
+                ok = false;
+                break;
+            }
+        }
+
+        if (!ok) {
+            LOG_WRN("%s: migration failed: could not migrate all files\n", __func__);
+            continue;
+        }
+
+        LOG_INF("%s: migration complete, deleting manifest: %s\n", __func__, entry.path().string().c_str());
+        fs::remove(entry.path());
+    }
+}
+
 } // namespace hf_cache
--- a/common/hf-cache.h
+++ b/common/hf-cache.h
@@ -14,6 +14,7 @@ struct hf_file {
    std::string final_path;
    std::string oid;
    std::string repo_id;
+    size_t size = 0; // only for the migration
 };

 using hf_files = std::vector<hf_file>;
@@ -29,4 +30,7 @@ hf_files get_cached_files(const std::string & repo_id = {});
 // Create snapshot path (link or move/copy) and return it
 std::string finalize_file(const hf_file & file);

+// TODO: Remove later
+void migrate_old_cache_to_hf_cache(const std::string & token, bool offline = false);
+
 } // namespace hf_cache
--- a/common/jinja/caps.cpp
+++ b/common/jinja/caps.cpp
@@ -1,3 +1,4 @@
+#include "log.h"
 #include "value.h"
 #include "runtime.h"
 #include "caps.h"
--- a/common/jinja/runtime.h
+++ b/common/jinja/runtime.h
@@ -106,16 +106,10 @@ struct statement {
    size_t pos; // position in source, for debugging
    virtual ~statement() = default;
    virtual std::string type() const { return "Statement"; }
-
    // execute_impl must be overridden by derived classes
-    virtual value execute_impl(context &) { throw_exec_error(); }
+    virtual value execute_impl(context &) { throw std::runtime_error("cannot exec " + type()); }
    // execute is the public method to execute a statement with error handling
    value execute(context &);
-
-private:
-    [[noreturn]] void throw_exec_error() const {
-        throw std::runtime_error("cannot exec " + type());
-    }
 };

 // Type Checking Utilities
@@ -149,7 +143,7 @@ struct program : public statement {
    program() = default;
    explicit program(statements && body) : body(std::move(body)) {}
    std::string type() const override { return "Program"; }
-    [[noreturn]] value execute_impl(context &) override {
+    value execute_impl(context &) override {
        throw std::runtime_error("Cannot execute program directly, use jinja::runtime instead");
    }
 };
@@ -201,7 +195,7 @@ struct break_statement : public statement {
        }
    };

-    [[noreturn]] value execute_impl(context &) override {
+    value execute_impl(context &) override {
        throw break_statement::signal();
    }
 };
@@ -215,7 +209,7 @@ struct continue_statement : public statement {
        }
    };

-    [[noreturn]] value execute_impl(context &) override {
+    value execute_impl(context &) override {
        throw continue_statement::signal();
    }
 };
@@ -515,7 +509,7 @@ struct slice_expression : public expression {
        chk_type<expression>(this->step_expr);
    }
    std::string type() const override { return "SliceExpression"; }
-    [[noreturn]] value execute_impl(context &) override {
+    value execute_impl(context &) override {
        throw std::runtime_error("must be handled by MemberExpression");
    }
 };
--- a/common/jinja/value.cpp
+++ b/common/jinja/value.cpp
@@ -590,10 +590,6 @@ static bool string_endswith(const std::string & str, const std::string & suffix)
    return str.compare(str.length() - suffix.length(), suffix.length(), suffix) == 0;
 }

-[[noreturn]] static value string_join_not_implemented(const func_args &) {
-    throw not_implemented_exception("String join builtin not implemented");
-}
-
 const func_builtins & value_string_t::get_builtins() const {
    static const func_builtins builtins = {
        {"default", default_value},
@@ -855,7 +851,9 @@ const func_builtins & value_string_t::get_builtins() const {
            res->val_str.mark_input_based_on(val_input->as_string());
            return res;
        }},
-        {"join", string_join_not_implemented},
+        {"join", [](const func_args &) -> value {
+            throw not_implemented_exception("String join builtin not implemented");
+        }},
    };
    return builtins;
 }
@@ -886,9 +884,6 @@ const func_builtins & value_bool_t::get_builtins() const {
    return builtins;
 }

-[[noreturn]] static value array_unique_not_implemented(const func_args &) {
-    throw not_implemented_exception("Array unique builtin not implemented");
-}

 const func_builtins & value_array_t::get_builtins() const {
    static const func_builtins builtins = {
@@ -1089,14 +1084,13 @@ const func_builtins & value_array_t::get_builtins() const {
            std::reverse(arr.begin(), arr.end());
            return is_val<value_tuple>(val) ? mk_val<value_tuple>(std::move(arr)) : mk_val<value_array>(std::move(arr));
        }},
-        {"unique", array_unique_not_implemented},
+        {"unique", [](const func_args &) -> value {
+            throw not_implemented_exception("Array unique builtin not implemented");
+        }},
    };
    return builtins;
 }

-[[noreturn]] static value object_join_not_implemented(const func_args &) {
-    throw not_implemented_exception("object join not implemented");
-}

 const func_builtins & value_object_t::get_builtins() const {
    if (!has_builtins) {
@@ -1189,7 +1183,9 @@ const func_builtins & value_object_t::get_builtins() const {
            });
            return result;
        }},
-        {"join", object_join_not_implemented},
+        {"join", [](const func_args &) -> value {
+            throw not_implemented_exception("object join not implemented");
+        }},
    };
    return builtins;
 }
--- a/common/jinja/value.h
+++ b/common/jinja/value.h
@@ -129,25 +129,27 @@ struct value_t {
    // Note: only for debugging and error reporting purposes
    virtual std::string type() const { return ""; }

-    virtual int64_t as_int() const { throw_type_error("is not an int value"); }
-    virtual double as_float() const { throw_type_error("is not a float value"); }
-    virtual string as_string() const { throw_type_error("is not a string value"); }
-    virtual bool as_bool() const { throw_type_error("is not a bool value"); }
-    virtual const std::vector<value> & as_array() const { throw_type_error("is not an array value"); }
-    virtual const std::vector<std::pair<value, value>> & as_ordered_object() const { throw_type_error("is not an object value"); }
-    virtual value invoke(const func_args &) const { throw_type_error("is not a function value"); }
+    virtual int64_t as_int() const { throw std::runtime_error(type() + " is not an int value"); }
+    virtual double as_float() const { throw std::runtime_error(type() + " is not a float value"); }
+    virtual string as_string() const { throw std::runtime_error(type() + " is not a string value"); }
+    virtual bool as_bool() const { throw std::runtime_error(type() + " is not a bool value"); }
+    virtual const std::vector<value> & as_array() const { throw std::runtime_error(type() + " is not an array value"); }
+    virtual const std::vector<std::pair<value, value>> & as_ordered_object() const { throw std::runtime_error(type() + " is not an object value"); }
+    virtual value invoke(const func_args &) const { throw std::runtime_error(type() + " is not a function value"); }
    virtual bool is_none() const { return false; }
    virtual bool is_undefined() const { return false; }
-    virtual const func_builtins & get_builtins() const { throw_type_error("has no builtins"); }
+    virtual const func_builtins & get_builtins() const {
+        throw std::runtime_error("No builtins available for type " + type());
+    }

-    virtual bool has_key(const value &) { throw_type_error("is not an object value"); }
-    virtual void insert(const value & /* key */, const value & /* val */) { throw_type_error("is not an object value"); }
-    virtual value & at(const value & /* key */, value & /* default_val */) { throw_type_error("is not an object value"); }
-    virtual value & at(const value & /* key */) { throw_type_error("is not an object value"); }
-    virtual value & at(const std::string & /* key */, value & /* default_val */) { throw_type_error("is not an object value"); }
-    virtual value & at(const std::string & /* key */) { throw_type_error("is not an object value"); }
-    virtual value & at(int64_t /* idx */, value & /* default_val */) { throw_type_error("is not an array value"); }
-    virtual value & at(int64_t /* idx */) { throw_type_error("is not an array value"); }
+    virtual bool has_key(const value &) { throw std::runtime_error(type() + " is not an object value"); }
+    virtual void insert(const value & /* key */, const value & /* val */) { throw std::runtime_error(type() + " is not an object value"); }
+    virtual value & at(const value & /* key */, value & /* default_val */) { throw std::runtime_error(type() + " is not an object value"); }
+    virtual value & at(const value & /* key */) { throw std::runtime_error(type() + " is not an object value"); }
+    virtual value & at(const std::string & /* key */, value & /* default_val */) { throw std::runtime_error(type() + " is not an object value"); }
+    virtual value & at(const std::string & /* key */) { throw std::runtime_error(type() + " is not an object value"); }
+    virtual value & at(int64_t /* idx */, value & /* default_val */) { throw std::runtime_error(type() + " is not an array value"); }
+    virtual value & at(int64_t /* idx */) { throw std::runtime_error(type() + " is not an array value"); }

    virtual bool is_numeric() const { return false; }
    virtual bool is_hashable() const { return false; }
@@ -161,11 +163,6 @@ struct value_t {
    // Note: only for debugging purposes
    virtual std::string as_repr() const { return as_string().str(); }

-private:
-    [[noreturn]] void throw_type_error(const char* expected) const {
-        throw std::runtime_error(type() + " " + expected);
-    }
-
 protected:
    virtual bool equivalent(const value_t &) const = 0;
    virtual bool nonequal(const value_t & other) const { return !equivalent(other); }
--- a/common/log.cpp
+++ b/common/log.cpp
@@ -49,7 +49,7 @@ enum common_log_col : int {
 };

 // disable colors by default
-static const char* g_col[] = {
+static std::vector<const char *> g_col = {
    "",
    "",
    "",
@@ -247,6 +247,7 @@ public:

            entries = std::move(new_entries);
        }
+
        cv.notify_one();
    }

@@ -264,6 +265,7 @@ public:
                {
                    std::unique_lock<std::mutex> lock(mtx);
                    cv.wait(lock, [this]() { return head != tail; });
+
                    cur = entries[head];

                    head = (head + 1) % entries.size();
@@ -299,6 +301,7 @@ public:

                tail = (tail + 1) % entries.size();
            }
+
            cv.notify_one();
        }

@@ -335,7 +338,7 @@ public:
            g_col[COMMON_LOG_COL_CYAN]    = LOG_COL_CYAN;
            g_col[COMMON_LOG_COL_WHITE]   = LOG_COL_WHITE;
        } else {
-            for (size_t i = 0; i < std::size(g_col); i++) {
+            for (size_t i = 0; i < g_col.size(); i++) {
                g_col[i] = "";
            }
        }
@@ -365,20 +368,14 @@ struct common_log * common_log_init() {
 }

 struct common_log * common_log_main() {
-    // We intentionally leak (i.e. do not delete) the logger singleton because
-    // common_log destructor called at DLL teardown phase will cause hanging on Windows.
-    // OS will release resources anyway so it should not be a significant issue,
-    // though this design may cause logs to be lost if not flushed before the program exits.
-    // Refer to https://github.com/ggml-org/llama.cpp/issues/22142 for details.
-    static struct common_log * log;
+    static struct common_log log;
    static std::once_flag    init_flag;
    std::call_once(init_flag, [&]() {
-        log = new common_log;
        // Set default to auto-detect colors
-        log->set_colors(tty_can_use_colors());
+        log.set_colors(tty_can_use_colors());
    });

-    return log;
+    return &log;
 }

 void common_log_pause(struct common_log * log) {
@@ -435,10 +432,10 @@ void common_log_flush(struct common_log * log) {
 static int common_get_verbosity(enum ggml_log_level level) {
    switch (level) {
        case GGML_LOG_LEVEL_DEBUG: return LOG_LEVEL_DEBUG;
-        case GGML_LOG_LEVEL_INFO:  return LOG_LEVEL_TRACE;
+        case GGML_LOG_LEVEL_INFO:  return LOG_LEVEL_INFO;
        case GGML_LOG_LEVEL_WARN:  return LOG_LEVEL_WARN;
        case GGML_LOG_LEVEL_ERROR: return LOG_LEVEL_ERROR;
-        case GGML_LOG_LEVEL_CONT:  return LOG_LEVEL_TRACE;
+        case GGML_LOG_LEVEL_CONT:  return LOG_LEVEL_INFO; // same as INFO
        case GGML_LOG_LEVEL_NONE:
        default:
            return LOG_LEVEL_OUTPUT;
--- a/common/log.h
+++ b/common/log.h
@@ -21,8 +21,7 @@
 #    define LOG_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
 #endif

-#define LOG_LEVEL_DEBUG  5
-#define LOG_LEVEL_TRACE  4
+#define LOG_LEVEL_DEBUG  4
 #define LOG_LEVEL_INFO   3
 #define LOG_LEVEL_WARN   2
 #define LOG_LEVEL_ERROR  1
@@ -50,11 +49,7 @@ void common_log_default_callback(enum ggml_log_level level, const char * text, v
 struct common_log;

 struct common_log * common_log_init();
-
-// Singleton, intentionally leaked to avoid Windows teardown hangs.
-// Call common_log_flush() before exit if you want to ensure all logs are flushed.
-struct common_log * common_log_main();
-
+struct common_log * common_log_main(); // singleton, automatically destroys itself on exit
 void                common_log_pause (struct common_log * log); // pause  the worker thread, not thread-safe
 void                common_log_resume(struct common_log * log); // resume the worker thread, not thread-safe
 void                common_log_free  (struct common_log * log);
@@ -112,15 +107,13 @@ void common_log_flush         (struct common_log * log);                    // f
 #define LOGV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_NONE, verbosity,        __VA_ARGS__)

 #define LOG_DBG(...) LOG_TMPL(GGML_LOG_LEVEL_DEBUG, LOG_LEVEL_DEBUG,  __VA_ARGS__)
-#define LOG_TRC(...) LOG_TMPL(GGML_LOG_LEVEL_INFO,  LOG_LEVEL_TRACE,  __VA_ARGS__)
 #define LOG_INF(...) LOG_TMPL(GGML_LOG_LEVEL_INFO,  LOG_LEVEL_INFO,   __VA_ARGS__)
 #define LOG_WRN(...) LOG_TMPL(GGML_LOG_LEVEL_WARN,  LOG_LEVEL_WARN,   __VA_ARGS__)
 #define LOG_ERR(...) LOG_TMPL(GGML_LOG_LEVEL_ERROR, LOG_LEVEL_ERROR,  __VA_ARGS__)
 #define LOG_CNT(...) LOG_TMPL(GGML_LOG_LEVEL_CONT,  LOG_LEVEL_INFO,   __VA_ARGS__) // same as INFO

-#define LOG_DBGV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_DEBUG, verbosity, __VA_ARGS__)
-#define LOG_TRCV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_TRACE, verbosity, __VA_ARGS__)
 #define LOG_INFV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_INFO,  verbosity, __VA_ARGS__)
 #define LOG_WRNV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_WARN,  verbosity, __VA_ARGS__)
 #define LOG_ERRV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_ERROR, verbosity, __VA_ARGS__)
+#define LOG_DBGV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_DEBUG, verbosity, __VA_ARGS__)
 #define LOG_CNTV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_CONT,  verbosity, __VA_ARGS__)
--- a/common/ngram-map.cpp
+++ b/common/ngram-map.cpp
@@ -471,7 +471,7 @@ void common_ngram_map_draft(common_ngram_map & map,
        sum_occur += curr_occur;
    }

-    LOG_DBG("%s: key_offset = %zu, max_occur = %d, sum_occur = %d, slot_max = %d [%zu/%d, %zu/%d, %zu/%d, %zu/%d]\n", __func__,
+    LOG_INF("%s: key_offset = %zu, max_occur = %d, sum_occur = %d, slot_max = %d [%zu/%d, %zu/%d, %zu/%d, %zu/%d]\n", __func__,
            key_offset,
            max_occur, sum_occur, slot_max,
            curr_key.values[0].value_idx, curr_key.values[0].value_num,
@@ -482,7 +482,7 @@ void common_ngram_map_draft(common_ngram_map & map,
    // Print the tokens of the four values (if idx != 0), use LOG_INF
    for (int v = 0; v < COMMON_NGRAM_MAX_VALUES; ++v) {
        if (curr_key.values[v].value_idx != 0) {
-            LOG_DBG("%s: value[%d] = %s\n", __func__, v, common_tokens_to_str(inp, curr_key.values[v].value_idx, m).c_str());
+            LOG_INF("%s: value[%d] = %s\n", __func__, v, common_tokens_to_str(inp, curr_key.values[v].value_idx, m).c_str());
        }
    }

@@ -500,7 +500,7 @@ void common_ngram_map_draft(common_ngram_map & map,
        draft.push_back(inp[match_pos + n + i]);
    }

-    LOG_DBG("%s: key_offset = %zu, slot_max = %d, key_num = %d, draft.size = %zu\n", __func__,
+    LOG_INF("%s: key_offset = %zu, slot_max = %d, key_num = %d, draft.size = %zu\n", __func__,
            key_offset, slot_max,
            curr_key.key_num, draft.size());

--- a/common/preset.cpp
+++ b/common/preset.cpp
@@ -43,7 +43,7 @@ static std::set<std::string> get_remote_preset_whitelist(const std::map<std::str
    for (const auto & it : key_to_opt) {
        const std::string & key = it.first;
        const common_arg & opt = it.second;
-        if (allowed_options.find(key) != allowed_options.end() || opt.is_sampling) {
+        if (allowed_options.find(key) != allowed_options.end() || opt.is_sparam) {
            allowed_keys.insert(key);
            // also add variant keys (args without leading dashes and env vars)
            for (const auto & arg : opt.get_args()) {
@@ -163,13 +163,8 @@ void common_preset::merge(const common_preset & other) {
    }
 }

-void common_preset::apply_to_params(common_params & params, const std::set<std::string> & handled_keys) const {
+void common_preset::apply_to_params(common_params & params) const {
    for (const auto & [opt, val] : options) {
-        if (!handled_keys.empty()) {
-            if (!opt.env || handled_keys.find(opt.env) == handled_keys.end()) {
-                continue;
-            }
-        }
        // apply each option to params
        if (opt.handler_string) {
            opt.handler_string(params, val);
--- a/common/preset.h
+++ b/common/preset.h
@@ -43,8 +43,7 @@ struct common_preset {
    void merge(const common_preset & other);

    // apply preset options to common_params
-    // optionally specify handled_keys to only apply a subset of options (identified by their env), if empty, apply all options
-    void apply_to_params(common_params & params, const std::set<std::string> & handled_keys = std::set<std::string>()) const;
+    void apply_to_params(common_params & params) const;
 };

 // interface for multiple presets in one file
--- a/common/reasoning-budget.cpp
+++ b/common/reasoning-budget.cpp
@@ -122,20 +122,6 @@ static void common_reasoning_budget_accept(struct llama_sampler * smpl, llama_to
            }
            break;
        case REASONING_BUDGET_DONE:
-            // Re-arm on a new start tag: some models emit multiple <think> blocks
-            // per response, and each should get a fresh budget window.
-            if (ctx->start_matcher.advance(token)) {
-                ctx->state = REASONING_BUDGET_COUNTING;
-                ctx->remaining = ctx->budget;
-                ctx->end_matcher.reset();
-                LOG_INF("reasoning-budget: re-activated on new start tag, budget=%d tokens\n", ctx->budget);
-
-                if (ctx->remaining <= 0) {
-                    ctx->state = REASONING_BUDGET_FORCING;
-                    ctx->force_pos = 0;
-                    LOG_INF("reasoning-budget: budget=0, forcing immediately\n");
-                }
-            }
            break;
    }
 }
@@ -171,12 +157,22 @@ static void common_reasoning_budget_reset(struct llama_sampler * smpl) {
    ctx->force_pos = 0;
 }

+// forward declaration for use in clone
 static struct llama_sampler * common_reasoning_budget_init_state(
        const struct llama_vocab * vocab, const std::vector<llama_token> & start_tokens,
        const std::vector<llama_token> & end_tokens, const std::vector<llama_token> & forced_tokens,
        int32_t budget, common_reasoning_budget_state initial_state);

-static struct llama_sampler * common_reasoning_budget_clone(const struct llama_sampler * smpl);
+static struct llama_sampler * common_reasoning_budget_clone(const struct llama_sampler * smpl) {
+    const auto * ctx = (const common_reasoning_budget_ctx *) smpl->ctx;
+    return common_reasoning_budget_init_state(
+        ctx->vocab,
+        ctx->start_matcher.tokens,
+        ctx->end_matcher.tokens,
+        ctx->forced_tokens,
+        ctx->budget,
+        ctx->state);
+}

 static void common_reasoning_budget_free(struct llama_sampler * smpl) {
    delete (common_reasoning_budget_ctx *) smpl->ctx;
@@ -195,15 +191,6 @@ static struct llama_sampler_i common_reasoning_budget_i = {
    /* .backend_set_input = */ nullptr,
 };

-static struct llama_sampler * common_reasoning_budget_clone(const struct llama_sampler * smpl) {
-    const auto * ctx = (const common_reasoning_budget_ctx *) smpl->ctx;
-
-    return llama_sampler_init(
-        /* .iface = */ &common_reasoning_budget_i,
-        /* .ctx   = */ new common_reasoning_budget_ctx(*ctx)
-    );
-}
-
 static struct llama_sampler * common_reasoning_budget_init_state(
        const struct llama_vocab             * vocab,
        const std::vector<llama_token>       & start_tokens,
@@ -231,6 +218,34 @@ static struct llama_sampler * common_reasoning_budget_init_state(
    );
 }

+struct llama_sampler * common_reasoning_budget_init(
+        const struct llama_vocab       * vocab,
+        const std::vector<llama_token> & start_tokens,
+        const std::vector<llama_token> & end_tokens,
+        const std::vector<llama_token> & forced_tokens,
+        int32_t                          budget,
+        const std::vector<llama_token> & prefill_tokens) {
+    // Determine initial state from prefill: COUNTING if the prefill begins with
+    // the start sequence but does not also contain the end sequence after it.
+    common_reasoning_budget_state initial_state = REASONING_BUDGET_IDLE;
+    if (!prefill_tokens.empty() && !start_tokens.empty() &&
+            prefill_tokens.size() >= start_tokens.size() &&
+            std::equal(start_tokens.begin(), start_tokens.end(), prefill_tokens.begin())) {
+        initial_state = REASONING_BUDGET_COUNTING;
+        // If the end sequence also follows the start in the prefill, reasoning
+        // was opened and immediately closed — stay IDLE.
+        if (!end_tokens.empty() &&
+                prefill_tokens.size() >= start_tokens.size() + end_tokens.size()) {
+            auto end_start = prefill_tokens.end() - (ptrdiff_t) end_tokens.size();
+            if (end_start >= prefill_tokens.begin() + (ptrdiff_t) start_tokens.size() &&
+                    std::equal(end_tokens.begin(), end_tokens.end(), end_start)) {
+                initial_state = REASONING_BUDGET_IDLE;
+            }
+        }
+    }
+    return common_reasoning_budget_init_state(vocab, start_tokens, end_tokens, forced_tokens, budget, initial_state);
+}
+
 struct llama_sampler * common_reasoning_budget_init(
        const struct llama_vocab       * vocab,
        const std::vector<llama_token> & start_tokens,
--- a/common/reasoning-budget.h
+++ b/common/reasoning-budget.h
@@ -29,7 +29,10 @@ enum common_reasoning_budget_state {
 //   end_tokens     - token sequence for natural deactivation
 //   forced_tokens  - token sequence forced when budget expires
 //   budget         - max tokens allowed in the reasoning block
-//   initial_state  - initial state
+//   prefill_tokens - tokens already present in the prompt (generation prompt);
+//                    used to determine the initial state: COUNTING if they begin
+//                    with start_tokens (but don't also end with end_tokens),
+//                    IDLE otherwise. COUNTING with budget <= 0 is promoted to FORCING.
 //
 struct llama_sampler * common_reasoning_budget_init(
        const struct llama_vocab       * vocab,
@@ -37,6 +40,16 @@ struct llama_sampler * common_reasoning_budget_init(
        const std::vector<llama_token> & end_tokens,
        const std::vector<llama_token> & forced_tokens,
        int32_t                          budget,
-        common_reasoning_budget_state    initial_state = REASONING_BUDGET_IDLE);
+        const std::vector<llama_token> & prefill_tokens = {});
+
+// Variant that takes an explicit initial state (used by tests and clone).
+// COUNTING with budget <= 0 is promoted to FORCING.
+struct llama_sampler * common_reasoning_budget_init(
+        const struct llama_vocab       * vocab,
+        const std::vector<llama_token> & start_tokens,
+        const std::vector<llama_token> & end_tokens,
+        const std::vector<llama_token> & forced_tokens,
+        int32_t                          budget,
+        common_reasoning_budget_state    initial_state);

 common_reasoning_budget_state common_reasoning_budget_get_state(const struct llama_sampler * smpl);
--- a/common/sampling.cpp
+++ b/common/sampling.cpp
@@ -1,12 +1,10 @@
 #include "sampling.h"

 #include "common.h"
-#include "fit.h"
+#include "ggml.h"
 #include "log.h"
 #include "reasoning-budget.h"

-#include "ggml.h"
-
 #include <algorithm>
 #include <cctype>
 #include <climits>
@@ -260,35 +258,32 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, st
        }
    }

-    // Compute prefill tokens from the generation prompt
-    std::vector<llama_token> prefill_tokens;
-    if (!params.generation_prompt.empty()) {
-        GGML_ASSERT(vocab != nullptr);
-        auto tokens = common_tokenize(vocab, params.generation_prompt, false, true);
-        for (size_t i = 0; i < tokens.size(); i++) {
-            std::string piece = common_token_to_piece(vocab, tokens[i], true);
-            if (i == 0 && std::isspace(piece[0]) && !std::isspace(params.generation_prompt[0])) {
-                // Some tokenizers will add a space before the first special token, need to exclude
-                continue;
-            }
-            LOG_DBG("%s: prefill token: %d = %s\n", __func__, tokens[i], piece.c_str());
-            prefill_tokens.push_back(tokens[i]);
-        }
-    }
-
    // Feed generation prompt tokens to the grammar sampler so it advances past
    // tokens the template already placed in the prompt.
    // Only applies to output-format and tool-call grammars; user-supplied grammars must not be prefilled.
-    if (grmr && !params.grammar_lazy && common_grammar_needs_prefill(params.grammar)) {
-        try {
-            for (const auto & token : prefill_tokens) {
-                llama_sampler_accept(grmr, token);
-                LOG_DBG("%s: grammar accepted prefill token (%d)\n", __func__, token);
+    std::vector<llama_token> prefill_tokens;
+    if (!params.generation_prompt.empty() && common_grammar_needs_prefill(params.grammar)) {
+        GGML_ASSERT(vocab != nullptr);
+        prefill_tokens = common_tokenize(vocab, params.generation_prompt, false, true);
+        if (!prefill_tokens.empty()) {
+            std::string first_token = common_token_to_piece(vocab, prefill_tokens[0], true);
+            if (std::isspace(first_token[0]) && !std::isspace(params.generation_prompt[0])) {
+                // Some tokenizers will add a space before the first special token, need to remove
+                prefill_tokens = std::vector<llama_token>(prefill_tokens.begin() + 1, prefill_tokens.end());
+            }
+        }
+
+        if (grmr && !params.grammar_lazy) {
+            try {
+                for (const auto & token : prefill_tokens) {
+                    llama_sampler_accept(grmr, token);
+                    LOG_DBG("%s: accepted prefill token (%d)\n", __func__, token);
+                }
+            } catch (std::exception &e) {
+                LOG_ERR("%s: error initializing grammar sampler for grammar:\n%s\n\nGeneration prompt:\n'%s'\n", __func__,
+                    common_grammar_value(params.grammar).c_str(), params.generation_prompt.c_str());
+                throw e;
            }
-        } catch (std::exception &e) {
-            LOG_ERR("%s: error initializing grammar sampler for grammar:\n%s\n\nGeneration prompt:\n'%s'\n", __func__,
-                common_grammar_value(params.grammar).c_str(), params.generation_prompt.c_str());
-            throw e;
        }
    }

@@ -299,12 +294,8 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, st
            params.reasoning_budget_start,
            params.reasoning_budget_end,
            params.reasoning_budget_forced,
-            params.reasoning_budget_tokens < 0 ? INT_MAX : params.reasoning_budget_tokens);
-
-        for (const auto & token : prefill_tokens) {
-            llama_sampler_accept(rbudget, token);
-            LOG_DBG("%s: reasoning-budget accepted prefill token (%d)\n", __func__, token);
-        }
+            params.reasoning_budget_tokens < 0 ? INT_MAX : params.reasoning_budget_tokens,
+            prefill_tokens);
    }

    if (params.has_logit_bias()) {
@@ -438,7 +429,7 @@ static bool grammar_should_apply(struct common_sampler * gsmpl) {
    return true;
 }

-void common_sampler_accept(struct common_sampler * gsmpl, llama_token token, bool is_generated) {
+void common_sampler_accept(struct common_sampler * gsmpl, llama_token token, bool accept_grammar) {
    if (!gsmpl) {
        return;
    }
@@ -446,11 +437,9 @@ void common_sampler_accept(struct common_sampler * gsmpl, llama_token token, boo
    const auto tm = gsmpl->tm();

    // grammar_should_apply() checks the reasoning budget state, so calculate this before we accept
-    const auto accept_grammar = is_generated && grammar_should_apply(gsmpl);
+    accept_grammar = accept_grammar && grammar_should_apply(gsmpl);

-    if (gsmpl->rbudget && is_generated) {
-        llama_sampler_accept(gsmpl->rbudget, token);
-    }
+    llama_sampler_accept(gsmpl->rbudget, token);

    if (gsmpl->grmr && accept_grammar) {
        llama_sampler_accept(gsmpl->grmr, token);
@@ -522,7 +511,7 @@ void common_perf_print(const struct llama_context * ctx, const struct common_sam
        LOG_INF("%s: unaccounted time = %10.2f ms / %5.1f %%      (total - sampling - prompt eval - eval) / (total)\n", __func__, t_unacc_ms, t_unacc_pc);
        LOG_INF("%s:    graphs reused = %10d\n", __func__, data.n_reused);

-        common_memory_breakdown_print(ctx);
+        llama_memory_breakdown_print(ctx);
    }
 }

@@ -547,8 +536,6 @@ llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_co
    auto & chain = gsmpl->chain;
    auto & cur_p = gsmpl->cur_p; // initialized by set_logits

-    gsmpl->set_logits(ctx, idx);
-
    // Check if a backend sampler has already sampled a token in which case we
    // return that token id directly.
    {
@@ -560,17 +547,17 @@ llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_co
            GGML_ASSERT(!gsmpl->grmr    && "using grammar in combination with backend sampling is not supported");
            GGML_ASSERT(!gsmpl->rbudget && "using reasoning budget in combination with backend sampling is not supported");

-            for (size_t i = 0; i < cur_p.size; ++i) {
-                if (cur_p.data[i].id == id) {
-                    cur_p.selected = i;
-                    break;
-                }
-            }
+            // TODO: simplify
+            gsmpl->cur.resize(1);
+            gsmpl->cur[0] = { id, 0.0f, 1.0f };
+            cur_p = { gsmpl->cur.data(), gsmpl->cur.size(), 0, true };

            return id;
        }
    }

+    gsmpl->set_logits(ctx, idx);
+
    // apply reasoning budget first
    llama_sampler_apply(rbudget, &cur_p);

--- a/common/sampling.h
+++ b/common/sampling.h
@@ -41,8 +41,8 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, st

 void common_sampler_free(struct common_sampler * gsmpl);

-// if is_generated is true, the token is accepted by the sampling chain, the reasoning budget sampler, and the grammar sampler
-void                    common_sampler_accept(struct common_sampler * gsmpl, llama_token token, bool is_generated);
+// if accept_grammar is true, the token is accepted both by the sampling chain and the grammar
+void                    common_sampler_accept(struct common_sampler * gsmpl, llama_token token, bool accept_grammar);
 void                    common_sampler_reset (struct common_sampler * gsmpl);
 struct common_sampler * common_sampler_clone (struct common_sampler * gsmpl);

--- a/common/speculative.cpp
+++ b/common/speculative.cpp
--- a/common/speculative.h
+++ b/common/speculative.h
@@ -5,14 +5,8 @@

 struct common_speculative;

-// comma separated list the provided types
-std::string common_speculative_type_name_str(const std::vector<enum common_speculative_type> & types);
-
 // comma separated list of all types
-const char * common_speculative_all_types_str();
-
-// parse user provided types
-std::vector<enum common_speculative_type> common_speculative_types_from_names(const std::vector<std::string> & names);
+std::string common_speculative_type_name_str();

 // convert string to type
 enum common_speculative_type common_speculative_type_from_name(const std::string & name);
@@ -20,50 +14,24 @@ enum common_speculative_type common_speculative_type_from_name(const std::string
 // convert type to string
 std::string common_speculative_type_to_str(enum common_speculative_type type);

-common_speculative * common_speculative_init(common_params_speculative & params, uint32_t n_seq);
+common_speculative * common_speculative_init(
+        common_params_speculative & params,
+        llama_context             * ctx_tgt);

 void common_speculative_free(common_speculative * spec);

-struct common_speculative_draft_params {
-    // this flag is used to chain the drafts through all the available implementations
-    // after the first successful draft from an implementation, we set it
-    //   to false to prevent further drafts for that sequence
-    // at the end of the draft() call, all drafting flags will be reset to false
-    bool drafting = false;
-
-    // overrides individual configurations (-1 disabled)
-    // can be used to constraint the max draft based on the remaining context size
-    int32_t n_max = -1;
-
-    llama_pos   n_past;
-    llama_token id_last;
-
-    // TODO: remove in the future by keeping track of the prompt from the _begin() call and the consecutive accept calls
-    const llama_tokens * prompt;
-
-    // the generated draft from the last _draft() call
-    llama_tokens * result;
-};
-
-common_speculative_draft_params & common_speculative_get_draft_params(common_speculative * spec, llama_seq_id seq_id);
-
 // optionally call once at the beginning of a new generation
-void common_speculative_begin(common_speculative * spec, llama_seq_id seq_id, const llama_tokens & prompt);
+void common_speculative_begin(common_speculative * spec, const llama_tokens & prompt);

-// process the batch and update the internal state of the speculative context
-bool common_speculative_process(common_speculative * spec, const llama_batch & batch);
+// sample up to n_draft tokens and add them to the batch using the draft model
+llama_tokens common_speculative_draft(
+                     common_speculative * spec,
+        const common_params_speculative & params,
+                     const llama_tokens & prompt,
+                            llama_token   id_last);

-// true if any implementation requires target post-norm embeddings to be extracted
-bool common_speculative_need_embd(common_speculative * spec);
-
-// true if any implementation requires target pre-norm embeddings to be extracted
-bool common_speculative_need_embd_pre_norm(common_speculative * spec);
-
-// generate drafts for the sequences specified with `common_speculative_get_draft_params`
-void common_speculative_draft(common_speculative * spec);
-
-// informs the speculative context that n_accepted tokens were accepted by the target model
-void common_speculative_accept(common_speculative * spec, llama_seq_id, uint16_t n_accepted);
+// informs the speculative decoder that n_accepted tokens were accepted by the target model
+void common_speculative_accept(common_speculative * spec, uint16_t n_accepted);

 // print statistics about the speculative decoding
 void common_speculative_print_stats(const common_speculative * spec);
--- a/conversion/init.py
+++ b/conversion/init.py
@@ -1,333 +0,0 @@
-from __future__ import annotations
-
-from .base import (
-    ModelBase, TextModel, MmprojModel, ModelType, SentencePieceTokenTypes,
-    logger, _mistral_common_installed, _mistral_import_error_msg,
-    get_model_architecture, LazyTorchTensor,
-)
-from typing import Type
-
-
-__all__ = [
-    "ModelBase", "TextModel", "MmprojModel", "ModelType", "SentencePieceTokenTypes",
-    "get_model_architecture", "LazyTorchTensor", "logger",
-    "_mistral_common_installed", "_mistral_import_error_msg",
-    "get_model_class", "print_registered_models", "load_all_models",
-]
-
-
-TEXT_MODEL_MAP: dict[str, str] = {
-    "AfmoeForCausalLM": "afmoe",
-    "ApertusForCausalLM": "llama",
-    "ArceeForCausalLM": "llama",
-    "ArcticForCausalLM": "arctic",
-    "AudioFlamingo3ForConditionalGeneration": "qwen",
-    "BaiChuanForCausalLM": "baichuan",
-    "BaichuanForCausalLM": "baichuan",
-    "BailingMoeForCausalLM": "bailingmoe",
-    "BailingMoeV2ForCausalLM": "bailingmoe",
-    "BambaForCausalLM": "granite",
-    "BertForMaskedLM": "bert",
-    "BertForSequenceClassification": "bert",
-    "BertModel": "bert",
-    "BitnetForCausalLM": "bitnet",
-    "BloomForCausalLM": "bloom",
-    "BloomModel": "bloom",
-    "CamembertModel": "bert",
-    "ChameleonForCausalLM": "chameleon",
-    "ChameleonForConditionalGeneration": "chameleon",
-    "ChatGLMForConditionalGeneration": "chatglm",
-    "ChatGLMModel": "chatglm",
-    "CodeShellForCausalLM": "codeshell",
-    "CogVLMForCausalLM": "cogvlm",
-    "Cohere2ForCausalLM": "command_r",
-    "CohereForCausalLM": "command_r",
-    "DbrxForCausalLM": "dbrx",
-    "DeciLMForCausalLM": "deci",
-    "DeepseekForCausalLM": "deepseek",
-    "DeepseekV2ForCausalLM": "deepseek",
-    "DeepseekV3ForCausalLM": "deepseek",
-    "DistilBertForMaskedLM": "bert",
-    "DistilBertForSequenceClassification": "bert",
-    "DistilBertModel": "bert",
-    "Dots1ForCausalLM": "dots1",
-    "DotsOCRForCausalLM": "qwen",
-    "DreamModel": "dream",
-    "Ernie4_5ForCausalLM": "ernie",
-    "Ernie4_5_ForCausalLM": "ernie",
-    "Ernie4_5_MoeForCausalLM": "ernie",
-    "EuroBertModel": "bert",
-    "Exaone4ForCausalLM": "exaone",
-    "ExaoneForCausalLM": "exaone",
-    "ExaoneMoEForCausalLM": "exaone",
-    "FalconForCausalLM": "falcon",
-    "FalconH1ForCausalLM": "falcon_h1",
-    "FalconMambaForCausalLM": "mamba",
-    "GPT2LMHeadModel": "gpt2",
-    "GPTBigCodeForCausalLM": "starcoder",
-    "GPTNeoXForCausalLM": "gptneox",
-    "GPTRefactForCausalLM": "refact",
-    "Gemma2ForCausalLM": "gemma",
-    "Gemma3ForCausalLM": "gemma",
-    "Gemma3ForConditionalGeneration": "gemma",
-    "Gemma3TextModel": "gemma",
-    "Gemma3nForCausalLM": "gemma",
-    "Gemma3nForConditionalGeneration": "gemma",
-    "Gemma4ForConditionalGeneration": "gemma",
-    "GemmaForCausalLM": "gemma",
-    "Glm4ForCausalLM": "glm",
-    "Glm4MoeForCausalLM": "glm",
-    "Glm4MoeLiteForCausalLM": "glm",
-    "Glm4vForConditionalGeneration": "glm",
-    "Glm4vMoeForConditionalGeneration": "glm",
-    "GlmForCausalLM": "chatglm",
-    "GlmMoeDsaForCausalLM": "glm",
-    "GlmOcrForConditionalGeneration": "glm",
-    "GptOssForCausalLM": "gpt_oss",
-    "GraniteForCausalLM": "granite",
-    "GraniteMoeForCausalLM": "granite",
-    "GraniteMoeHybridForCausalLM": "granite",
-    "GraniteMoeSharedForCausalLM": "granite",
-    "GraniteSpeechForConditionalGeneration": "granite",
-    "Grok1ForCausalLM": "grok",
-    "GrokForCausalLM": "grok",
-    "GroveMoeForCausalLM": "grovemoe",
-    "HunYuanDenseV1ForCausalLM": "hunyuan",
-    "HunYuanMoEV1ForCausalLM": "hunyuan",
-    "HunYuanVLForConditionalGeneration": "hunyuan",
-    "IQuestCoderForCausalLM": "llama",
-    "InternLM2ForCausalLM": "internlm",
-    "InternLM3ForCausalLM": "internlm",
-    "JAISLMHeadModel": "jais",
-    "Jais2ForCausalLM": "jais",
-    "JambaForCausalLM": "jamba",
-    "JanusForConditionalGeneration": "januspro",
-    "JinaBertForMaskedLM": "bert",
-    "JinaBertModel": "bert",
-    "JinaEmbeddingsV5Model": "bert",
-    "KORMoForCausalLM": "qwen",
-    "KimiK25ForConditionalGeneration": "deepseek",
-    "KimiLinearForCausalLM": "kimi_linear",
-    "KimiLinearModel": "kimi_linear",
-    "KimiVLForConditionalGeneration": "deepseek",
-    "LFM2ForCausalLM": "lfm2",
-    "LLaDAMoEModel": "llada",
-    "LLaDAMoEModelLM": "llada",
-    "LLaDAModelLM": "llada",
-    "LLaMAForCausalLM": "llama",
-    "Lfm25AudioTokenizer": "lfm2",
-    "Lfm2ForCausalLM": "lfm2",
-    "Lfm2Model": "lfm2",
-    "Lfm2MoeForCausalLM": "lfm2",
-    "Llama4ForCausalLM": "llama",
-    "Llama4ForConditionalGeneration": "llama",
-    "LlamaBidirectionalModel": "llama",
-    "LlamaForCausalLM": "llama",
-    "LlamaModel": "llama",
-    "LlavaForConditionalGeneration": "llama",
-    "LlavaStableLMEpochForCausalLM": "stablelm",
-    "MPTForCausalLM": "mpt",
-    "MT5ForConditionalGeneration": "t5",
-    "MaincoderForCausalLM": "maincoder",
-    "Mamba2ForCausalLM": "mamba",
-    "MambaForCausalLM": "mamba",
-    "MambaLMHeadModel": "mamba",
-    "MiMoV2FlashForCausalLM": "mimo",
-    "MiMoV2ForCausalLM": "mimo",
-    "MiniCPM3ForCausalLM": "minicpm",
-    "MiniCPMForCausalLM": "minicpm",
-    "MiniCPMV4_6ForConditionalGeneration": "minicpm",
-    "MiniMaxM2ForCausalLM": "minimax",
-    "Ministral3ForCausalLM": "mistral3",
-    "Mistral3ForConditionalGeneration": "mistral3",
-    "MistralForCausalLM": "llama",
-    "MixtralForCausalLM": "llama",
-    "ModernBertForMaskedLM": "bert",
-    "ModernBertForSequenceClassification": "bert",
-    "ModernBertModel": "bert",
-    "NemotronForCausalLM": "nemotron",
-    "NemotronHForCausalLM": "nemotron",
-    "NeoBERT": "bert",
-    "NeoBERTForSequenceClassification": "bert",
-    "NeoBERTLMHead": "bert",
-    "NomicBertModel": "bert",
-    "OLMoForCausalLM": "olmo",
-    "Olmo2ForCausalLM": "olmo",
-    "Olmo3ForCausalLM": "olmo",
-    "OlmoForCausalLM": "olmo",
-    "OlmoeForCausalLM": "olmo",
-    "OpenELMForCausalLM": "openelm",
-    "OrionForCausalLM": "orion",
-    "PLMForCausalLM": "plm",
-    "PLaMo2ForCausalLM": "plamo",
-    "PLaMo3ForCausalLM": "plamo",
-    "PaddleOCRVLForConditionalGeneration": "ernie",
-    "PanguEmbeddedForCausalLM": "pangu",
-    "Phi3ForCausalLM": "phi",
-    "Phi4ForCausalLMV": "phi",
-    "PhiForCausalLM": "phi",
-    "PhiMoEForCausalLM": "phi",
-    "Plamo2ForCausalLM": "plamo",
-    "Plamo3ForCausalLM": "plamo",
-    "PlamoForCausalLM": "plamo",
-    "QWenLMHeadModel": "qwen",
-    "Qwen2AudioForConditionalGeneration": "qwen",
-    "Qwen2ForCausalLM": "qwen",
-    "Qwen2Model": "qwen",
-    "Qwen2MoeForCausalLM": "qwen",
-    "Qwen2VLForConditionalGeneration": "qwenvl",
-    "Qwen2VLModel": "qwenvl",
-    "Qwen2_5OmniModel": "qwenvl",
-    "Qwen2_5_VLForConditionalGeneration": "qwenvl",
-    "Qwen3ASRForConditionalGeneration": "qwen3vl",
-    "Qwen3ForCausalLM": "qwen",
-    "Qwen3Model": "qwen",
-    "Qwen3MoeForCausalLM": "qwen",
-    "Qwen3NextForCausalLM": "qwen",
-    "Qwen3OmniMoeForConditionalGeneration": "qwen3vl",
-    "Qwen3VLForConditionalGeneration": "qwen3vl",
-    "Qwen3VLMoeForConditionalGeneration": "qwen3vl",
-    "Qwen3_5ForCausalLM": "qwen",
-    "Qwen3_5ForConditionalGeneration": "qwen",
-    "Qwen3_5MoeForCausalLM": "qwen",
-    "Qwen3_5MoeForConditionalGeneration": "qwen",
-    "RND1": "qwen",
-    "RWForCausalLM": "falcon",
-    "RWKV6Qwen2ForCausalLM": "rwkv",
-    "RWKV7ForCausalLM": "rwkv",
-    "RobertaForSequenceClassification": "bert",
-    "RobertaModel": "bert",
-    "RuGPT3XLForCausalLM": "gpt2",
-    "Rwkv6ForCausalLM": "rwkv",
-    "Rwkv7ForCausalLM": "rwkv",
-    "RwkvHybridForCausalLM": "rwkv",
-    "Sarashina2VisionForCausalLM": "sarashina2",
-    "SarvamMoEForCausalLM": "bailingmoe",
-    "SeedOssForCausalLM": "olmo",
-    "SmallThinkerForCausalLM": "smallthinker",
-    "SmolLM3ForCausalLM": "llama",
-    "SolarOpenForCausalLM": "glm",
-    "StableLMEpochForCausalLM": "stablelm",
-    "StableLmForCausalLM": "stablelm",
-    "Starcoder2ForCausalLM": "starcoder",
-    "Step3p5ForCausalLM": "step3",
-    "StepVLForConditionalGeneration": "step3",
-    "T5EncoderModel": "t5",
-    "T5ForConditionalGeneration": "t5",
-    "T5WithLMHeadModel": "t5",
-    "UMT5ForConditionalGeneration": "t5",
-    "UMT5Model": "t5",
-    "UltravoxModel": "ultravox",
-    "VLlama3ForCausalLM": "llama",
-    "VoxtralForConditionalGeneration": "llama",
-    "WavTokenizerDec": "wavtokenizer",
-    "XLMRobertaForSequenceClassification": "bert",
-    "XLMRobertaModel": "bert",
-    "XverseForCausalLM": "xverse",
-    "YoutuForCausalLM": "deepseek",
-    "YoutuVLForConditionalGeneration": "deepseek",
-    "modeling_grove_moe.GroveMoeForCausalLM": "grovemoe",
-    "modeling_sarvam_moe.SarvamMoEForCausalLM": "bailingmoe",
-}
-
-
-MMPROJ_MODEL_MAP: dict[str, str] = {
-    "AudioFlamingo3ForConditionalGeneration": "ultravox",
-    "CogVLMForCausalLM": "cogvlm",
-    "DeepseekOCRForCausalLM": "deepseek",
-    "DotsOCRForCausalLM": "dotsocr",
-    "Gemma3ForConditionalGeneration": "gemma",
-    "Gemma3nForConditionalGeneration": "gemma",
-    "Gemma4ForConditionalGeneration": "gemma",
-    "Glm4vForConditionalGeneration": "qwen3vl",
-    "Glm4vMoeForConditionalGeneration": "qwen3vl",
-    "GlmOcrForConditionalGeneration": "qwen3vl",
-    "GlmasrModel": "ultravox",
-    "GraniteSpeechForConditionalGeneration": "granite",
-    "HunYuanVLForConditionalGeneration": "hunyuan",
-    "Idefics3ForConditionalGeneration": "smolvlm",
-    "InternVisionModel": "internvl",
-    "JanusForConditionalGeneration": "januspro",
-    "KimiK25ForConditionalGeneration": "kimivl",
-    "KimiVLForConditionalGeneration": "kimivl",
-    "Lfm2AudioForConditionalGeneration": "lfm2",
-    "Lfm2VlForConditionalGeneration": "lfm2",
-    "LightOnOCRForConditionalGeneration": "lighton_ocr",
-    "Llama4ForConditionalGeneration": "llama4",
-    "LlavaForConditionalGeneration": "llava",
-    "MERaLiON2ForConditionalGeneration": "ultravox",
-    "MiMoV2ForCausalLM": "mimo",
-    "MiniCPMV4_6ForConditionalGeneration": "minicpm",
-    "Mistral3ForConditionalGeneration": "llava",
-    "NemotronH_Nano_VL_V2": "nemotron",
-    "PaddleOCRVisionModel": "ernie",
-    "Phi4ForCausalLMV": "phi",
-    "Qwen2AudioForConditionalGeneration": "ultravox",
-    "Qwen2VLForConditionalGeneration": "qwenvl",
-    "Qwen2VLModel": "qwenvl",
-    "Qwen2_5OmniModel": "qwenvl",
-    "Qwen2_5_VLForConditionalGeneration": "qwenvl",
-    "Qwen3ASRForConditionalGeneration": "qwen3vl",
-    "Qwen3OmniMoeForConditionalGeneration": "qwen3vl",
-    "Qwen3VLForConditionalGeneration": "qwen3vl",
-    "Qwen3VLMoeForConditionalGeneration": "qwen3vl",
-    "Qwen3_5ForConditionalGeneration": "qwen3vl",
-    "Qwen3_5MoeForConditionalGeneration": "qwen3vl",
-    "RADIOModel": "nemotron",
-    "Sarashina2VisionForCausalLM": "sarashina2",
-    "SmolVLMForConditionalGeneration": "smolvlm",
-    "StepVLForConditionalGeneration": "step3",
-    "UltravoxModel": "ultravox",
-    "VoxtralForConditionalGeneration": "ultravox",
-    "YoutuVLForConditionalGeneration": "youtuvl",
-}
-
-
-_TEXT_MODEL_MODULES = sorted(set(TEXT_MODEL_MAP.values()))
-_MMPROJ_MODEL_MODULES = sorted(set(MMPROJ_MODEL_MAP.values()))
-
-
-_loaded_text_modules: set[str] = set()
-_loaded_mmproj_modules: set[str] = set()
-
-
-def load_all_models() -> None:
-    """Import all model modules to trigger @ModelBase.register() decorators."""
-    if len(_loaded_text_modules) != len(_TEXT_MODEL_MODULES):
-        for module_name in _TEXT_MODEL_MODULES:
-            if module_name not in _loaded_text_modules:
-                try:
-                    __import__(f"conversion.{module_name}")
-                    _loaded_text_modules.add(module_name)
-                except Exception as e:
-                    logger.warning(f"Failed to load model module {module_name}: {e}")
-
-    if len(_loaded_mmproj_modules) != len(_MMPROJ_MODEL_MODULES):
-        for module_name in _MMPROJ_MODEL_MODULES:
-            if module_name not in _loaded_mmproj_modules:
-                try:
-                    __import__(f"conversion.{module_name}")
-                    _loaded_mmproj_modules.add(module_name)
-                except Exception as e:
-                    logger.warning(f"Failed to load model module {module_name}: {e}")
-
-
-def get_model_class(name: str, mmproj: bool = False) -> Type[ModelBase]:
-    """Dynamically import and return a model class by its HuggingFace architecture name."""
-    relevant_map = MMPROJ_MODEL_MAP if mmproj else TEXT_MODEL_MAP
-    if name not in relevant_map:
-        raise NotImplementedError(f"Architecture {name!r} not supported!")
-    module_name = relevant_map[name]
-    __import__(f"conversion.{module_name}")
-    model_type = ModelType.MMPROJ if mmproj else ModelType.TEXT
-    return ModelBase._model_classes[model_type][name]
-
-
-def print_registered_models() -> None:
-    load_all_models()
-    logger.error("TEXT models:")
-    for name in sorted(TEXT_MODEL_MAP.keys()):
-        logger.error(f"  - {name}")
-    logger.error("MMPROJ models:")
-    for name in sorted(MMPROJ_MODEL_MAP.keys()):
-        logger.error(f"  - {name}")
--- a/conversion/afmoe.py
+++ b/conversion/afmoe.py
@@ -1,79 +0,0 @@
-from __future__ import annotations
-
-from typing import Callable, Iterable, TYPE_CHECKING
-
-import torch
-
-if TYPE_CHECKING:
-    from torch import Tensor
-
-from .base import ModelBase, gguf
-
-from .llama import LlamaModel
-
-
-@ModelBase.register("AfmoeForCausalLM")
-class AfmoeModel(LlamaModel):
-    model_arch = gguf.MODEL_ARCH.AFMOE
-
-    def set_gguf_parameters(self):
-        super().set_gguf_parameters()
-
-        # MoE parameters
-        if (n_shared_experts := self.hparams.get("num_shared_experts")) is not None:
-            self.gguf_writer.add_expert_shared_count(n_shared_experts)
-        if (moe_intermediate_size := self.hparams.get("moe_intermediate_size")) is not None:
-            self.gguf_writer.add_expert_feed_forward_length(moe_intermediate_size)
-        if (n_dense_layers := self.hparams.get("num_dense_layers")) is not None:
-            self.gguf_writer.add_leading_dense_block_count(n_dense_layers)
-
-        # Route normalization and scaling
-        if (route_norm := self.hparams.get("route_norm")) is not None:
-            self.gguf_writer.add_expert_weights_norm(route_norm)
-        if (route_scale := self.hparams.get("route_scale")) is not None:
-            self.gguf_writer.add_expert_weights_scale(route_scale)
-
-        # Sliding window attention
-        if (sliding_window := self.hparams.get("sliding_window")) is not None:
-            self.gguf_writer.add_sliding_window(sliding_window)
-
-    @classmethod
-    def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None:
-        name, gen = item
-
-        if name.endswith(".expert_bias"):
-            name = name.replace(".expert_bias", ".expert_bias.bias")
-
-        return super().filter_tensors((name, gen))
-
-    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
-        # Handle expert weights - they're already merged in the HF format
-        # process the experts separately
-        if name.find("mlp.experts") != -1:
-            n_experts = self.find_hparam(["num_local_experts", "num_experts"])
-            assert bid is not None
-
-            if self._experts is None:
-                self._experts = [{} for _ in range(self.block_count)]
-
-            self._experts[bid][name] = data_torch
-
-            if len(self._experts[bid]) >= n_experts * 3:
-                # merge the experts into a single 3d tensor
-                for w_name in ["gate_proj", "up_proj", "down_proj"]:
-                    datas: list[Tensor] = []
-
-                    for xid in range(n_experts):
-                        ename_to_retrieve = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight"
-                        datas.append(self._experts[bid][ename_to_retrieve])
-                        del self._experts[bid][ename_to_retrieve]
-
-                    data_torch = torch.stack(datas, dim=0)
-                    merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight"
-                    yield from ModelBase.modify_tensors(self, data_torch, merged_name, bid)
-
-                return
-            else:
-                return
-
-        yield from ModelBase.modify_tensors(self, data_torch, name, bid)
--- a/conversion/arctic.py
+++ b/conversion/arctic.py
@@ -1,162 +0,0 @@
-from __future__ import annotations
-
-import json
-import sys
-
-from typing import Iterable, TYPE_CHECKING
-
-import torch
-
-if TYPE_CHECKING:
-    from torch import Tensor
-
-from .base import ModelBase, SentencePieceTokenTypes, TextModel, gguf, logger
-
-from .llama import LlamaModel
-
-
-@ModelBase.register("ArcticForCausalLM")
-class ArcticModel(TextModel):
-    model_arch = gguf.MODEL_ARCH.ARCTIC
-
-    def set_vocab(self):
-        # The reason for using a custom implementation here is that the
-        # snowflake-arctic-instruct model redefined tokens 31998 and 31999 from
-        # tokenizer.model and used them as BOS and EOS instead of adding new tokens.
-        from sentencepiece import SentencePieceProcessor
-
-        tokenizer_path = self.dir_model / 'tokenizer.model'
-
-        if not tokenizer_path.is_file():
-            logger.error(f'Error: Missing {tokenizer_path}')
-            sys.exit(1)
-
-        # Read the whole vocabulary from the tokenizer.model file
-        tokenizer = SentencePieceProcessor()
-        tokenizer.LoadFromFile(str(tokenizer_path))
-
-        vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size())
-
-        tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
-        scores: list[float] = [-10000.0] * vocab_size
-        toktypes: list[int] = [SentencePieceTokenTypes.UNUSED] * vocab_size
-
-        for token_id in range(tokenizer.vocab_size()):
-
-            piece = tokenizer.IdToPiece(token_id)
-            text = piece.encode("utf-8")
-            score = tokenizer.GetScore(token_id)
-
-            toktype = SentencePieceTokenTypes.NORMAL
-            if tokenizer.IsUnknown(token_id):
-                toktype = SentencePieceTokenTypes.UNKNOWN
-            elif tokenizer.IsControl(token_id):
-                toktype = SentencePieceTokenTypes.CONTROL
-            elif tokenizer.IsUnused(token_id):
-                toktype = SentencePieceTokenTypes.UNUSED
-            elif tokenizer.IsByte(token_id):
-                toktype = SentencePieceTokenTypes.BYTE
-
-            tokens[token_id] = text
-            scores[token_id] = score
-            toktypes[token_id] = toktype
-
-        # Use the added_tokens_decoder field from tokeniser_config.json as the source
-        # of information about added/redefined tokens and modify them accordingly.
-        tokenizer_config_file = self.dir_model / 'tokenizer_config.json'
-        if tokenizer_config_file.is_file():
-            with open(tokenizer_config_file, "r", encoding="utf-8") as f:
-                tokenizer_config_json = json.load(f)
-
-                if "added_tokens_decoder" in tokenizer_config_json:
-                    added_tokens_decoder = tokenizer_config_json["added_tokens_decoder"]
-                    for token_id, token_json in added_tokens_decoder.items():
-                        token_id = int(token_id)
-                        if token_id >= vocab_size:
-                            logger.debug(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}')
-                            continue
-
-                        token_content = token_json["content"]
-                        token_type = SentencePieceTokenTypes.USER_DEFINED
-                        token_score = -10000.0
-
-                        # Map unk_token to UNKNOWN, other special tokens to CONTROL
-                        # Set the score to 0.0 as in the original tokenizer.model
-                        if ("special" in token_json) and token_json["special"]:
-                            if token_content == tokenizer_config_json["unk_token"]:
-                                token_type = SentencePieceTokenTypes.UNKNOWN
-                            else:
-                                token_type = SentencePieceTokenTypes.CONTROL
-                            token_score = 0.0
-
-                        logger.info(f"Setting added token {token_id} to '{token_content}' (type: {token_type}, score: {token_score:.2f})")
-                        tokens[token_id] = token_content.encode("utf-8")
-                        toktypes[token_id] = token_type
-                        scores[token_id] = token_score
-
-        self.gguf_writer.add_tokenizer_model("llama")
-        self.gguf_writer.add_tokenizer_pre("default")
-        self.gguf_writer.add_token_list(tokens)
-        self.gguf_writer.add_token_scores(scores)
-        self.gguf_writer.add_token_types(toktypes)
-
-        special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
-        special_vocab.add_to_gguf(self.gguf_writer)
-
-    def set_gguf_parameters(self):
-        super().set_gguf_parameters()
-        hparams = self.hparams
-        self.gguf_writer.add_vocab_size(hparams["vocab_size"])
-        self.gguf_writer.add_rope_dimension_count(hparams["hidden_size"] // hparams["num_attention_heads"])
-
-    _experts: list[dict[str, Tensor]] | None = None
-
-    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
-        n_head = self.hparams["num_attention_heads"]
-        n_kv_head = self.hparams.get("num_key_value_heads")
-
-        if name.endswith("q_proj.weight"):
-            data_torch = LlamaModel.permute(data_torch, n_head, n_head)
-        if name.endswith("k_proj.weight"):
-            data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head)
-
-        # process the experts separately
-        if name.find("block_sparse_moe.experts") != -1:
-            n_experts = self.hparams["num_local_experts"]
-
-            assert bid is not None
-
-            if self._experts is None:
-                self._experts = [{} for _ in range(self.block_count)]
-
-            self._experts[bid][name] = data_torch
-
-            if len(self._experts[bid]) >= n_experts * 3:
-                # merge the experts into a single 3d tensor
-                for wid in ["w1", "w2", "w3"]:
-                    datas: list[Tensor] = []
-
-                    for xid in range(n_experts):
-                        ename = f"model.layers.{bid}.block_sparse_moe.experts.{xid}.{wid}.weight"
-                        datas.append(self._experts[bid][ename])
-                        del self._experts[bid][ename]
-
-                    data_torch = torch.stack(datas, dim=0)
-
-                    merged_name = f"layers.{bid}.feed_forward.experts.{wid}.weight"
-
-                    yield from super().modify_tensors(data_torch, merged_name, bid)
-                return
-            else:
-                return
-
-        yield from super().modify_tensors(data_torch, name, bid)
-
-    def prepare_tensors(self):
-        super().prepare_tensors()
-
-        if self._experts is not None:
-            # flatten `list[dict[str, Tensor]]` into `list[str]`
-            experts = [k for d in self._experts for k in d.keys()]
-            if len(experts) > 0:
-                raise ValueError(f"Unprocessed experts: {experts}")
--- a/conversion/baichuan.py
+++ b/conversion/baichuan.py
@@ -1,59 +0,0 @@
-from __future__ import annotations
-
-from typing import Iterable, TYPE_CHECKING
-
-if TYPE_CHECKING:
-    from torch import Tensor
-
-from .base import ModelBase, TextModel, gguf, logger
-
-
-@ModelBase.register("BaichuanForCausalLM", "BaiChuanForCausalLM")
-class BaichuanModel(TextModel):
-    model_arch = gguf.MODEL_ARCH.BAICHUAN
-
-    def set_vocab(self):
-        self._set_vocab_sentencepiece()
-
-    def set_gguf_parameters(self):
-        super().set_gguf_parameters()
-
-        self.gguf_writer.add_tensor_data_layout("Meta AI original pth")
-        self.gguf_writer.add_rope_dimension_count(self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
-
-    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
-        head_count = self.hparams["num_attention_heads"]
-        head_count_kv = self.hparams.get("num_key_value_heads", head_count)
-
-        if bid is not None and name == f"model.layers.{bid}.self_attn.W_pack.weight":
-            logger.info(f"Unpacking and permuting layer {bid}")
-            yield from [
-                (self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_Q, bid),
-                    self._reverse_hf_permute_part(data_torch, 0, head_count, head_count)),
-                (self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_K, bid),
-                    self._reverse_hf_permute_part(data_torch, 1, head_count, head_count_kv)),
-                (self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_V, bid),
-                    self._reverse_hf_part(data_torch, 2)),
-            ]
-        else:
-            yield from self.modify_tensors(data_torch, self.map_tensor_name(name), bid)
-
-    def _reverse_hf_permute(self, weights: Tensor, n_head: int, n_kv_head: int | None = None) -> Tensor:
-        if n_kv_head is not None and n_head != n_kv_head:
-            n_head //= n_kv_head
-
-        return (
-            weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
-            .swapaxes(1, 2)
-            .reshape(weights.shape)
-        )
-
-    def _reverse_hf_permute_part(
-        self, weights: Tensor, n_part: int, n_head: int, n_head_kv: int | None = None,
-    ) -> Tensor:
-        r = weights.shape[0] // 3
-        return self._reverse_hf_permute(weights[r * n_part:r * n_part + r, ...], n_head, n_head_kv)
-
-    def _reverse_hf_part(self, weights: Tensor, n_part: int) -> Tensor:
-        r = weights.shape[0] // 3
-        return weights[r * n_part:r * n_part + r, ...]
--- a/conversion/bailingmoe.py
+++ b/conversion/bailingmoe.py
@@ -1,216 +0,0 @@
-from __future__ import annotations
-
-from typing import Callable, Iterable, TYPE_CHECKING
-
-import torch
-
-if TYPE_CHECKING:
-    from torch import Tensor
-
-from .base import ModelBase, TextModel, gguf
-
-
-@ModelBase.register("BailingMoeForCausalLM")
-class BailingMoeModel(TextModel):
-    model_arch = gguf.MODEL_ARCH.BAILINGMOE
-
-    def set_vocab(self):
-        self._set_vocab_gpt2()
-
-    def set_gguf_parameters(self):
-        super().set_gguf_parameters()
-        hparams = self.hparams
-        if (rope_dim := hparams.get("head_dim")) is None:
-            rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]
-
-        self.gguf_writer.add_rope_dimension_count(rope_dim)
-        self.gguf_writer.add_leading_dense_block_count(hparams["first_k_dense_replace"])
-        self.gguf_writer.add_vocab_size(hparams["vocab_size"])
-        self.gguf_writer.add_expert_feed_forward_length(hparams["moe_intermediate_size"])
-        self.gguf_writer.add_expert_weights_scale(1.0)
-        self.gguf_writer.add_expert_shared_count(hparams["num_shared_experts"])
-        self.gguf_writer.add_expert_weights_norm(hparams["norm_topk_prob"])
-
-    _experts: list[dict[str, Tensor]] | None = None
-
-    @staticmethod
-    def permute(weights: Tensor, n_head: int, n_head_kv: int | None):
-        if n_head_kv is not None and n_head != n_head_kv:
-            n_head = n_head_kv
-        return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
-                .swapaxes(1, 2)
-                .reshape(weights.shape))
-
-    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
-        n_head = self.hparams["num_attention_heads"]
-        n_kv_head = self.hparams.get("num_key_value_heads")
-        n_embd = self.hparams["hidden_size"]
-        if (head_dim := self.hparams.get("head_dim")) is None:
-            head_dim = n_embd // n_head
-
-        output_name = self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT)
-
-        if name.endswith("attention.dense.weight"):
-            yield from super().modify_tensors(data_torch, self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_OUT, bid), bid)
-            return
-        elif name.endswith("query_key_value.weight"):
-            q, k, v = data_torch.split([n_head * head_dim, n_kv_head * head_dim, n_kv_head * head_dim], dim=-2)
-
-            yield from super().modify_tensors(BailingMoeModel.permute(q, n_head, n_head), self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_Q, bid), bid)
-            yield from super().modify_tensors(BailingMoeModel.permute(k, n_head, n_kv_head), self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_K, bid), bid)
-            yield from super().modify_tensors(v,self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_V, bid), bid)
-            return
-        elif name.find("mlp.experts") != -1:
-            n_experts = self.find_hparam(["num_local_experts", "num_experts"])
-            assert bid is not None
-
-            if self._experts is None:
-                self._experts = [{} for _ in range(self.block_count)]
-
-            self._experts[bid][name] = data_torch
-
-            if len(self._experts[bid]) >= n_experts * 3:
-                # merge the experts into a single 3d tensor
-                for w_name in ["down_proj", "gate_proj", "up_proj"]:
-                    datas: list[Tensor] = []
-
-                    for xid in range(n_experts):
-                        ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight"
-                        datas.append(self._experts[bid][ename])
-                        del self._experts[bid][ename]
-
-                    data_torch = torch.stack(datas, dim=0)
-
-                    merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight"
-
-                    new_name = self.map_tensor_name(merged_name)
-
-                    yield from super().modify_tensors(data_torch, new_name, bid)
-
-            return
-
-        new_name = self.map_tensor_name(name)
-
-        if new_name == output_name and self.hparams.get("norm_head"):
-            data_torch = data_torch.float()
-            data_torch /= torch.norm(data_torch, p=2, dim=0, keepdim=True) + 1e-7
-
-        yield from super().modify_tensors(data_torch, new_name, bid)
-
-    def prepare_tensors(self):
-        super().prepare_tensors()
-
-        if self._experts is not None:
-            # flatten `list[dict[str, Tensor]]` into `list[str]`
-            experts = [k for d in self._experts for k in d.keys()]
-            if len(experts) > 0:
-                raise ValueError(f"Unprocessed experts: {experts}")
-
-
-@ModelBase.register("BailingMoeV2ForCausalLM")
-class BailingMoeV2Model(TextModel):
-    model_arch = gguf.MODEL_ARCH.BAILINGMOE2
-
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-        if nextn_layers := self.hparams.get("num_nextn_predict_layers", 0):
-            self.block_count = self.hparams["num_hidden_layers"] + nextn_layers
-            self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)
-
-    def set_vocab(self):
-        self._set_vocab_gpt2()
-
-    def set_gguf_parameters(self):
-        super().set_gguf_parameters()
-        hparams = self.hparams
-        if (rope_dim := hparams.get("head_dim")) is None:
-            rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]
-
-        self.gguf_writer.add_rope_dimension_count(int(rope_dim * self.hparams.get("partial_rotary_factor", 0.5)))
-        self.gguf_writer.add_leading_dense_block_count(hparams["first_k_dense_replace"])
-        self.gguf_writer.add_vocab_size(hparams["vocab_size"])
-        self.gguf_writer.add_expert_feed_forward_length(hparams["moe_intermediate_size"])
-        self.gguf_writer.add_expert_shared_feed_forward_length(hparams.get("moe_shared_expert_intermediate_size", hparams["moe_intermediate_size"] * hparams["num_shared_experts"]))
-        self.gguf_writer.add_expert_weights_scale(hparams["routed_scaling_factor"])
-        self.gguf_writer.add_expert_shared_count(hparams["num_shared_experts"])
-        self.gguf_writer.add_expert_weights_norm(hparams["norm_topk_prob"])
-
-        if (nextn_layers := self.hparams.get("num_nextn_predict_layers")) is not None:
-            self.gguf_writer.add_nextn_predict_layers(nextn_layers)
-
-    _experts: list[dict[str, Tensor]] | None = None
-
-    @classmethod
-    def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None:
-        name, gen = item
-
-        if name.endswith(".expert_bias"):
-            name = name.replace(".expert_bias", ".expert_bias.bias")
-
-        return super().filter_tensors((name, gen))
-
-    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
-        if "mlp.experts" in name:
-            n_experts = self.find_hparam(["num_local_experts", "num_experts"])
-            assert bid is not None
-
-            if self._experts is None:
-                self._experts = [{} for _ in range(self.block_count)]
-
-            self._experts[bid][name] = data_torch
-
-            if len(self._experts[bid]) >= n_experts * 3:
-                # merge the experts into a single 3d tensor
-                for w_name in ["down_proj", "gate_proj", "up_proj"]:
-                    datas: list[Tensor] = []
-
-                    for xid in range(n_experts):
-                        ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight"
-                        datas.append(self._experts[bid][ename])
-                        del self._experts[bid][ename]
-
-                    data_torch = torch.stack(datas, dim=0)
-
-                    merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight"
-
-                    yield from super().modify_tensors(data_torch, merged_name, bid)
-            return
-
-        yield from super().modify_tensors(data_torch, name, bid)
-
-    def prepare_tensors(self):
-        super().prepare_tensors()
-
-        if self._experts is not None:
-            # flatten `list[dict[str, Tensor]]` into `list[str]`
-            experts = [k for d in self._experts for k in d.keys()]
-            if len(experts) > 0:
-                raise ValueError(f"Unprocessed experts: {experts}")
-
-
-@ModelBase.register("SarvamMoEForCausalLM", "modeling_sarvam_moe.SarvamMoEForCausalLM")
-class SarvamMoEModel(BailingMoeV2Model):
-    model_arch = gguf.MODEL_ARCH.BAILINGMOE2
-    # Sarvam-MoE shares the BailingMoeV2 architecture; only differences:
-    #  - full rotary (no partial_rotary_factor)
-    #  - expert bias is zero-mean normalized at load time
-
-    def set_gguf_parameters(self):
-        super().set_gguf_parameters()
-        hparams = self.hparams
-        if (rope_dim := hparams.get("head_dim")) is None:
-            rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]
-        # Override the partial-rotary value written by BailingMoeV2 with the full rotary dim
-        self.gguf_writer.add_rope_dimension_count(rope_dim)
-
-    @classmethod
-    def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None:
-        name, gen = item
-        if name.endswith(".expert_bias"):
-            # Sarvam normalizes expert bias to zero mean
-            inner = gen
-
-            def gen():
-                t = inner()
-                return t - t.mean()
-        return super().filter_tensors((name, gen))
--- a/conversion/base.py
+++ b/conversion/base.py
--- a/conversion/bert.py
+++ b/conversion/bert.py
@@ -1,616 +0,0 @@
-from __future__ import annotations
-
-import json
-import os
-
-from pathlib import Path
-from typing import Any, Callable, Iterable, TYPE_CHECKING
-
-import torch
-
-if TYPE_CHECKING:
-    from torch import Tensor
-
-from .base import ModelBase, SentencePieceTokenTypes, TextModel, gguf, logger
-
-
-@ModelBase.register("BertModel", "BertForMaskedLM", "CamembertModel", "BertForSequenceClassification")
-class BertModel(TextModel):
-    model_arch = gguf.MODEL_ARCH.BERT
-
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-        self.vocab_size = None
-
-        if cls_out_labels := self.hparams.get("id2label"):
-            if len(cls_out_labels) == 2 and cls_out_labels[0] == "LABEL_0":
-                # Remove dummy labels added by AutoConfig
-                cls_out_labels = None
-        self.cls_out_labels = cls_out_labels
-
-    def set_gguf_parameters(self):
-        super().set_gguf_parameters()
-        self.gguf_writer.add_causal_attention(False)
-        self._try_set_pooling_type()
-
-        if self.cls_out_labels:
-            self.gguf_writer.add_classifier_output_labels([v for k, v in sorted(self.cls_out_labels.items())])
-
-    def set_vocab(self):
-        tokens, toktypes, tokpre = self.get_vocab_base()
-        self.vocab_size = len(tokens)
-
-        # we need this to validate the size of the token_type embeddings
-        # though currently we are passing all zeros to the token_type embeddings
-        # "Sequence A" or "Sequence B"
-        self.gguf_writer.add_token_type_count(self.hparams.get("type_vocab_size", 1))
-
-        # convert to phantom space vocab
-        def phantom(tok, toktype):
-            if toktype == gguf.TokenType.CONTROL:
-                return tok
-            if tok.startswith("##"):
-                return tok[2:]
-            return "\u2581" + tok
-        assert len(tokens) == len(toktypes)
-        tokens = list(map(phantom, tokens, toktypes))
-
-        # add vocab to gguf
-        self.gguf_writer.add_tokenizer_model("bert")
-        self.gguf_writer.add_tokenizer_pre(tokpre)
-        self.gguf_writer.add_token_list(tokens)
-        self.gguf_writer.add_token_types(toktypes)
-
-        # handle special tokens
-        special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
-        special_vocab.add_to_gguf(self.gguf_writer)
-
-    @classmethod
-    def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None:
-        name, gen = item
-
-        if name.startswith("bert."):
-            name = name[5:]
-
-        if name.endswith(".gamma"):
-            name = name[:-6] + ".weight"
-
-        if name.endswith(".beta"):
-            name = name[:-5] + ".bias"
-
-        # we are only using BERT for embeddings so we don't need the pooling layer
-        if name in ("embeddings.position_ids", "pooler.dense.weight", "pooler.dense.bias"):
-            return None
-
-        if name.startswith("cls.predictions"):
-            return None
-
-        if name.startswith("cls.seq_relationship"):
-            return None
-
-        return super().filter_tensors((name, gen))
-
-    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
-        if self.cls_out_labels:
-            # For BertForSequenceClassification (direct projection layer)
-            if name == "classifier.weight":
-                name = "classifier.out_proj.weight"
-
-            if name == "classifier.bias":
-                name = "classifier.out_proj.bias"
-
-        yield from super().modify_tensors(data_torch, name, bid)
-
-    def _xlmroberta_tokenizer_init(self) -> None:
-        # we need the pad_token_id to know how to chop down position_embd matrix
-        if (pad_token_id := self.hparams.get("pad_token_id")) is not None:
-            self._position_offset = 1 + pad_token_id
-            if "max_position_embeddings" in self.hparams:
-                self.hparams["max_position_embeddings"] -= self._position_offset
-        else:
-            self._position_offset = None
-
-    def _xlmroberta_set_vocab(self) -> None:
-        # to avoid TypeError: Descriptors cannot be created directly
-        # exception when importing sentencepiece_model_pb2
-        os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"
-        from sentencepiece import SentencePieceProcessor
-        from sentencepiece import sentencepiece_model_pb2 as model
-
-        tokenizer_path = self.dir_model / 'sentencepiece.bpe.model'
-
-        tokenizer_json = {}
-        tokenizer_config_json = {}
-        if not tokenizer_path.is_file():
-            tokenizer_path = self.dir_model / 'tokenizer.json'
-            tokenizer_config_path = self.dir_model / 'tokenizer_config.json'
-
-            if not tokenizer_path.is_file():
-                raise FileNotFoundError(f"File not found: {tokenizer_path}")
-
-            from base64 import b64decode
-            from transformers import AutoTokenizer
-            tokenizer = AutoTokenizer.from_pretrained(self.dir_model)
-
-            with open(tokenizer_path, "r", encoding="utf-8") as fp:
-                tokenizer_json = json.load(fp)
-
-            if tokenizer_config_path.is_file():
-                with open(tokenizer_config_path, "r", encoding="utf-8") as fp:
-                    tokenizer_config_json = json.load(fp)
-
-            add_prefix = tokenizer.add_prefix_space  # ty: ignore[unresolved-attribute]
-            remove_whitespaces = tokenizer.clean_up_tokenization_spaces  # ty: ignore[unresolved-attribute]
-            precompiled_charsmap = b64decode(tokenizer_json["normalizer"]["precompiled_charsmap"])
-
-            vocab_size = max(self.hparams.get("vocab_size", 0), tokenizer.vocab_size)  # ty: ignore[unresolved-attribute]
-        else:
-            sentencepiece_model = model.ModelProto()  # pyright: ignore[reportAttributeAccessIssue] # ty: ignore[unresolved-attribute]
-            sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read())
-            assert sentencepiece_model.trainer_spec.model_type == 1  # UNIGRAM
-
-            add_prefix = sentencepiece_model.normalizer_spec.add_dummy_prefix
-            remove_whitespaces = sentencepiece_model.normalizer_spec.remove_extra_whitespaces
-            precompiled_charsmap = sentencepiece_model.normalizer_spec.precompiled_charsmap
-
-            tokenizer = SentencePieceProcessor()
-            tokenizer.LoadFromFile(str(tokenizer_path))
-
-            vocab_size = max(self.hparams.get("vocab_size", 0), tokenizer.vocab_size())
-
-        tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
-        scores: list[float] = [-10000.0] * vocab_size
-        toktypes: list[int] = [SentencePieceTokenTypes.UNUSED] * vocab_size
-
-        if isinstance(tokenizer, SentencePieceProcessor):
-            for token_id in range(tokenizer.vocab_size()):
-                piece = tokenizer.IdToPiece(token_id)
-                text = piece.encode("utf-8")
-                score = tokenizer.GetScore(token_id)
-
-                toktype = SentencePieceTokenTypes.NORMAL
-                if tokenizer.IsUnknown(token_id):
-                    toktype = SentencePieceTokenTypes.UNKNOWN
-                elif tokenizer.IsControl(token_id):
-                    toktype = SentencePieceTokenTypes.CONTROL
-                elif tokenizer.IsUnused(token_id):
-                    toktype = SentencePieceTokenTypes.UNUSED
-                elif tokenizer.IsByte(token_id):
-                    toktype = SentencePieceTokenTypes.BYTE
-
-                tokens[token_id] = text
-                scores[token_id] = score
-                toktypes[token_id] = toktype
-        else:
-            added_vocab = tokenizer.get_added_vocab()  # ty: ignore[unresolved-attribute]
-            unk_token = tokenizer_config_json.get("unk_token")
-            unk_token_id = added_vocab.get(unk_token, tokenizer_json["model"].get("unk_id", 3))  # ty: ignore[no-matching-overload]
-
-            for token_id in range(tokenizer.vocab_size):  # ty: ignore[unresolved-attribute]
-                piece = tokenizer._convert_id_to_token(token_id)  # ty: ignore[unresolved-attribute]
-                if (piece := tokenizer._convert_id_to_token(token_id)) is not None:  # ty: ignore[unresolved-attribute]
-                    text = piece.encode("utf-8")
-                    score = tokenizer_json["model"]["vocab"][token_id][1]
-
-                    toktype = SentencePieceTokenTypes.NORMAL
-                    if token_id == unk_token_id:
-                        toktype = SentencePieceTokenTypes.UNKNOWN
-                    elif token_id in tokenizer.all_special_ids:  # ty: ignore[unresolved-attribute]
-                        toktype = SentencePieceTokenTypes.CONTROL
-                    elif token_id in added_vocab.values():
-                        toktype = SentencePieceTokenTypes.USER_DEFINED
-                    # No reliable way to detect this, but jina doesn't have any
-                    # elif tokenizer.IsByte(token_id):
-                    #     toktype = SentencePieceTokenTypes.BYTE
-
-                    tokens[token_id] = text
-                    scores[token_id] = score
-                    toktypes[token_id] = toktype
-
-        if isinstance(tokenizer, SentencePieceProcessor):
-            # realign tokens (see HF tokenizer code)
-            tokens = [b'<s>', b'<pad>', b'</s>', b'<unk>'] + tokens[3:-1]
-            scores = [0.0, 0.0, 0.0, 0.0] + scores[3:-1]
-            toktypes = [
-                SentencePieceTokenTypes.CONTROL,
-                SentencePieceTokenTypes.CONTROL,
-                SentencePieceTokenTypes.CONTROL,
-                SentencePieceTokenTypes.UNKNOWN,
-            ] + toktypes[3:-1]
-
-            if self.model_arch == gguf.MODEL_ARCH.NOMIC_BERT_MOE:
-                # Add mask token missing from sentencepiece.bpe.model
-                tokens[250001] = b'<mask>'
-                scores[250001] = 0.0
-                toktypes[250001] = SentencePieceTokenTypes.CONTROL
-
-        self.gguf_writer.add_tokenizer_model("t5")
-        self.gguf_writer.add_tokenizer_pre("default")
-        self.gguf_writer.add_token_list(tokens)
-        self.gguf_writer.add_token_scores(scores)
-        self.gguf_writer.add_token_types(toktypes)
-        self.gguf_writer.add_add_space_prefix(add_prefix)
-        self.gguf_writer.add_token_type_count(self.hparams.get("type_vocab_size", 1))
-        self.gguf_writer.add_remove_extra_whitespaces(remove_whitespaces)
-        if precompiled_charsmap:
-            self.gguf_writer.add_precompiled_charsmap(precompiled_charsmap)
-
-        special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
-        special_vocab.add_to_gguf(self.gguf_writer)
-
-
-@ModelBase.register("DistilBertModel", "DistilBertForMaskedLM", "DistilBertForSequenceClassification")
-class DistilBertModel(BertModel):
-    model_arch = gguf.MODEL_ARCH.BERT
-
-    def set_gguf_parameters(self):
-        self.gguf_writer.add_layer_norm_eps(1e-12)
-        logger.info("gguf: layer norm epsilon = 1e-12")
-        super().set_gguf_parameters()
-
-    @classmethod
-    def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None:
-        name, gen = item
-
-        if name.startswith("distilbert."):
-            name = name[11:]
-
-        # These layers act as MLM head, so we don't need them
-        if name.startswith("vocab_"):
-            return None
-
-        return super().filter_tensors((name, gen))
-
-
-@ModelBase.register("RobertaModel", "RobertaForSequenceClassification")
-class RobertaModel(BertModel):
-    model_arch = gguf.MODEL_ARCH.BERT
-
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-
-        # we need the pad_token_id to know how to chop down position_embd matrix
-        if (pad_token_id := self.hparams.get("pad_token_id")) is not None:
-            self._position_offset = 1 + pad_token_id
-            if "max_position_embeddings" in self.hparams:
-                self.hparams["max_position_embeddings"] -= self._position_offset
-        else:
-            self._position_offset = None
-
-    def set_vocab(self):
-        """Support BPE tokenizers for roberta models"""
-        bpe_tok_path = self.dir_model / "tokenizer.json"
-        if bpe_tok_path.exists():
-            self._set_vocab_gpt2()
-
-            # we need this to validate the size of the token_type embeddings
-            # though currently we are passing all zeros to the token_type embeddings
-            # "Sequence A" or "Sequence B"
-            self.gguf_writer.add_token_type_count(self.hparams.get("type_vocab_size", 1))
-
-        else:
-            return super().set_vocab()
-
-    @classmethod
-    def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None:
-        name, gen = item
-
-        # if name starts with "roberta.", remove the prefix
-        # e.g. https://huggingface.co/BAAI/bge-reranker-v2-m3/tree/main
-        if name.startswith("roberta."):
-            name = name[8:]
-
-        return super().filter_tensors((name, gen))
-
-    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
-        # position embeddings start at pad_token_id + 1, so just chop down the weight tensor
-        if name == "embeddings.position_embeddings.weight":
-            if self._position_offset is not None:
-                data_torch = data_torch[self._position_offset:,:]
-
-        yield from super().modify_tensors(data_torch, name, bid)
-
-
-@ModelBase.register("NomicBertModel")
-class NomicBertModel(BertModel):
-    model_arch = gguf.MODEL_ARCH.BERT
-
-    def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, **kwargs: Any):
-        hparams = kwargs.pop("hparams", None)
-        if hparams is None:
-            hparams = ModelBase.load_hparams(dir_model, False)
-
-        self.is_moe = bool(hparams.get("moe_every_n_layers"))
-        self.model_arch = gguf.MODEL_ARCH.NOMIC_BERT_MOE if self.is_moe else gguf.MODEL_ARCH.NOMIC_BERT
-
-        super().__init__(dir_model, ftype, fname_out, hparams=hparams, **kwargs)
-
-        self._tokenizer_is_xlmroberta = self._is_tokenizer_xlmroberta()
-        if self._tokenizer_is_xlmroberta:
-            self._xlmroberta_tokenizer_init()
-
-        npos, mtp = self.hparams["n_positions"], self.hparams.get("max_trained_positions", 2048)
-        if npos == 8192 and mtp == 2048:
-            self.hparams["n_positions"] = 2048  # nomic-embed-text v1 and v1.5 are trained for 2048 tokens.
-        elif npos == 2048 and mtp == 2048:
-            self.hparams["n_positions"] = 512   # nomic-embed-text-v2-moe is trained for 512 tokens.
-        else:
-            raise ValueError(f"unrecognized parameters: n_positions={npos}, max_trained_positions={mtp}")
-
-        assert self.hparams["activation_function"] == "gelu" if self.is_moe else "swiglu"
-
-        # this doesn't do anything in the HF version
-        assert self.hparams["causal"] is False
-        # no bias tensors unless MoE
-        assert self.hparams["qkv_proj_bias"] == self.is_moe
-        assert self.hparams["mlp_fc1_bias"]  == self.is_moe
-        assert self.hparams["mlp_fc2_bias"]  == self.is_moe
-
-        # norm at end of layer
-        assert self.hparams["prenorm"] is False
-        # standard RoPE
-        assert self.hparams["rotary_emb_fraction"] == 1.0
-        assert self.hparams["rotary_emb_interleaved"] is False
-        assert self.hparams["rotary_emb_scale_base"] is None
-
-    def set_vocab(self) -> None:
-        if self._tokenizer_is_xlmroberta:
-            return self._xlmroberta_set_vocab()
-        return super().set_vocab()
-
-    @classmethod
-    def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None:
-        name, gen = item
-
-        # If the tensor is an experts bias tensor, skip it.
-        if "mlp.experts.bias" in name:
-            return None
-
-        return super().filter_tensors(item)
-
-    def modify_tensors(self, data_torch: torch.Tensor, name: str, bid: int | None) -> Iterable[tuple[str, torch.Tensor]]:
-        n_experts = self.find_hparam(["num_local_experts", "num_experts"])
-        if "mlp.experts.mlp.w1" in name:
-            data_torch = data_torch.view(n_experts, self.hparams["n_inner"], self.hparams["n_embd"])
-            name += ".weight"
-
-        if "mlp.experts.mlp.w2" in name:
-            data_torch = data_torch.view(n_experts, self.hparams["n_inner"], self.hparams["n_embd"])
-            data_torch = data_torch.transpose(1, 2)
-            name += ".weight"
-
-        yield from super().modify_tensors(data_torch, name, bid)
-
-    def set_gguf_parameters(self):
-        super().set_gguf_parameters()
-        if self.is_moe:
-            self.gguf_writer.add_moe_every_n_layers(self.hparams["moe_every_n_layers"])
-            self.gguf_writer.add_expert_used_count(self.hparams["moe_top_k"])
-
-    def _is_tokenizer_xlmroberta(self) -> bool:
-        with open(self.dir_model / "tokenizer.json") as f:
-            tokenizer_json = json.load(f)
-        toktyp = tokenizer_json["model"]["type"]
-        if toktyp == "Unigram":
-            return True
-        if toktyp == "WordPiece":
-            return False
-        raise ValueError(f"unknown tokenizer: {toktyp}")
-
-
-@ModelBase.register("NeoBERT", "NeoBERTLMHead", "NeoBERTForSequenceClassification")
-class NeoBert(BertModel):
-    model_arch = gguf.MODEL_ARCH.NEO_BERT
-
-    def set_gguf_parameters(self):
-        super().set_gguf_parameters()
-
-        # NeoBERT uses 2/3 of the intermediate size as feed forward length
-        self.gguf_writer.add_feed_forward_length(int(2 * self.hparams["intermediate_size"] / 3))
-        self.gguf_writer.add_rope_freq_base(10000.0)  # default value for NeoBERT
-        self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
-
-        f_rms_eps = self.hparams.get("norm_eps", 1e-6)  # default value for NeoBERT
-        self.gguf_writer.add_layer_norm_rms_eps(f_rms_eps)
-        logger.info(f"gguf: rms norm epsilon = {f_rms_eps}")
-
-        self.gguf_writer.add_pooling_type(gguf.PoolingType.CLS) # https://huggingface.co/chandar-lab/NeoBERT#how-to-use
-
-    @classmethod
-    def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None:
-        name, gen = item
-
-        if name.startswith("decoder."):
-            return None
-
-        if name.startswith("model."):
-            name = name[6:]
-
-        return super().filter_tensors((name, gen))
-
-
-@ModelBase.register("EuroBertModel", "JinaEmbeddingsV5Model")
-class EuroBertModel(TextModel):
-    model_arch = gguf.MODEL_ARCH.EUROBERT
-
-    def set_vocab(self):
-        self.gguf_writer.add_add_bos_token(False)
-        self._set_vocab_gpt2()
-
-    def set_gguf_parameters(self):
-        super().set_gguf_parameters()
-
-        # EuroBert is bidirectional (encoder)
-        self.gguf_writer.add_causal_attention(False)
-
-        self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
-
-        self._try_set_pooling_type()
-
-    @classmethod
-    def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None:
-        name, gen = item
-
-        if name.startswith("model."):
-            name = name[6:]
-
-        return super().filter_tensors((name, gen))
-
-
-@ModelBase.register("XLMRobertaModel", "XLMRobertaForSequenceClassification")
-class XLMRobertaModel(BertModel):
-    model_arch = gguf.MODEL_ARCH.BERT
-    _lora_files = {}
-    _lora_names = []
-
-    def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, **kwargs: Any):
-        hparams = kwargs.pop("hparams", None)
-        if hparams is None:
-            hparams = ModelBase.load_hparams(dir_model, False)
-
-        if lora_names := hparams.get("lora_adaptations"):
-            self._lora_names = lora_names
-            self.model_arch = gguf.MODEL_ARCH.JINA_BERT_V3
-
-        super().__init__(dir_model, ftype, fname_out, hparams=hparams, **kwargs)
-        self._xlmroberta_tokenizer_init()
-
-    def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
-        if self._lora_names:
-            for name in self._lora_names:
-                fname = self.add_prefix_to_filename(self.fname_out, f"lora-{name}-")
-                self._lora_files[name] = gguf.GGUFWriter(fname, arch=gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess, use_temp_file=self.use_temp_file, dry_run=self.dry_run)
-
-        return super().generate_extra_tensors()
-
-    def set_type(self):
-        for lora_writer in self._lora_files.values():
-            lora_writer.add_type(gguf.GGUFType.ADAPTER)
-            lora_writer.add_string(gguf.Keys.Adapter.TYPE, "lora")
-        super().set_type()
-
-    def set_vocab(self):
-        self._xlmroberta_set_vocab()
-
-    @classmethod
-    def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None:
-        name, gen = item
-
-        # if name starts with "roberta.", remove the prefix
-        # e.g. https://huggingface.co/BAAI/bge-reranker-v2-m3/tree/main
-        if name.startswith("roberta."):
-            name = name[8:]
-
-        # jina-embeddings-v3
-        if ".parametrizations." in name:
-            name = name.replace(".parametrizations.", ".")
-            if name.endswith(".original"):
-                name = name[:-9]
-
-        return super().filter_tensors((name, gen))
-
-    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
-        # position embeddings start at pad_token_id + 1, so just chop down the weight tensor
-        if name == "embeddings.position_embeddings.weight":
-            if self._position_offset is not None:
-                data_torch = data_torch[self._position_offset:,:]
-
-        if name.endswith(".0.lora_A") or name.endswith(".0.lora_B"):
-            if name.startswith("pooler.dense"):
-                return
-
-            num_loras = data_torch.size(0)
-            assert num_loras == len(self._lora_names)
-
-            # Split out each LoRA in their own GGUF
-            for i, lora_writer in enumerate(self._lora_files.values()):
-                new_name = self.map_tensor_name(name[:-9]) + name[-7:].lower()
-                data = data_torch[i, :, :]
-                # Transpose/flip token_embd/types into correct shape
-                if new_name == "token_embd.weight.lora_b":
-                    data = data.T
-                elif new_name.startswith("token_types.weight."):
-                    new_name = new_name[:-1] + ("a" if new_name[-1:] == "b" else "b")
-                lora_writer.add_tensor(new_name, data.float().numpy(), raw_dtype=gguf.GGMLQuantizationType.F32)
-
-            return
-
-        yield from super().modify_tensors(data_torch, name, bid)
-
-    def set_gguf_parameters(self):
-        super().set_gguf_parameters()
-
-        # jina-embeddings-v3
-        lora_alpha = self.hparams.get("lora_alpha")
-        if lora_prompt_prefixes := self.hparams.get("task_instructions"):
-            assert self._lora_files and all(lora_name in lora_prompt_prefixes for lora_name in self._lora_files.keys())
-        for lora_name, lora_writer in self._lora_files.items():
-            lora_writer.add_float32(gguf.Keys.Adapter.LORA_ALPHA, lora_alpha if lora_alpha is not None else 1.0)
-            lora_writer.add_string(gguf.Keys.Adapter.LORA_TASK_NAME, lora_name)
-            if lora_prompt_prefixes:
-                lora_writer.add_string(gguf.Keys.Adapter.LORA_PROMPT_PREFIX, lora_prompt_prefixes[lora_name])
-
-    def write(self):
-        super().write()
-        for lora_writer in self._lora_files.values():
-            lora_writer.write_header_to_file()
-            lora_writer.write_kv_data_to_file()
-            lora_writer.write_tensors_to_file(progress=True)
-            lora_writer.close()
-
-
-@ModelBase.register("JinaBertModel", "JinaBertForMaskedLM")
-class JinaBertV2Model(BertModel):
-    model_arch = gguf.MODEL_ARCH.JINA_BERT_V2
-
-    def set_vocab(self):
-        tokenizer_class = 'BertTokenizer'
-        with open(self.dir_model / "tokenizer_config.json", "r", encoding="utf-8") as f:
-            tokenizer_class = json.load(f)['tokenizer_class']
-
-        if tokenizer_class == 'BertTokenizer':
-            super().set_vocab()
-        elif tokenizer_class == 'RobertaTokenizer':
-            self._set_vocab_gpt2()
-            self.gguf_writer.add_token_type_count(2)
-        else:
-            raise NotImplementedError(f'Tokenizer {tokenizer_class} is not supported for JinaBertModel')
-
-
-@ModelBase.register("ModernBertModel", "ModernBertForMaskedLM", "ModernBertForSequenceClassification")
-class ModernBertModel(BertModel):
-    model_arch = gguf.MODEL_ARCH.MODERN_BERT
-
-    def set_vocab(self):
-        self.gguf_writer.add_add_bos_token(True)
-        self.gguf_writer.add_add_eos_token(True)
-        self.gguf_writer.add_add_sep_token(True)
-        self._set_vocab_gpt2()
-
-    def set_gguf_parameters(self):
-        super().set_gguf_parameters()
-        self.gguf_writer.add_sliding_window(self.hparams["local_attention"])
-        if (sliding_window_pattern := self.hparams.get("global_attn_every_n_layers")) is not None:
-            self.gguf_writer.add_sliding_window_pattern(sliding_window_pattern)
-        self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
-        self.gguf_writer.add_vocab_size(self.hparams["vocab_size"])
-
-    @classmethod
-    def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None:
-        name, gen = item
-
-        if name.startswith("model."):
-            name = name[6:]
-
-        return super().filter_tensors((name, gen))
-
-    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
-        if self.cls_out_labels:
-            # For BertForSequenceClassification (direct projection layer)
-            if name == "classifier.weight":
-                name = "classifier.out_proj.weight"
-
-            if name == "classifier.bias":
-                name = "classifier.out_proj.bias"
-
-        yield from super().modify_tensors(data_torch, name, bid)
--- a/conversion/bitnet.py
+++ b/conversion/bitnet.py
@@ -1,49 +0,0 @@
-from __future__ import annotations
-
-from typing import Iterable, TYPE_CHECKING
-
-if TYPE_CHECKING:
-    from torch import Tensor
-
-from .base import ModelBase, TextModel, gguf
-
-
-@ModelBase.register("BitnetForCausalLM")
-class BitnetModel(TextModel):
-    model_arch = gguf.MODEL_ARCH.BITNET
-
-    def set_vocab(self):
-        self._set_vocab_sentencepiece()
-
-    def set_gguf_parameters(self):
-        super().set_gguf_parameters()
-        self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
-        self.gguf_writer.add_rope_scaling_factor(1.0)
-
-    def weight_quant(self, weight: Tensor) -> Tensor:
-        dtype = weight.dtype
-        weight = weight.float()
-        scale = weight.abs().mean().clamp(min=1e-5)
-        iscale = 1 / scale
-        # TODO: multiply by the scale directly instead of inverting it twice
-        # (this is also unnecessarily doubly inverted upstream)
-        # ref: https://huggingface.co/1bitLLM/bitnet_b1_58-3B/blob/af89e318d78a70802061246bf037199d2fb97020/utils_quant.py#L10
-        result = (weight * iscale).round().clamp(-1, 1) / iscale
-        return result.type(dtype)
-
-    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
-        new_name = self.map_tensor_name(name)
-
-        if any(self.match_model_tensor_name(new_name, key, bid) for key in [
-            gguf.MODEL_TENSOR.ATTN_Q,
-            gguf.MODEL_TENSOR.ATTN_K,
-            gguf.MODEL_TENSOR.ATTN_V,
-            gguf.MODEL_TENSOR.ATTN_OUT,
-            gguf.MODEL_TENSOR.FFN_UP,
-            gguf.MODEL_TENSOR.FFN_DOWN,
-            gguf.MODEL_TENSOR.FFN_GATE,
-        ]):
-            # transform weight into 1/0/-1 (in fp32)
-            data_torch = self.weight_quant(data_torch)
-
-        yield from super().modify_tensors(data_torch, name, bid)
--- a/conversion/bloom.py
+++ b/conversion/bloom.py
@@ -1,67 +0,0 @@
-from __future__ import annotations
-
-import re
-
-from typing import Iterable, TYPE_CHECKING
-
-import torch
-
-if TYPE_CHECKING:
-    from torch import Tensor
-
-from .base import ModelBase, TextModel, gguf, logger
-
-
-@ModelBase.register("BloomForCausalLM", "BloomModel")
-class BloomModel(TextModel):
-    model_arch = gguf.MODEL_ARCH.BLOOM
-
-    def set_gguf_parameters(self):
-        n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed"))
-        n_head = self.hparams.get("n_head", self.hparams.get("num_attention_heads"))
-        assert n_head is not None
-        assert n_embed is not None
-        self.gguf_writer.add_context_length(self.hparams.get("seq_length", n_embed))
-        self.gguf_writer.add_embedding_length(n_embed)
-        self.gguf_writer.add_feed_forward_length(4 * n_embed)
-        self.gguf_writer.add_block_count(self.block_count)
-        self.gguf_writer.add_head_count(n_head)
-        self.gguf_writer.add_head_count_kv(n_head)
-        self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"])
-        self.gguf_writer.add_file_type(self.ftype)
-
-    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
-        n_head = self.hparams.get("n_head", self.hparams.get("num_attention_heads"))
-        n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed"))
-        assert n_head is not None
-        assert n_embed is not None
-
-        name = re.sub(r'transformer\.', '', name)
-
-        if re.match(r"h\.\d+\.self_attention\.query_key_value\.weight", name):
-            # Map bloom-style qkv_linear to gpt-style qkv_linear
-            # bloom: https://github.com/huggingface/transformers/blob/main/src/transformers/models/bloom/modeling_bloom.py#L238-L252  # noqa
-            # gpt-2: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt2/modeling_gpt2.py#L312  # noqa
-            qkv_weights = data_torch.reshape((n_head, 3, n_embed // n_head, n_embed))
-            data_torch = torch.cat(
-                (
-                    qkv_weights[:, 0, :, :].reshape((-1, n_embed)),
-                    qkv_weights[:, 1, :, :].reshape((-1, n_embed)),
-                    qkv_weights[:, 2, :, :].reshape((-1, n_embed)),
-                ),
-                dim=0,
-            )
-            logger.info("re-format attention.linear_qkv.weight")
-        elif re.match(r"h\.\d+\.self_attention\.query_key_value\.bias", name):
-            qkv_bias = data_torch.reshape((n_head, 3, n_embed // n_head))
-            data_torch = torch.cat(
-                (
-                    qkv_bias[:, 0, :].reshape((n_embed,)),
-                    qkv_bias[:, 1, :].reshape((n_embed,)),
-                    qkv_bias[:, 2, :].reshape((n_embed,)),
-                ),
-                dim=0,
-            )
-            logger.info("re-format attention.linear_qkv.bias")
-
-        yield from super().modify_tensors(data_torch, name, bid)
--- a/conversion/chameleon.py
+++ b/conversion/chameleon.py
@@ -1,58 +0,0 @@
-from __future__ import annotations
-
-from typing import Callable, Iterable, TYPE_CHECKING
-
-if TYPE_CHECKING:
-    from torch import Tensor
-
-from .base import ModelBase, TextModel, gguf
-
-from .llama import LlamaModel
-
-
-@ModelBase.register("ChameleonForConditionalGeneration")
-@ModelBase.register("ChameleonForCausalLM")  # obsolete
-class ChameleonModel(TextModel):
-    model_arch = gguf.MODEL_ARCH.CHAMELEON
-
-    def set_gguf_parameters(self):
-        super().set_gguf_parameters()
-        self.gguf_writer.add_swin_norm(self.hparams.get("swin_norm", False))
-
-    def set_vocab(self):
-        self._set_vocab_gpt2()
-
-    @classmethod
-    def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None:
-        name, gen = item
-
-        # ignore image tokenizer for now
-        # TODO: image support for Chameleon
-        if name.startswith("model.vqmodel"):
-            return None
-
-        return super().filter_tensors(item)
-
-    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
-        n_head = self.hparams["num_attention_heads"]
-        n_kv_head = self.hparams.get("num_key_value_heads")
-        hidden_dim = self.hparams.get("hidden_size")
-
-        if name.endswith(("q_proj.weight", "q_proj.bias")):
-            data_torch = LlamaModel.permute(data_torch, n_head, n_head)
-        if name.endswith(("k_proj.weight", "k_proj.bias")):
-            data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head)
-        if name.endswith(("q_norm.weight", "q_norm.bias")):
-            data_torch = ChameleonModel._reverse_hf_permute(data_torch, n_head, hidden_dim)
-        if name.endswith(("k_norm.weight", "k_norm.bias")):
-            data_torch = ChameleonModel._reverse_hf_permute(data_torch, n_kv_head, hidden_dim)
-
-        yield from super().modify_tensors(data_torch, name, bid)
-
-    # see: https://github.com/huggingface/transformers/blob/72fb02c47dbbe1999ae105319f24631cad6e2e00/src/transformers/models/chameleon/convert_chameleon_weights_to_hf.py#L176-L203
-    @staticmethod
-    def _reverse_hf_permute(data_torch, n_heads, hidden_dim):
-        head_dim = hidden_dim // n_heads
-        data_torch = data_torch[0].view(2, head_dim // 2).t().reshape(1, -1)
-        data_torch = data_torch.repeat_interleave(n_heads, 0)
-        return data_torch
--- a/conversion/chatglm.py
+++ b/conversion/chatglm.py
@@ -1,167 +0,0 @@
-from __future__ import annotations
-
-from typing import Callable, TYPE_CHECKING
-
-if TYPE_CHECKING:
-    from torch import Tensor
-
-from .base import ModelBase, SentencePieceTokenTypes, TextModel, gguf
-
-
-@ModelBase.register("GlmForCausalLM", "ChatGLMModel", "ChatGLMForConditionalGeneration")
-class ChatGLMModel(TextModel):
-    model_arch = gguf.MODEL_ARCH.CHATGLM
-
-    def set_vocab_chatglm3(self):
-        dir_model = self.dir_model
-        hparams = self.hparams
-        tokens: list[bytes] = []
-        toktypes: list[int] = []
-        scores: list[float] = []
-
-        from transformers import AutoTokenizer
-        tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True)
-        vocab_size = hparams.get("padded_vocab_size", len(tokenizer.get_vocab()))  # ty: ignore[unresolved-attribute]
-        assert max(tokenizer.get_vocab().values()) < vocab_size  # ty: ignore[unresolved-attribute]
-        role_special_tokens = ["<|system|>", "<|user|>", "<|assistant|>", "<|observation|>"]
-        special_tokens = ["[MASK]", "[gMASK]", "[sMASK]", "sop", "eop"] + role_special_tokens
-        for token_id in range(vocab_size):
-            piece = tokenizer._convert_id_to_token(token_id)  # ty: ignore[unresolved-attribute]
-            if token_id == 0:
-                piece = "<unk>"
-            elif token_id == 1:
-                piece = "<bos>"
-            elif token_id == 2:
-                piece = "<eos>"
-
-            text = piece.encode("utf-8")  # ty: ignore[unresolved-attribute]
-            score = 0.0
-            # Referencing the tokenizer Python implementation(https://huggingface.co/THUDM/chatglm3-6b/blob/main/tokenization_chatglm.py),
-            # it is only valid if it is less than tokenizer.tokenizer.sp_model.vocab_size()
-            if len(piece) != 0 and token_id < tokenizer.tokenizer.sp_model.vocab_size():  # ty: ignore[unresolved-attribute, invalid-argument-type]
-                score = tokenizer.tokenizer.sp_model.get_score(token_id)  # ty: ignore[unresolved-attribute]
-
-            if token_id >= tokenizer.tokenizer.sp_model.vocab_size():  # ty: ignore[unresolved-attribute]
-                if piece in special_tokens:
-                    toktype = SentencePieceTokenTypes.CONTROL
-                elif len(piece) == 0:  # ty: ignore[invalid-argument-type]
-                    text = f"[PAD{token_id}]".encode("utf-8")
-                    toktype = SentencePieceTokenTypes.UNUSED
-                else:
-                    toktype = SentencePieceTokenTypes.USER_DEFINED
-                tokens.append(text)
-                scores.append(score)
-                toktypes.append(toktype)
-                continue
-
-            toktype = SentencePieceTokenTypes.NORMAL
-            if tokenizer.tokenizer.sp_model.is_unknown(token_id):  # ty: ignore[unresolved-attribute]
-                toktype = SentencePieceTokenTypes.UNKNOWN
-            elif tokenizer.tokenizer.sp_model.is_control(token_id):  # ty: ignore[unresolved-attribute]
-                toktype = SentencePieceTokenTypes.CONTROL
-            elif tokenizer.tokenizer.sp_model.is_unused(token_id):  # ty: ignore[unresolved-attribute]
-                toktype = SentencePieceTokenTypes.UNUSED
-            elif tokenizer.tokenizer.sp_model.is_byte(token_id):  # ty: ignore[unresolved-attribute]
-                toktype = SentencePieceTokenTypes.BYTE
-
-            tokens.append(text)
-            scores.append(score)
-            toktypes.append(toktype)
-
-        self.gguf_writer.add_tokenizer_model("llama")
-        # glm3 needs prefix and suffix formatted as:
-        # prompt = "[gMASK]sop<|user|>\n" + prompt + "<|assistant|>"
-        self.gguf_writer.add_tokenizer_pre("chatglm-spm")
-        self.gguf_writer.add_token_list(tokens)
-        self.gguf_writer.add_token_scores(scores)
-        self.gguf_writer.add_token_types(toktypes)
-
-        special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
-        special_vocab.add_to_gguf(self.gguf_writer)
-
-    @staticmethod
-    def token_bytes_to_string(b):
-        from transformers.models.gpt2.tokenization_gpt2 import bytes_to_unicode  # ty: ignore[unresolved-import]
-        byte_encoder = bytes_to_unicode()
-        return ''.join([byte_encoder[ord(char)] for char in b.decode('latin-1')])
-
-    @staticmethod
-    def bpe(mergeable_ranks: dict[bytes, int], token: bytes, max_rank: int | None = None) -> list[bytes]:
-        parts = [bytes([b]) for b in token]
-        while True:
-            min_idx = None
-            min_rank = None
-            for i, pair in enumerate(zip(parts[:-1], parts[1:])):
-                rank = mergeable_ranks.get(pair[0] + pair[1])
-                if rank is not None and (min_rank is None or rank < min_rank):
-                    min_idx = i
-                    min_rank = rank
-            if min_rank is None or (max_rank is not None and min_rank >= max_rank):
-                break
-            assert min_idx is not None
-            parts = parts[:min_idx] + [parts[min_idx] + parts[min_idx + 1]] + parts[min_idx + 2:]
-        return parts
-
-    def set_vocab(self):
-        if "THUDM/chatglm3-6b" in self.hparams.get("_name_or_path", ""):
-            self.set_vocab_chatglm3()
-            return
-
-        dir_model = self.dir_model
-        hparams = self.hparams
-        tokens: list[str] = []
-        toktypes: list[int] = []
-
-        from transformers import AutoTokenizer
-        tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True)
-        vocab_size = hparams.get("padded_vocab_size",hparams["vocab_size"])
-        assert max(tokenizer.get_vocab().values()) < vocab_size  # ty: ignore[unresolved-attribute]
-
-        tokens, toktypes, tokpre = self.get_vocab_base()
-        self.gguf_writer.add_tokenizer_model("gpt2")
-        self.gguf_writer.add_tokenizer_pre(tokpre)
-        self.gguf_writer.add_token_list(tokens)
-        self.gguf_writer.add_token_types(toktypes)
-        special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True)
-        # only add special tokens when they were not already loaded from config.json
-        special_vocab._set_special_token("eos", tokenizer.get_added_vocab()["<|endoftext|>"])  # ty: ignore[unresolved-attribute]
-        special_vocab._set_special_token("eot", tokenizer.get_added_vocab()["<|user|>"])  # ty: ignore[unresolved-attribute]
-        # this one is usually not in config.json anyway
-        special_vocab._set_special_token("unk", tokenizer.get_added_vocab()["<|endoftext|>"])  # ty: ignore[unresolved-attribute]
-        special_vocab.add_to_gguf(self.gguf_writer)
-
-    def set_gguf_parameters(self):
-        n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed"))
-        assert n_embed is not None
-        n_head = self.hparams.get("n_head", self.hparams.get("num_attention_heads"))
-        assert n_head is not None
-        n_head_kv = self.hparams.get("multi_query_group_num", self.hparams.get("num_key_value_heads", n_head))
-        self.gguf_writer.add_context_length(self.hparams.get("seq_length", n_embed))
-        self.gguf_writer.add_embedding_length(n_embed)
-        self.gguf_writer.add_feed_forward_length(self.hparams.get("ffn_hidden_size", self.hparams.get("intermediate_size", 4 * n_embed)))
-        self.gguf_writer.add_block_count(self.block_count)
-        self.gguf_writer.add_head_count(n_head)
-        self.gguf_writer.add_head_count_kv(n_head_kv)
-        self.gguf_writer.add_layer_norm_rms_eps(self.hparams.get("layernorm_epsilon",1e-5))
-        self.gguf_writer.add_file_type(self.ftype)
-        if "attention_dim" in self.hparams:
-            rope_dim = self.hparams["attention_dim"]
-        else:
-            rope_dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
-        self.gguf_writer.add_rope_dimension_count(int(rope_dim * self.hparams.get("partial_rotary_factor", 0.5)))
-        self.gguf_writer.add_add_bos_token(False)
-        rope_freq = 10000
-        if "rope_ratio" in self.hparams:
-            rope_freq = rope_freq * self.hparams["rope_ratio"]
-        self.gguf_writer.add_rope_freq_base(rope_freq)
-
-    @classmethod
-    def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None:
-        name, gen = item
-
-        if name.endswith(".rotary_pos_emb.inv_freq"):
-            return None
-
-        name = name.removeprefix("transformer.")
-
-        return super().filter_tensors((name, gen))
--- a/conversion/codeshell.py
+++ b/conversion/codeshell.py
@@ -1,21 +0,0 @@
-from __future__ import annotations
-
-from .base import ModelBase, TextModel, gguf
-
-
-@ModelBase.register("CodeShellForCausalLM")
-class CodeShellModel(TextModel):
-    model_arch = gguf.MODEL_ARCH.CODESHELL
-
-    def set_gguf_parameters(self):
-        self.gguf_writer.add_context_length(self.hparams["n_positions"])
-        self.gguf_writer.add_embedding_length(self.hparams["n_embd"])
-        self.gguf_writer.add_feed_forward_length(4 * self.hparams["n_embd"])
-        self.gguf_writer.add_block_count(self.block_count)
-        self.gguf_writer.add_head_count(self.hparams["n_head"])
-        self.gguf_writer.add_head_count_kv(self.hparams["num_query_groups"])
-        self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"])
-        self.gguf_writer.add_file_type(self.ftype)
-        self.gguf_writer.add_rope_freq_base(10000.0)
-        self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
-        self.gguf_writer.add_rope_scaling_factor(1.0)
--- a/conversion/cogvlm.py
+++ b/conversion/cogvlm.py
@@ -1,33 +0,0 @@
-from __future__ import annotations
-
-from typing import Callable, TYPE_CHECKING
-
-if TYPE_CHECKING:
-    from torch import Tensor
-
-from .base import MmprojModel, ModelBase, gguf
-
-from .llama import LlamaModel
-
-
-@ModelBase.register("CogVLMForCausalLM")
-class CogVLMVisionModel(MmprojModel):
-
-    def set_gguf_parameters(self):
-        super().set_gguf_parameters()
-        self.gguf_writer.add_vision_attention_layernorm_eps(self.hparams.get("layer_norm_eps", 1e-6))
-        self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.COGVLM)
-
-    @classmethod
-    def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None:
-        name, gen = item
-
-        if not name.startswith("model.vision."):
-            return None
-
-        return super().filter_tensors(item)
-
-
-@ModelBase.register("CogVLMForCausalLM")
-class CogVLMModel(LlamaModel):
-    model_arch = gguf.MODEL_ARCH.COGVLM
--- a/conversion/command_r.py
+++ b/conversion/command_r.py
@@ -1,57 +0,0 @@
-from __future__ import annotations
-
-from typing import Iterable, TYPE_CHECKING
-
-import torch
-
-if TYPE_CHECKING:
-    from torch import Tensor
-
-from .base import ModelBase, TextModel, gguf, logger
-
-
-@ModelBase.register("CohereForCausalLM")
-class CommandR2Model(TextModel):
-    model_arch = gguf.MODEL_ARCH.COMMAND_R
-
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-
-        # max_position_embeddings = 8192 in config.json but model was actually
-        # trained on 128k context length
-        # aya-23 models don't have model_max_length specified
-        self.hparams["max_position_embeddings"] = self.find_hparam(["model_max_length", "max_position_embeddings"])
-
-    def set_gguf_parameters(self):
-        super().set_gguf_parameters()
-        self.gguf_writer.add_logit_scale(self.hparams["logit_scale"])
-        self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
-
-
-@ModelBase.register("Cohere2ForCausalLM")
-class Cohere2Model(TextModel):
-    model_arch = gguf.MODEL_ARCH.COHERE2
-
-    def set_gguf_parameters(self):
-        super().set_gguf_parameters()
-
-        self.gguf_writer.add_logit_scale(self.hparams["logit_scale"])
-        self.gguf_writer.add_sliding_window(self.hparams["sliding_window"])
-        self.gguf_writer.add_vocab_size(self.hparams["vocab_size"])
-
-        rotary_pct = self.hparams["rotary_pct"]
-        hidden_size = self.hparams["hidden_size"]
-        num_attention_heads = self.hparams["num_attention_heads"]
-        self.gguf_writer.add_rope_dimension_count(int(rotary_pct * (hidden_size // num_attention_heads)))
-        self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
-
-    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
-        # Cohere2 runtime in llama.cpp expects no bias tensors;
-        # the actual weight only contains 0-value tensors as bias, we can skip them
-        if name.endswith(".bias"):
-            if torch.any(data_torch != 0):
-                raise ValueError(f"Bias tensor {name!r} is not zero.")
-            logger.debug(f"Skipping bias tensor {name!r} for Cohere2 conversion.")
-            return
-
-        yield from super().modify_tensors(data_torch, name, bid)
--- a/conversion/dbrx.py
+++ b/conversion/dbrx.py
@@ -1,75 +0,0 @@
-from __future__ import annotations
-
-from typing import Iterable, TYPE_CHECKING
-
-if TYPE_CHECKING:
-    from torch import Tensor
-
-from .base import ModelBase, TextModel, gguf, logger
-
-
-@ModelBase.register("DbrxForCausalLM")
-class DbrxModel(TextModel):
-    model_arch = gguf.MODEL_ARCH.DBRX
-
-    def set_gguf_parameters(self):
-        ffn_config = self.hparams["ffn_config"]
-        attn_config = self.hparams["attn_config"]
-        self.gguf_writer.add_block_count(self.block_count)
-
-        self.gguf_writer.add_context_length(self.hparams["max_seq_len"])
-        self.gguf_writer.add_embedding_length(self.hparams["d_model"])
-        self.gguf_writer.add_feed_forward_length(ffn_config["ffn_hidden_size"])
-
-        self.gguf_writer.add_head_count(self.hparams["n_heads"])
-        self.gguf_writer.add_head_count_kv(attn_config["kv_n_heads"])
-
-        self.gguf_writer.add_rope_freq_base(attn_config["rope_theta"])
-
-        self.gguf_writer.add_clamp_kqv(attn_config["clip_qkv"])
-
-        self.gguf_writer.add_expert_count(ffn_config["moe_num_experts"])
-        self.gguf_writer.add_expert_used_count(ffn_config["moe_top_k"])
-
-        self.gguf_writer.add_layer_norm_eps(1e-5)
-
-        self.gguf_writer.add_file_type(self.ftype)
-        logger.info(f"gguf: file type = {self.ftype}")
-
-    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
-        n_expert = self.hparams["ffn_config"]["moe_num_experts"]
-        n_ff = self.hparams["ffn_config"]["ffn_hidden_size"]
-        n_embd = self.hparams["d_model"]
-
-        # Specific behavior for experts tensors: suffix .weight, view as 3D and transpose
-        # original implementation expects (n_expert, n_ff, n_embd) for all experts weights
-        # But llama.cpp moe graph works differently
-        # AND the dimensions in ggml are typically in the reverse order of the pytorch dimensions
-        # so (n_expert, n_ff, n_embd) in pytorch is {n_embd, n_ff, n_expert} in ggml_tensor
-        exp_tensor_names = {"ffn.experts.mlp.w1": None,       # LLM_TENSOR_FFN_GATE_EXPS ggml_tensor->ne{n_embd, n_ff,   n_expert}
-                            "ffn.experts.mlp.w2": (0, 2, 1),  # LLM_TENSOR_FFN_DOWN_EXPS ggml_tensor->ne{n_ff,   n_embd, n_expert}
-                            "ffn.experts.mlp.v1": None}       # LLM_TENSOR_FFN_UP_EXPS   ggml_tensor->ne{n_embd, n_ff,   n_expert}
-        experts = False
-
-        for exp_tensor_name in exp_tensor_names.keys():
-            if name.find(exp_tensor_name) != -1 and name.find(".weight") == -1:
-                experts = True
-                data_torch = data_torch.view(n_expert, n_ff, n_embd)
-                if (permute_tensor := exp_tensor_names[exp_tensor_name]) is not None:
-                    data_torch = data_torch.permute(*permute_tensor)
-                break
-
-        # map tensor names
-        # In MoE models the ffn tensors are typically most of the model weights,
-        # and need to be quantizable. Quantize expects tensor names to be suffixed by .weight.
-        # Every other model has the weight names ending in .weight,
-        # let's assume that is the convention which is not the case for dbrx:
-        # https://huggingface.co/databricks/dbrx-instruct/blob/main/model.safetensors.index.json#L15
-        new_name = self.map_tensor_name(name if not experts else name + ".weight", try_suffixes=(".weight",))
-
-        yield from super().modify_tensors(data_torch, new_name, bid)
-
-    def tensor_force_quant(self, name: str, new_name: str, bid: int | None, n_dims: int) -> gguf.GGMLQuantizationType | bool:
-        del name, new_name, bid  # unused
-
-        return n_dims > 1
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Georgi Gerganov	35df147d80	cont : remove /api/tags	2026-04-20 15:45:42 +03:00
Georgi Gerganov	c1891fd6eb	server : remove /api endpoints	2026-04-20 15:34:18 +03:00