ggml-webgpu: improve i-quants mul_mat performance and speed up prefill (#24530)

* Improve prefill speeds for i-quants
* Fix #if defined() usage in preprocessor guards.
convert : fix lora base model arch retrieval (#24621)
2026-06-15 07:15:45 +02:00 · 2026-06-14 18:15:30 -07:00 · 2026-06-15 00:55:26 +02:00 · 2026-06-14 22:56:56 +02:00 · 2026-06-14 20:42:16 +02:00 · 2026-06-14 20:17:40 +02:00
36 changed files with 1078 additions and 367 deletions
--- a/.devops/cpu.Dockerfile
+++ b/.devops/cpu.Dockerfile
@@ -3,7 +3,7 @@ ARG BUILD_DATE=N/A
 ARG APP_VERSION=N/A
 ARG APP_REVISION=N/A

-FROM ubuntu:$UBUNTU_VERSION AS build
+FROM docker.io/ubuntu:$UBUNTU_VERSION AS build

 ARG TARGETARCH

@@ -37,7 +37,7 @@ RUN mkdir -p /app/full \
    && cp .devops/tools.sh /app/full/tools.sh

 ## Base image
-FROM ubuntu:$UBUNTU_VERSION AS base
+FROM docker.io/ubuntu:$UBUNTU_VERSION AS base

 ARG BUILD_DATE=N/A
 ARG APP_VERSION=N/A
--- a/.devops/cuda.Dockerfile
+++ b/.devops/cuda.Dockerfile
@@ -3,9 +3,9 @@ ARG UBUNTU_VERSION=24.04
 ARG CUDA_VERSION=12.8.1
 ARG GCC_VERSION=14
 # Target the CUDA build image
-ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
+ARG BASE_CUDA_DEV_CONTAINER=docker.io/nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}

-ARG BASE_CUDA_RUN_CONTAINER=nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}
+ARG BASE_CUDA_RUN_CONTAINER=docker.io/nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}

 ARG BUILD_DATE=N/A
 ARG APP_VERSION=N/A
--- a/.devops/intel.Dockerfile
+++ b/.devops/intel.Dockerfile
@@ -5,7 +5,7 @@ ARG APP_REVISION=N/A

 ## Build Image

-FROM intel/deep-learning-essentials:$ONEAPI_VERSION AS build
+FROM docker.io/intel/deep-learning-essentials:$ONEAPI_VERSION AS build

 ARG GGML_SYCL_F16=OFF
 ARG LEVEL_ZERO_VERSION=1.28.2
@@ -42,7 +42,7 @@ RUN mkdir -p /app/full \
    && cp requirements.txt /app/full \
    && cp .devops/tools.sh /app/full/tools.sh

-FROM intel/deep-learning-essentials:$ONEAPI_VERSION AS base
+FROM docker.io/intel/deep-learning-essentials:$ONEAPI_VERSION AS base

 ARG BUILD_DATE=N/A
 ARG APP_VERSION=N/A
--- a/.devops/llama-cli-cann.Dockerfile
+++ b/.devops/llama-cli-cann.Dockerfile
@@ -3,7 +3,7 @@ ARG BUILD_DATE=N/A
 ARG APP_VERSION=N/A
 ARG APP_REVISION=N/A

-FROM ascendai/cann:$ASCEND_VERSION AS build
+FROM docker.io/ascendai/cann:$ASCEND_VERSION AS build

 WORKDIR /app

@@ -30,7 +30,7 @@ RUN echo "Building with static libs" && \
    cmake --build build --config Release --target llama-completion

 # TODO: use image with NNRT
-FROM ascendai/cann:$ASCEND_VERSION AS runtime
+FROM docker.io/ascendai/cann:$ASCEND_VERSION AS runtime

 ARG BUILD_DATE=N/A
 ARG APP_VERSION=N/A
--- a/.devops/musa.Dockerfile
+++ b/.devops/musa.Dockerfile
@@ -2,9 +2,9 @@ ARG UBUNTU_VERSION=22.04
 # This needs to generally match the container host's environment.
 ARG MUSA_VERSION=rc4.3.0
 # Target the MUSA build image
-ARG BASE_MUSA_DEV_CONTAINER=mthreads/musa:${MUSA_VERSION}-devel-ubuntu${UBUNTU_VERSION}-amd64
+ARG BASE_MUSA_DEV_CONTAINER=docker.io/mthreads/musa:${MUSA_VERSION}-devel-ubuntu${UBUNTU_VERSION}-amd64

-ARG BASE_MUSA_RUN_CONTAINER=mthreads/musa:${MUSA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}-amd64
+ARG BASE_MUSA_RUN_CONTAINER=docker.io/mthreads/musa:${MUSA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}-amd64

 ARG BUILD_DATE=N/A
 ARG APP_VERSION=N/A
--- a/.devops/openvino.Dockerfile
+++ b/.devops/openvino.Dockerfile
@@ -23,7 +23,7 @@ ARG APP_VERSION=N/A
 ARG APP_REVISION=N/A

 ## Build Image
-FROM ubuntu:${UBUNTU_VERSION} AS build
+FROM docker.io/ubuntu:${UBUNTU_VERSION} AS build

 # Pass proxy args to build stage
 ARG http_proxy
@@ -88,7 +88,7 @@ RUN mkdir -p /app/full \
    && cp .devops/tools.sh /app/full/tools.sh

 ## Base Runtime Image
-FROM ubuntu:${UBUNTU_VERSION} AS base
+FROM docker.io/ubuntu:${UBUNTU_VERSION} AS base

 # Pass proxy args to runtime stage
 ARG http_proxy
--- a/.devops/rocm.Dockerfile
+++ b/.devops/rocm.Dockerfile
@@ -5,7 +5,7 @@ ARG ROCM_VERSION=7.2.1
 ARG AMDGPU_VERSION=7.2.1

 # Target the ROCm build image
-ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete
+ARG BASE_ROCM_DEV_CONTAINER=docker.io/rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete

 ARG BUILD_DATE=N/A
 ARG APP_VERSION=N/A
--- a/.devops/s390x.Dockerfile
+++ b/.devops/s390x.Dockerfile
@@ -5,7 +5,7 @@ ARG APP_VERSION=N/A
 ARG APP_REVISION=N/A

 ### Build Llama.cpp stage
-FROM gcc:${GCC_VERSION} AS build
+FROM docker.io/gcc:${GCC_VERSION} AS build

 RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
    --mount=type=cache,target=/var/lib/apt/lists,sharing=locked \
@@ -55,7 +55,7 @@ COPY --from=build /opt/llama.cpp/conversion /llama.cpp/conversion


 ### Base image
-FROM ubuntu:${UBUNTU_VERSION} AS base
+FROM docker.io/ubuntu:${UBUNTU_VERSION} AS base

 ARG BUILD_DATE=N/A
 ARG APP_VERSION=N/A
--- a/.devops/vulkan.Dockerfile
+++ b/.devops/vulkan.Dockerfile
@@ -3,7 +3,7 @@ ARG BUILD_DATE=N/A
 ARG APP_VERSION=N/A
 ARG APP_REVISION=N/A

-FROM ubuntu:$UBUNTU_VERSION AS build
+FROM docker.io/ubuntu:$UBUNTU_VERSION AS build

 # Install build tools
 RUN apt update && apt install -y git build-essential cmake wget xz-utils
@@ -33,7 +33,7 @@ RUN mkdir -p /app/full \
    && cp .devops/tools.sh /app/full/tools.sh

 ## Base image
-FROM ubuntu:$UBUNTU_VERSION AS base
+FROM docker.io/ubuntu:$UBUNTU_VERSION AS base

 ARG BUILD_DATE=N/A
 ARG APP_VERSION=N/A
--- a/.devops/zendnn.Dockerfile
+++ b/.devops/zendnn.Dockerfile
@@ -3,7 +3,7 @@ ARG BUILD_DATE=N/A
 ARG APP_VERSION=N/A
 ARG APP_REVISION=N/A

-FROM ubuntu:$UBUNTU_VERSION AS build
+FROM docker.io/ubuntu:$UBUNTU_VERSION AS build

 RUN apt-get update && \
    apt-get install -y gcc-13 g++-13 build-essential git cmake libssl-dev libomp-dev libnuma-dev python3 ca-certificates
@@ -30,7 +30,7 @@ RUN mkdir -p /app/full \
    && cp .devops/tools.sh /app/full/tools.sh

 ## Base image
-FROM ubuntu:$UBUNTU_VERSION AS base
+FROM docker.io/ubuntu:$UBUNTU_VERSION AS base

 ARG BUILD_DATE=N/A
 ARG APP_VERSION=N/A
--- a/.github/labeler.yml
+++ b/.github/labeler.yml
@@ -12,7 +12,7 @@ SYCL:
            - ggml/src/ggml-sycl/**
            - docs/backend/SYCL.md
            - examples/sycl/**
-Nvidia GPU:
+CUDA:
    - changed-files:
        - any-glob-to-any-file:
            - ggml/include/ggml-cuda.h
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -783,6 +783,8 @@ jobs:
          name: cudart-llama-bin-win-cuda-${{ matrix.cuda }}-x64.zip

  windows-sycl:
+    needs: [check-release]
+    if: ${{ needs.check-release.outputs.should_release == 'true' }}

    runs-on: windows-2022

@@ -891,6 +893,8 @@ jobs:
          name: llama-bin-win-sycl-x64.zip

  ubuntu-24-sycl:
+    needs: [check-release]
+    if: ${{ needs.check-release.outputs.should_release == 'true' }}

    strategy:
      matrix:
--- a/common/chat.cpp
+++ b/common/chat.cpp
@@ -1979,6 +1979,146 @@ static common_chat_params common_chat_params_init_deepseek_v3_2(const common_cha
    return data;
 }

+// Cohere2 MoE (a.k.a. "North Code") parser.
+//
+// The assistant turn is fully marker-wrapped:
+//   <|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>
+//     <|START_THINKING|>{reasoning}<|END_THINKING|>
+//     then EITHER content:    <|START_TEXT|>{content}<|END_TEXT|>
+//          OR     tool calls: <|START_ACTION|>[
+//                                 {"tool_call_id": "0", "tool_name": "f", "parameters": {...}}, ...
+//                             ]<|END_ACTION|>
+//   <|END_OF_TURN_TOKEN|>
+//
+// The generation prompt forces a leading <|START_THINKING|> (when reasoning is enabled, which is
+// the template default), so the model's output continues from *inside* the thinking block. The
+// parser literal therefore only covers the stable <|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|> prefix
+// and the reasoning rule consumes the <|START_THINKING|> ... <|END_THINKING|> markers itself,
+// regardless of whether they came from the generation prompt or the generated text.
+static common_chat_params common_chat_params_init_cohere2moe(const common_chat_template &          tmpl,
+                                                              const autoparser::generation_params & inputs) {
+    common_chat_params data;
+
+    const std::string TURN_START    = "<|START_OF_TURN_TOKEN|>";
+    const std::string TURN_END      = "<|END_OF_TURN_TOKEN|>";
+    const std::string CHATBOT       = "<|CHATBOT_TOKEN|>";
+    const std::string USER          = "<|USER_TOKEN|>";
+    const std::string SYSTEM        = "<|SYSTEM_TOKEN|>";
+    const std::string THINK_START   = "<|START_THINKING|>";
+    const std::string THINK_END     = "<|END_THINKING|>";
+    const std::string TEXT_START    = "<|START_TEXT|>";
+    const std::string TEXT_END      = "<|END_TEXT|>";
+    const std::string ACTION_START  = "<|START_ACTION|>";
+    const std::string ACTION_END    = "<|END_ACTION|>";
+    const std::string RESULT_START  = "<|START_TOOL_RESULT|>";
+    const std::string RESULT_END    = "<|END_TOOL_RESULT|>";
+
+    // Stable prefix of the generation prompt that precedes the (forced) <|START_THINKING|> marker.
+    const std::string GEN_PREFIX = TURN_START + CHATBOT;
+
+    data.prompt             = common_chat_template_direct_apply_impl(tmpl, inputs);
+    data.generation_prompt  = common_chat_template_generation_prompt_impl(tmpl, inputs);
+    data.format             = COMMON_CHAT_FORMAT_PEG_NATIVE;
+    data.supports_thinking  = true;
+    data.thinking_start_tag = THINK_START;
+    data.thinking_end_tag   = THINK_END;
+    data.preserved_tokens   = {
+        TURN_START, TURN_END, CHATBOT, USER, SYSTEM,
+        THINK_START, THINK_END,
+        TEXT_START, TEXT_END,
+        ACTION_START, ACTION_END,
+        RESULT_START, RESULT_END,
+    };
+
+    // Split the rendered prompt into per-role message spans. Tool results are rendered with the
+    // system token followed by <|START_TOOL_RESULT|>, so the "tool" delimiter must be listed before
+    // the plain "system" one (the system delimiter is a prefix of it, and the role split tries delimiters in order).
+    data.message_spans = common_chat_split_by_role(data.prompt, {
+        { "assistant", GEN_PREFIX },
+        { "user",      TURN_START + USER },
+        { "tool",      TURN_START + SYSTEM + RESULT_START },
+        { "system",    TURN_START + SYSTEM },
+    });
+
+    auto has_tools         = inputs.tools.is_array() && !inputs.tools.empty();
+    auto extract_reasoning = inputs.reasoning_format != COMMON_REASONING_FORMAT_NONE;
+    auto include_grammar   = has_tools && inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_NONE;
+
+    if (inputs.has_continuation()) {
+        const auto & msg = inputs.continue_msg;
+
+        data.generation_prompt = GEN_PREFIX + THINK_START + msg.reasoning_content;
+        if (inputs.continue_final_message == COMMON_CHAT_CONTINUATION_CONTENT) {
+            data.generation_prompt += THINK_END + TEXT_START + msg.render_content();
+        }
+
+        data.prompt += data.generation_prompt;
+    }
+
+    auto parser = build_chat_peg_parser([&](common_chat_peg_builder & p) {
+        auto generation_prompt = p.literal(GEN_PREFIX);
+        auto end               = p.end();
+
+        // The thinking block is normally present (the generation prompt forces <|START_THINKING|> when
+        // reasoning is enabled), but it is parsed as optional. When extracting reasoning, capture its body;
+        // otherwise keep the whole block (markers included) inline as content (reasoning_format=NONE).
+        common_peg_parser reasoning = p.eps();
+        if (extract_reasoning) {
+            reasoning = p.optional(p.literal(THINK_START) +
+                                   p.reasoning(p.until_one_of({ THINK_END, TEXT_START, ACTION_START })) +
+                                   p.optional(p.literal(THINK_END)));
+        } else {
+            reasoning = p.optional(p.content(p.literal(THINK_START) +
+                                             p.until_one_of({ THINK_END, TEXT_START, ACTION_START }) +
+                                             p.optional(p.literal(THINK_END))));
+        }
+
+        auto text_content = p.literal(TEXT_START) + p.content(p.until(TEXT_END)) + p.optional(p.literal(TEXT_END));
+
+        if (!has_tools || inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_NONE) {
+            return generation_prompt + reasoning + text_content + p.optional(p.literal(TURN_END)) + end;
+        }
+
+        auto require_tools = inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_REQUIRED;
+
+        // <|START_ACTION|>[ {"tool_call_id": "0", "tool_name": "f", "parameters": {...}}, ... ]<|END_ACTION|>
+        auto tool_calls = p.standard_json_tools(ACTION_START, ACTION_END, inputs.tools, inputs.parallel_tool_calls,
+                                                /* force_tool_calls = */ true,
+                                                /* name_key         = */ "tool_name",
+                                                /* args_key         = */ "parameters",
+                                                /* array_wrapped    = */ true,
+                                                /* function_is_key  = */ false,
+                                                /* call_id_key      = */ "",
+                                                /* gen_call_id_key  = */ "tool_call_id",
+                                                /* parameters_order = */ { "tool_call_id", "tool_name", "parameters" });
+
+        // Content and tool calls are mutually exclusive here; tool_choice=required accepts tool calls only.
+        common_peg_parser body = require_tools ? tool_calls : p.choice({ tool_calls, text_content });
+
+        return generation_prompt + reasoning + body + p.optional(p.literal(TURN_END)) + end;
+    });
+
+    data.parser = parser.save();
+
+    if (include_grammar) {
+        data.grammar_lazy = inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_AUTO;
+        data.grammar      = build_grammar([&](const common_grammar_builder & builder) {
+            foreach_function(inputs.tools, [&](const json & tool) {
+                const auto & function = tool.at("function");
+                auto         schema   = function.at("parameters");
+                builder.resolve_refs(schema);
+            });
+            parser.build_grammar(builder, data.grammar_lazy);
+        });
+
+        data.grammar_triggers = {
+            { COMMON_GRAMMAR_TRIGGER_TYPE_WORD, ACTION_START }
+        };
+    }
+
+    return data;
+}
+
 namespace workaround {

 static void map_developer_role_to_system(json & messages) {
@@ -2227,6 +2367,15 @@ std::optional<common_chat_params> common_chat_try_specialized_template(
        return common_chat_params_init_kimi_k2(tmpl, params);
    }

+    // Cohere2 MoE / North Code - marker-wrapped format with <|START_TEXT|> content and
+    // <|START_ACTION|> JSON tool calls. <|START_TEXT|> is unique to this template (the older
+    // Command-R templates use <|START_RESPONSE|>).
+    if (src.find("<|START_TEXT|>") != std::string::npos &&
+        src.find("<|START_ACTION|>") != std::string::npos) {
+        LOG_DBG("Using specialized template: Cohere2 MoE\n");
+        return common_chat_params_init_cohere2moe(tmpl, params);
+    }
+
    if (is_lfm2_template(src)) {
        LOG_DBG("Using specialized template: LFM2\n");
        return common_chat_params_init_lfm2(tmpl, params, /* tool_list_tokens = */ true);
--- a/common/jinja/runtime.cpp
+++ b/common/jinja/runtime.cpp
@@ -316,12 +316,22 @@ value filter_expression::execute_impl(context & ctx) {

    JJ_DEBUG("Applying filter to %s", input->type().c_str());

+    auto set_filter_alias = [](auto & filter_id) { // rewrites alias filter names in place; other names pass through
+        if (filter_id == "count") {
+            filter_id = "length"; // alias
+        } else if (filter_id == "d") {
+            filter_id = "default"; // alias
+        } else if (filter_id == "e") {
+            filter_id = "escape"; // alias
+        } else if (filter_id == "trim") {
+            filter_id = "strip"; // alias
+        }
+    };
+
    if (is_stmt<identifier>(filter)) {
        auto filter_id = cast_stmt<identifier>(filter)->val;

-        if (filter_id == "trim") {
-            filter_id = "strip"; // alias
-        }
+        set_filter_alias(filter_id);
        JJ_DEBUG("Applying filter '%s' to %s", filter_id.c_str(), input->type().c_str());
        // TODO: Refactor filters so this coercion can be done automatically
        if (!input->is_undefined() && !is_val<value_string>(input) && (
@@ -345,9 +355,7 @@ value filter_expression::execute_impl(context & ctx) {
        }
        auto filter_id = cast_stmt<identifier>(call->callee)->val;

-        if (filter_id == "trim") {
-            filter_id = "strip"; // alias
-        }
+        set_filter_alias(filter_id);
        JJ_DEBUG("Applying filter '%s' with arguments to %s", filter_id.c_str(), input->type().c_str());
        func_args args(ctx);
        for (const auto & arg_expr : call->args) {
--- a/convert_lora_to_gguf.py
+++ b/convert_lora_to_gguf.py
@@ -25,7 +25,7 @@ import gguf
 from gguf.constants import GGUFValueType

 # reuse model definitions from the conversion/ package
-from conversion import LazyTorchTensor, ModelBase, get_model_class
+from conversion import LazyTorchTensor, ModelBase, get_model_class, ModelType, get_model_architecture

 logger = logging.getLogger("lora-to-gguf")

@@ -396,12 +396,12 @@ if __name__ == '__main__':
        hparams = ModelBase.load_hparams(dir_base_model, False)

    with torch.inference_mode():
+        model_arch = get_model_architecture(hparams, ModelType.TEXT)
        try:
-            model_arch = hparams.get("text_config", {}).get("architectures", hparams["architectures"])[0]
-            logger.info("Using model architecture: %s", model_arch)
            model_class = get_model_class(model_arch)
+            logger.info("Using model architecture: %s", model_arch)
        except NotImplementedError:
-            logger.error(f"Model {hparams['architectures'][0]} is not supported")
+            logger.error(f"Model {model_arch} is not supported")
            sys.exit(1)

        class LoraModel(model_class):  # ty: ignore[unsupported-base]
--- a/docs/backend/CUDA-FEDORA.md
+++ b/docs/backend/CUDA-FEDORA.md
@@ -270,7 +270,7 @@ You have successfully set up CUDA on Fedora within a toolbox environment using t

 ---

-**Disclaimer:** Manually installing and modifying system packages can lead to instability of the container. The above steps are provided as a guideline and may need adjustments based on your specific system configuration. Always back up important data before making significant system changes, especially as your home folder is writable and shared with he toolbox.
+**Disclaimer:** Manually installing and modifying system packages can lead to instability of the container. The above steps are provided as a guideline and may need adjustments based on your specific system configuration. Always back up important data before making significant system changes, especially as your home folder is writable and shared with the toolbox.

 **Acknowledgments:** Special thanks to the Fedora community and NVIDIA documentation for providing resources that assisted in creating this guide.

--- a/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_decls.tmpl
+++ b/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_decls.tmpl
@@ -98,6 +98,7 @@ fn init_shmem_src0(thread_id: u32, batch_offset: u32, offset_m: u32, k_outer: u3
 }
 #endif // INIT_SRC0_SHMEM_Q1_0

+// legacy-quants
 #if defined(INIT_SRC0_SHMEM_Q4_0) || defined(INIT_SRC0_SHMEM_Q4_1) || defined(INIT_SRC0_SHMEM_Q5_0) || defined(INIT_SRC0_SHMEM_Q5_1) || defined(INIT_SRC0_SHMEM_Q8_0) || defined(INIT_SRC0_SHMEM_Q8_1) || defined(INIT_SRC0_SHMEM_MXFP4)
 const BLOCK_SIZE = 32u;
 // the number of blocks per k-tile. Note that this currently only works if TILE_K is a multiple of BLOCK_SIZE, which may need to be rethought for larger quantized types.
@@ -124,7 +125,7 @@ fn init_shmem_src0(thread_id: u32, batch_offset: u32, offset_m: u32, k_outer: u3
        if (global_m < params.m && global_block_k < params.k / BLOCK_SIZE) {
            let src0_idx = batch_offset + global_m * params.stride_01 + global_block_k;

-#ifdef INIT_SRC0_SHMEM_Q4_0
+#if defined(INIT_SRC0_SHMEM_Q4_0)
            let block_byte_base = src0_idx * 18u; // BLOCK_SIZE_BYTES = 18u;
            let d = load_f16_at_src0(block_byte_base);

@@ -134,7 +135,9 @@ fn init_shmem_src0(thread_id: u32, batch_offset: u32, offset_m: u32, k_outer: u3
                let q_packed = load_u32_at_src0(q_byte_offset);
                dequant_q4_0_packed_to_shmem(q_packed, d, shmem_idx + j * BYTES_PER_INNER_LOOP);
            }
-#elif INIT_SRC0_SHMEM_Q4_1
+#endif // INIT_SRC0_SHMEM_Q4_0
+
+#if defined(INIT_SRC0_SHMEM_Q4_1)
            let block_byte_base = src0_idx * 20u; // BLOCK_SIZE_BYTES = 20u;
            let dm = unpack2x16float(load_u32_at_src0_aligned(block_byte_base));
            let d = f16(dm[0]);
@@ -153,7 +156,9 @@ fn init_shmem_src0(thread_id: u32, batch_offset: u32, offset_m: u32, k_outer: u3
                    shmem[shmem_idx + j * BYTES_PER_INNER_LOOP + k + 16u] = q_hi;
                }
            }
-#elif INIT_SRC0_SHMEM_Q5_0
+#endif // INIT_SRC0_SHMEM_Q4_1
+
+#if defined(INIT_SRC0_SHMEM_Q5_0)
            let block_byte_base = src0_idx * 22u; // BLOCK_SIZE_BYTES = 22u;

            let d  = load_f16_at_src0(block_byte_base);
@@ -176,7 +181,9 @@ fn init_shmem_src0(thread_id: u32, batch_offset: u32, offset_m: u32, k_outer: u3
                    shmem[shmem_idx + j * BYTES_PER_INNER_LOOP + k + 16u] = q_hi;
                }
            }
-#elif INIT_SRC0_SHMEM_Q5_1
+#endif // INIT_SRC0_SHMEM_Q5_0
+
+#if defined(INIT_SRC0_SHMEM_Q5_1)
            let block_byte_base = src0_idx * 24u; // BLOCK_SIZE_BYTES = 24u;

            let dm = unpack2x16float(load_u32_at_src0_aligned(block_byte_base));
@@ -201,7 +208,9 @@ fn init_shmem_src0(thread_id: u32, batch_offset: u32, offset_m: u32, k_outer: u3
                    shmem[shmem_idx + j * BYTES_PER_INNER_LOOP + k + 16u] = q_hi;
                }
            }
-#elif INIT_SRC0_SHMEM_Q8_0
+#endif // INIT_SRC0_SHMEM_Q5_1
+
+#if defined(INIT_SRC0_SHMEM_Q8_0)
            let block_byte_base = src0_idx * 34u; // BLOCK_SIZE_BYTES = 34u;
            let d = load_f16_at_src0(block_byte_base);

@@ -211,7 +220,9 @@ fn init_shmem_src0(thread_id: u32, batch_offset: u32, offset_m: u32, k_outer: u3
                let q_packed = load_u32_at_src0(q_byte_offset);
                dequant_q8_0_packed_to_shmem(q_packed, d, shmem_idx + j * BYTES_PER_INNER_LOOP);
            }
-#elif INIT_SRC0_SHMEM_Q8_1
+#endif // INIT_SRC0_SHMEM_Q8_0
+
+#if defined(INIT_SRC0_SHMEM_Q8_1)
            let block_byte_base = src0_idx * 36u; // BLOCK_SIZE_BYTES = 36u;
            let dm = unpack2x16float(load_u32_at_src0_aligned(block_byte_base));
            let d = f16(dm[0]);
@@ -227,7 +238,9 @@ fn init_shmem_src0(thread_id: u32, batch_offset: u32, offset_m: u32, k_outer: u3
                    shmem[shmem_idx + j * BYTES_PER_INNER_LOOP + k] = q_val;
                }
            }
-#elif INIT_SRC0_SHMEM_MXFP4
+#endif // INIT_SRC0_SHMEM_Q8_1
+
+#if defined(INIT_SRC0_SHMEM_MXFP4)
            let block_byte_base = src0_idx * 17u;
            let eu8 = get_byte(load_u32_at_src0_aligned(block_byte_base), block_byte_base & 3u);
            let e = ldexp(1.0, i32(eu8) - 128);
@@ -244,11 +257,11 @@ fn init_shmem_src0(thread_id: u32, batch_offset: u32, offset_m: u32, k_outer: u3
                    shmem[shmem_idx + j * BYTES_PER_INNER_LOOP + k + 16u] = f16(q_hi);
                }
            }
-#endif
+#endif // INIT_SRC0_SHMEM_MXFP4
        }
    }
 }
-#endif
+#endif // legacy-quants

 // k-quants
 #if defined(INIT_SRC0_SHMEM_Q2_K) || defined(INIT_SRC0_SHMEM_Q3_K) || defined(INIT_SRC0_SHMEM_Q4_K) || defined(INIT_SRC0_SHMEM_Q5_K) || defined(INIT_SRC0_SHMEM_Q6_K)
@@ -284,7 +297,7 @@ fn init_shmem_src0(thread_id: u32, batch_offset: u32, offset_m: u32, k_outer: u3

        let src0_idx = batch_offset + global_m * params.stride_01 + block_k;

-#ifdef INIT_SRC0_SHMEM_Q2_K
+#if defined(INIT_SRC0_SHMEM_Q2_K)
        let block_byte_base  = src0_idx * 84u; // BLOCK_SIZE_BYTES =  84u;
        let scales_byte_base = block_byte_base;
        let qs_byte_base     = block_byte_base + 16u;
@@ -314,7 +327,9 @@ fn init_shmem_src0(thread_id: u32, batch_offset: u32, offset_m: u32, k_outer: u3
        let ml = dmin * f16(scale >> 4u);

        store_shmem_kquants(qs_vec4 * dl - ml, elem_idx);
-#elif INIT_SRC0_SHMEM_Q3_K
+#endif // INIT_SRC0_SHMEM_Q2_K
+
+#if defined(INIT_SRC0_SHMEM_Q3_K)
        let block_byte_base  = src0_idx * 110u; // BLOCK_SIZE_BYTES = 110u;
        let hmask_byte_base  = block_byte_base +  0u;
        let qs_byte_base     = block_byte_base + 32u;
@@ -355,7 +370,9 @@ fn init_shmem_src0(thread_id: u32, batch_offset: u32, offset_m: u32, k_outer: u3
        let dl         = d_all * (f16((scale_hi2 << 4u) | scale_low4) - 32.0);

        store_shmem_kquants(dl * q_vec4, elem_idx);
-#elif INIT_SRC0_SHMEM_Q4_K
+#endif // INIT_SRC0_SHMEM_Q3_K
+
+#if defined(INIT_SRC0_SHMEM_Q4_K)
        let block_byte_base = src0_idx * 144u; // BLOCK_SIZE_BYTES = 144u;
        let dm_byte_base    = block_byte_base +  0u;
        let scale_byte_base = block_byte_base +  4u;
@@ -399,7 +416,9 @@ fn init_shmem_src0(thread_id: u32, batch_offset: u32, offset_m: u32, k_outer: u3
        let ml = dmin * f16(mn);

        store_shmem_kquants(dl * qs_vec4 - vec4(ml, ml, ml, ml), elem_idx);
-#elif INIT_SRC0_SHMEM_Q5_K
+#endif // INIT_SRC0_SHMEM_Q4_K
+
+#if defined(INIT_SRC0_SHMEM_Q5_K)
        let block_byte_base = src0_idx * 176u; // BLOCK_SIZE_BYTES = 176u;
        let dm_byte_base    = block_byte_base +  0u;
        let scale_byte_base = block_byte_base +  4u;
@@ -456,7 +475,9 @@ fn init_shmem_src0(thread_id: u32, batch_offset: u32, offset_m: u32, k_outer: u3
        let ml = dmin * f16(mn);

        store_shmem_kquants((qh_vec4 + qs_lo4_vec4) * dl - vec4<f16>(ml, ml, ml, ml), elem_idx);
-#elif INIT_SRC0_SHMEM_Q6_K
+#endif // INIT_SRC0_SHMEM_Q5_K
+
+#if defined(INIT_SRC0_SHMEM_Q6_K)
        let block_byte_base  = src0_idx * 210u; // BLOCK_SIZE_BYTES = 210u;
        let ql_byte_base     = block_byte_base;
        let qh_byte_base     = block_byte_base + 128u;
@@ -497,17 +518,18 @@ fn init_shmem_src0(thread_id: u32, batch_offset: u32, offset_m: u32, k_outer: u3
        let scale      = get_byte_i32(scale_word, scale_byte & 3u);

        store_shmem_kquants(d * q_vec4 * f16(scale), elem_idx);
-#endif
+#endif // INIT_SRC0_SHMEM_Q6_K
    }
 }
 #endif // k-quants

-#ifdef INIT_SRC0_SHMEM_IQ4_NL
+#if defined(INIT_SRC0_SHMEM_IQ4_NL)
 const BLOCK_SIZE = 32u;
 const BLOCK_SIZE_BYTES = 18u;
+const NQ = 4u;

 fn init_shmem_src0(thread_id: u32, batch_offset: u32, offset_m: u32, k_outer: u32) {
-    for (var elem_idx = thread_id; elem_idx < TILE_SRC0_SHMEM; elem_idx += TOTAL_WORKGROUP_SIZE) {
+    for (var elem_idx = thread_id * NQ; elem_idx < TILE_SRC0_SHMEM; elem_idx += NQ * TOTAL_WORKGROUP_SIZE) {
        let tile_m = elem_idx / TILE_K;
        let tile_k = elem_idx % TILE_K;
        let global_m = offset_m + tile_m;
@@ -519,408 +541,464 @@ fn init_shmem_src0(thread_id: u32, batch_offset: u32, offset_m: u32, k_outer: u3
        }

        let block_k    = global_k / BLOCK_SIZE;
-        let k_in_block = global_k % BLOCK_SIZE;
+        let k_in_block = global_k % BLOCK_SIZE; // k_in_block % 4 == 0;
+
+        let src0_idx = batch_offset + global_m * params.stride_01 + block_k;

-        let src0_idx        = batch_offset + global_m * params.stride_01 + block_k;
        let block_byte_base = src0_idx * BLOCK_SIZE_BYTES;
-        let d               = load_f16_at_src0(block_byte_base);
+        let d_byte_base     = block_byte_base +  0u;
+        let qs_byte_base    = block_byte_base +  2u;

-        let pos       = k_in_block % 16u;
-        let nib_shift = (k_in_block / 16u) * 4u;
-        let q_packed  = load_u32_at_src0(block_byte_base + 2u + (pos / 4u) * 4u);
-        let nib       = (get_byte(q_packed, pos % 4u) >> nib_shift) & 0xFu;
+        let d = load_f16_at_src0(d_byte_base);

-        shmem[elem_idx] = d * f16(kvalues_iq4nl[nib]);
+        let id_qtr      = (k_in_block % 16u) / 4u;
+        let shift_phase = k_in_block / 16u;
+
+        let qs_u32    = load_u32_at_src0(qs_byte_base + 4u * id_qtr);
+
+        shmem[elem_idx + 0u] = d * f16(kvalues_iq4nl[(qs_u32 >> ( 0u + 4u * shift_phase)) & 0xFu]);
+        shmem[elem_idx + 1u] = d * f16(kvalues_iq4nl[(qs_u32 >> ( 8u + 4u * shift_phase)) & 0xFu]);
+        shmem[elem_idx + 2u] = d * f16(kvalues_iq4nl[(qs_u32 >> (16u + 4u * shift_phase)) & 0xFu]);
+        shmem[elem_idx + 3u] = d * f16(kvalues_iq4nl[(qs_u32 >> (24u + 4u * shift_phase)) & 0xFu]);
    }
 }
 #endif // INIT_SRC0_SHMEM_IQ4_NL

-#ifdef INIT_SRC0_SHMEM_IQ4_XS
+// i-quants (super block size: 256)
+#if defined(INIT_SRC0_SHMEM_IQ4_XS) || defined(INIT_SRC0_SHMEM_IQ1_S) || defined(INIT_SRC0_SHMEM_IQ1_M) || defined(INIT_SRC0_SHMEM_IQ2_XXS) \
+|| defined(INIT_SRC0_SHMEM_IQ2_XS) || defined(INIT_SRC0_SHMEM_IQ2_S) || defined(INIT_SRC0_SHMEM_IQ3_XXS) || defined(INIT_SRC0_SHMEM_IQ3_S)
 const BLOCK_SIZE = 256u;
-const BLOCK_SIZE_BYTES = 136u;
+const NQ = 16u; // elements decoded per thread per loop iteration: init_shmem_src0 below issues four vec4 stores at elem_idx + 0, 4, 8 and 12
+
+fn store_shmem_iquants(val: vec4<f16>, idx: u32) { // Writes the four lanes of `val` to shmem[idx], shmem[idx + 1], shmem[idx + 2], shmem[idx + 3].
+    shmem[idx] = val.x; // no bounds check is performed in this helper
+    shmem[idx + 1] = val.y;
+    shmem[idx + 2] = val.z;
+    shmem[idx + 3] = val.w;
+}
+
+fn load_byte_at_src0_aligned(byte_offset: u32) -> u32 { // Fetches a 32-bit word via load_u32_at_src0_aligned and returns the byte lane selected by byte_offset % 4.
+    return get_byte(load_u32_at_src0_aligned(byte_offset), byte_offset % 4u); // NOTE(review): presumably the aligned loader rounds byte_offset down to its containing word — confirm against its definition
+}
+
+#if defined(INIT_SRC0_SHMEM_IQ1_M) || defined(INIT_SRC0_SHMEM_IQ1_S)
+fn create_iq_gw4(dl: f32, gw: u32, shift_base: u32, delta: f32) -> vec4<f16> { // Decodes four consecutive 2-bit fields of `gw`, starting at bit `shift_base`; each lane is f16(dl * (field + delta)).
+    return vec4<f16>(
+            f16(dl * (f32((bitcast<i32>(((gw >> (shift_base + 0u)) & 3u) << 30u) >> 30u)) + delta)), // `<< 30u` then `>> 30u` on the i32 bitcast sign-extends the 2-bit field
+            f16(dl * (f32((bitcast<i32>(((gw >> (shift_base + 2u)) & 3u) << 30u) >> 30u)) + delta)), // next field, 2 bits further up
+            f16(dl * (f32((bitcast<i32>(((gw >> (shift_base + 4u)) & 3u) << 30u) >> 30u)) + delta)),
+            f16(dl * (f32((bitcast<i32>(((gw >> (shift_base + 6u)) & 3u) << 30u) >> 30u)) + delta)), // arithmetic is done in f32; only the final result is narrowed to f16
+        );
+}
+#endif // INIT_SRC0_SHMEM_IQ1_M || INIT_SRC0_SHMEM_IQ1_S
+
+#if defined(INIT_SRC0_SHMEM_IQ4_XS)
+fn create_iq_gw4(dl: f16, qs_u32: u32, shift_phase: u32) -> vec4<f16> { // Takes one 4-bit index from each byte of `qs_u32`, looks it up in kvalues_iq4nl and scales it by `dl`.
+    return vec4<f16>(
+            dl * f16(kvalues_iq4nl[(qs_u32 >> (4 * shift_phase +  0u)) & 0xFu]), // 4 * shift_phase picks the nibble inside the byte (0 -> low, 1 -> high); the caller in this file passes 0 or 1
+            dl * f16(kvalues_iq4nl[(qs_u32 >> (4 * shift_phase +  8u)) & 0xFu]), // +8u per lane steps to the next byte of qs_u32
+            dl * f16(kvalues_iq4nl[(qs_u32 >> (4 * shift_phase + 16u)) & 0xFu]),
+            dl * f16(kvalues_iq4nl[(qs_u32 >> (4 * shift_phase + 24u)) & 0xFu]),
+        );
+}
+#endif // INIT_SRC0_SHMEM_IQ4_XS
+
+#if defined(INIT_SRC0_SHMEM_IQ2_XXS)
+fn create_iq_gw4(ig: u32, grid_phase: u32) -> vec4<f32> { // Reads four consecutive bytes of iq2xxs_grid, starting at byte index ig + grid_phase, and widens each to f32.
+    return vec4<f32>(
+            f32(get_byte(iq2xxs_grid[(ig + grid_phase + 0u) / 4u], (ig + grid_phase + 0u) % 4u)), // byte index / 4 selects the u32 word, % 4 selects the byte lane inside it
+            f32(get_byte(iq2xxs_grid[(ig + grid_phase + 1u) / 4u], (ig + grid_phase + 1u) % 4u)),
+            f32(get_byte(iq2xxs_grid[(ig + grid_phase + 2u) / 4u], (ig + grid_phase + 2u) % 4u)),
+            f32(get_byte(iq2xxs_grid[(ig + grid_phase + 3u) / 4u], (ig + grid_phase + 3u) % 4u)), // the caller in this file passes ig as a multiple of 8 and grid_phase as 0 or 4
+        );
+}
+#endif // INIT_SRC0_SHMEM_IQ2_XXS
+
+#if defined(INIT_SRC0_SHMEM_IQ2_XS)
+fn create_iq_gw4(ig: u32, grid_phase: u32) -> vec4<f32> { // Reads four consecutive bytes of iq2xs_grid, starting at byte index ig + grid_phase, and widens each to f32.
+    return vec4<f32>(
+            f32(get_byte(iq2xs_grid[(ig + grid_phase + 0u) / 4u], (ig + grid_phase + 0u) % 4u)), // byte index / 4 selects the u32 word, % 4 selects the byte lane inside it
+            f32(get_byte(iq2xs_grid[(ig + grid_phase + 1u) / 4u], (ig + grid_phase + 1u) % 4u)),
+            f32(get_byte(iq2xs_grid[(ig + grid_phase + 2u) / 4u], (ig + grid_phase + 2u) % 4u)),
+            f32(get_byte(iq2xs_grid[(ig + grid_phase + 3u) / 4u], (ig + grid_phase + 3u) % 4u)), // the caller in this file passes ig as a multiple of 8 and grid_phase as 0 or 4
+        );
+}
+#endif // INIT_SRC0_SHMEM_IQ2_XS
+
+#if defined(INIT_SRC0_SHMEM_IQ2_S)
+fn create_iq_gw4(ig: u32, grid_phase: u32) -> vec4<f32> { // Reads four consecutive bytes of iq2s_grid, starting at byte index ig + grid_phase, and widens each to f32.
+    return vec4<f32>(
+            f32(get_byte(iq2s_grid[(ig + grid_phase + 0u) / 4u], (ig + grid_phase + 0u) % 4u)), // byte index / 4 selects the u32 word, % 4 selects the byte lane inside it
+            f32(get_byte(iq2s_grid[(ig + grid_phase + 1u) / 4u], (ig + grid_phase + 1u) % 4u)),
+            f32(get_byte(iq2s_grid[(ig + grid_phase + 2u) / 4u], (ig + grid_phase + 2u) % 4u)),
+            f32(get_byte(iq2s_grid[(ig + grid_phase + 3u) / 4u], (ig + grid_phase + 3u) % 4u)), // the caller in this file passes ig as a multiple of 8 and grid_phase as 0 or 4
+        );
+}
+#endif // INIT_SRC0_SHMEM_IQ2_S
+
+#if defined(INIT_SRC0_SHMEM_IQ3_XXS)
+fn create_iq_gw4(ig: u32) -> vec4<f32> { // Unpacks the u32 word iq3xxs_grid[ig] into four f32 lanes, one per byte lane 0..3 of get_byte.
+    return vec4<f32>(
+            f32(get_byte(iq3xxs_grid[ig], 0)), // here `ig` indexes whole words; there is no byte-offset arithmetic
+            f32(get_byte(iq3xxs_grid[ig], 1)),
+            f32(get_byte(iq3xxs_grid[ig], 2)),
+            f32(get_byte(iq3xxs_grid[ig], 3)),
+        );
+}
+#endif // INIT_SRC0_SHMEM_IQ3_XXS
+
+#if defined(INIT_SRC0_SHMEM_IQ3_S)
+fn create_iq_gw4(ig: u32) -> vec4<f32> { // Unpacks the u32 word iq3s_grid[ig] into four f32 lanes, one per byte lane 0..3 of get_byte.
+    return vec4<f32>(
+            f32(get_byte(iq3s_grid[ig], 0)), // here `ig` indexes whole words; there is no byte-offset arithmetic
+            f32(get_byte(iq3s_grid[ig], 1)),
+            f32(get_byte(iq3s_grid[ig], 2)),
+            f32(get_byte(iq3s_grid[ig], 3)),
+        );
+}
+#endif // INIT_SRC0_SHMEM_IQ3_S
+
+#if defined(INIT_SRC0_SHMEM_IQ2_XXS) || defined(INIT_SRC0_SHMEM_IQ2_XS) || defined(INIT_SRC0_SHMEM_IQ2_S) \
+|| defined(INIT_SRC0_SHMEM_IQ3_XXS) || defined(INIT_SRC0_SHMEM_IQ3_S)
+fn create_iq2_m4(signs: u32, mask_phase: u32) -> vec4<f32> { // Builds four sign multipliers (+1.0 or -1.0) from the bits of `signs` tested against kmask_iq2xs[mask_phase].
+    return vec4<f32>(
+            select(1.0, -1.0, (get_byte(kmask_iq2xs[mask_phase], 0) & signs) != 0u), // select(f, t, cond): the lane is -1.0 when the mask byte shares a set bit with `signs`, otherwise 1.0
+            select(1.0, -1.0, (get_byte(kmask_iq2xs[mask_phase], 1) & signs) != 0u),
+            select(1.0, -1.0, (get_byte(kmask_iq2xs[mask_phase], 2) & signs) != 0u),
+            select(1.0, -1.0, (get_byte(kmask_iq2xs[mask_phase], 3) & signs) != 0u), // callers in this file pass mask_phase 0 for the first four elements of a group of eight and 1 for the last four
+        );
+}
+#endif // INIT_SRC0_SHMEM_IQ2_XXS || IQ2_XS || IQ2_S || IQ3_XXS || IQ3_S

 fn init_shmem_src0(thread_id: u32, batch_offset: u32, offset_m: u32, k_outer: u32) {
-    for (var elem_idx = thread_id; elem_idx < TILE_SRC0_SHMEM; elem_idx += TOTAL_WORKGROUP_SIZE) {
+    for (var elem_idx = thread_id * NQ; elem_idx < TILE_SRC0_SHMEM; elem_idx += NQ * TOTAL_WORKGROUP_SIZE) {
        let tile_m = elem_idx / TILE_K;
        let tile_k = elem_idx % TILE_K;
        let global_m = offset_m + tile_m;
        let global_k = k_outer + tile_k;

        if (global_m >= params.m || global_k >= params.k) {
-            shmem[elem_idx] = f16(0.0);
+            let zero_vec4 = vec4<f16>(f16(0.0), f16(0.0), f16(0.0), f16(0.0));
+            store_shmem_iquants(zero_vec4, elem_idx +  0u);
+            store_shmem_iquants(zero_vec4, elem_idx +  4u);
+            store_shmem_iquants(zero_vec4, elem_idx +  8u);
+            store_shmem_iquants(zero_vec4, elem_idx + 12u);
            continue;
        }

        let block_k    = global_k / BLOCK_SIZE;
-        let k_in_block = global_k % BLOCK_SIZE;
+        let k_in_block = global_k % BLOCK_SIZE; // k_in_block % 16 == 0;

-        let src0_idx        = batch_offset + global_m * params.stride_01 + block_k;
-        let block_byte_base = src0_idx * BLOCK_SIZE_BYTES;
+        let src0_idx = batch_offset + global_m * params.stride_01 + block_k;

-        let d_scales_h = load_u32_at_src0(block_byte_base);
+#if defined(INIT_SRC0_SHMEM_IQ4_XS)
+        let block_byte_base    = src0_idx * 136u; // BLOCK_SIZE_BYTES = 136u;
+        let d_byte_base        = block_byte_base +  0u;
+        let scales_l_byte_base = block_byte_base +  4u;
+        let qs_byte_base       = block_byte_base +  8u;
+
+        let d_scales_h = load_u32_at_src0_aligned(d_byte_base);
        let d          = bitcast<vec2<f16>>(d_scales_h).x;
        let scales_h   = d_scales_h >> 16u;

-        let ib  = k_in_block / 32u;
-        let pos = k_in_block % 32u;
+        let sub_block = k_in_block / 32u;
+        let phase     = (k_in_block / NQ) % 2u;

-        let scales_l_word = load_u32_at_src0(block_byte_base + 4u);
-        let ls_lo         = (get_byte(scales_l_word, ib / 2u) >> ((ib & 1u) * 4u)) & 0xFu;
-        let ls_hi         = ((scales_h >> (2u * ib)) & 3u) << 4u;
-        let dl            = d * f16(i32(ls_lo | ls_hi) - 32);
+        let scales_l_u32 = load_u32_at_src0_aligned(scales_l_byte_base);
+        let ls_lo        = (get_byte(scales_l_u32, sub_block / 2u) >> (4u * (sub_block % 2u))) & 0xFu;
+        let ls_hi        = ((scales_h >> (2u * sub_block)) & 3u) << 4u;
+        let dl           = d * f16(i32(ls_lo | ls_hi) - 32);

-        let iqs       = ib * 16u + (pos % 16u);
-        let nib_shift = (pos / 16u) * 4u;
-        let q_packed  = load_u32_at_src0(block_byte_base + 8u + (iqs / 4u) * 4u);
-        let nib       = (get_byte(q_packed, iqs % 4u) >> nib_shift) & 0xFu;
+        let qs_0_3_u32   = load_u32_at_src0_aligned(qs_byte_base + 16u * sub_block +  0u);
+        let qs_4_7_u32   = load_u32_at_src0_aligned(qs_byte_base + 16u * sub_block +  4u);
+        let qs_8_11_u32  = load_u32_at_src0_aligned(qs_byte_base + 16u * sub_block +  8u);
+        let qs_12_15_u32 = load_u32_at_src0_aligned(qs_byte_base + 16u * sub_block + 12u);

-        shmem[elem_idx] = dl * f16(kvalues_iq4nl[nib]);
-    }
-}
+        store_shmem_iquants(create_iq_gw4(dl, qs_0_3_u32,   phase), elem_idx +  0u);
+        store_shmem_iquants(create_iq_gw4(dl, qs_4_7_u32,   phase), elem_idx +  4u);
+        store_shmem_iquants(create_iq_gw4(dl, qs_8_11_u32,  phase), elem_idx +  8u);
+        store_shmem_iquants(create_iq_gw4(dl, qs_12_15_u32, phase), elem_idx + 12u);
 #endif // INIT_SRC0_SHMEM_IQ4_XS

-#ifdef INIT_SRC0_SHMEM_IQ1_S
-const BLOCK_SIZE = 256u;
-const BLOCK_SIZE_BYTES = 50u;
+#if defined(INIT_SRC0_SHMEM_IQ1_S)
+        let block_byte_base = src0_idx * 50u; // BLOCK_SIZE_BYTES = 50u;
+        let d_byte_base     = block_byte_base +  0u;
+        let qs_byte_base    = block_byte_base +  2u;
+        let qh_byte_base    = block_byte_base + 34u;

-fn init_shmem_src0(thread_id: u32, batch_offset: u32, offset_m: u32, k_outer: u32) {
-    for (var elem_idx = thread_id; elem_idx < TILE_SRC0_SHMEM; elem_idx += TOTAL_WORKGROUP_SIZE) {
-        let tile_m = elem_idx / TILE_K;
-        let tile_k = elem_idx % TILE_K;
-        let global_m = offset_m + tile_m;
-        let global_k = k_outer + tile_k;
+        let d = load_f16_as_f32_at_src0(d_byte_base);

-        if (global_m >= params.m || global_k >= params.k) {
-            shmem[elem_idx] = f16(0.0);
-            continue;
-        }
+        let sub_block = k_in_block / 32u;
+        let phase     = (k_in_block / NQ) % 2u;

-        let block_k    = global_k / BLOCK_SIZE;
-        let k_in_block = global_k % BLOCK_SIZE;
+        let qh_u16 = load_u32_at_src0(qh_byte_base + sub_block * 2u) & 0xFFFFu;
+        let qs_u16 = load_u32_at_src0(qs_byte_base + sub_block * 4u + phase * 2u) & 0xFFFFu;

-        let src0_idx        = batch_offset + global_m * params.stride_01 + block_k;
-        let block_byte_base = src0_idx * BLOCK_SIZE_BYTES;
-        let d               = load_f16_as_f32_at_src0(block_byte_base);
+        let dl    = d * (2.0 * f32((qh_u16 >> 12u) & 7u) + 1.0);
+        let delta = select(IQ1_DELTA, -IQ1_DELTA, (qh_u16 & 0x8000u) != 0u);

-        let ib  = k_in_block / 32u;
-        let pos = k_in_block % 32u;
-        let l   = pos / 8u;
-        let j   = pos % 8u;
+        let gp0_grid_id = ((qs_u16 & 0xFFu) | (((qh_u16 >> (phase * 6u)) & 7u) << 8u)) * 8u;
+        let gp1_grid_id = (((qs_u16 >> 8) & 0xFFu) | (((qh_u16 >> (phase * 6u + 3u)) & 7u) << 8u)) * 8u;

-        let qh    = load_u32_at_src0(block_byte_base + 34u + ib * 2u) & 0xFFFFu;
-        let dl    = d * (2.0 * f32((qh >> 12u) & 7u) + 1.0);
-        let delta = select(IQ1_DELTA, -IQ1_DELTA, (qh & 0x8000u) != 0u);
+        let gp0_gw = iq1_grid[(gp0_grid_id) / 16u];
+        let gp1_gw = iq1_grid[(gp1_grid_id) / 16u];

-        let qs_w = load_u32_at_src0(block_byte_base + 2u + ib * 4u);
-        let ig   = (get_byte(qs_w, l) | (((qh >> (3u * l)) & 7u) << 8u)) * 8u;
+        let gp0_shift_base = (gp0_grid_id % 16u) * 2u;
+        let gp1_shift_base = (gp1_grid_id % 16u) * 2u;

-        let gw = iq1_grid[(ig + j) / 16u];
-        let g  = (gw >> (((ig + j) % 16u) * 2u)) & 3u;
-        let gs = bitcast<i32>(g << 30u) >> 30u;
-
-        shmem[elem_idx] = f16(dl * (f32(gs) + delta));
-    }
-}
+        store_shmem_iquants(create_iq_gw4(dl, gp0_gw, gp0_shift_base + 0u, delta), elem_idx +  0u);
+        store_shmem_iquants(create_iq_gw4(dl, gp0_gw, gp0_shift_base + 8u, delta), elem_idx +  4u);
+        store_shmem_iquants(create_iq_gw4(dl, gp1_gw, gp1_shift_base + 0u, delta), elem_idx +  8u);
+        store_shmem_iquants(create_iq_gw4(dl, gp1_gw, gp1_shift_base + 8u, delta), elem_idx + 12u);
 #endif // INIT_SRC0_SHMEM_IQ1_S

-#ifdef INIT_SRC0_SHMEM_IQ1_M
-const BLOCK_SIZE = 256u;
-const BLOCK_SIZE_BYTES = 56u;
+#if defined(INIT_SRC0_SHMEM_IQ1_M)
+        let block_byte_base  = src0_idx * 56u; // BLOCK_SIZE_BYTES = 56u;
+        let qs_byte_base     = block_byte_base +  0u;
+        let qh_byte_base     = block_byte_base + 32u;
+        let scales_byte_base = block_byte_base + 48u;

-fn init_shmem_src0(thread_id: u32, batch_offset: u32, offset_m: u32, k_outer: u32) {
-    for (var elem_idx = thread_id; elem_idx < TILE_SRC0_SHMEM; elem_idx += TOTAL_WORKGROUP_SIZE) {
-        let tile_m = elem_idx / TILE_K;
-        let tile_k = elem_idx % TILE_K;
-        let global_m = offset_m + tile_m;
-        let global_k = k_outer + tile_k;
-
-        if (global_m >= params.m || global_k >= params.k) {
-            shmem[elem_idx] = f16(0.0);
-            continue;
-        }
-
-        let block_k    = global_k / BLOCK_SIZE;
-        let k_in_block = global_k % BLOCK_SIZE;
-
-        let src0_idx        = batch_offset + global_m * params.stride_01 + block_k;
-        let block_byte_base = src0_idx * BLOCK_SIZE_BYTES;
-
-        let scales0 = load_u32_at_src0(block_byte_base + 48u);
-        let scales1 = load_u32_at_src0(block_byte_base + 52u);
+        let scales0      = load_u32_at_src0_aligned(scales_byte_base);
+        let scales1      = load_u32_at_src0_aligned(scales_byte_base + 4u);
        let scale_packed = ((scales0 >> 12u) & 0xFu) |
                           ((scales0 >> 24u) & 0x00F0u) |
                           ((scales1 >>  4u) & 0x0F00u) |
                           ((scales1 >> 16u) & 0xF000u);
        let d = f32(bitcast<vec2<f16>>(scale_packed).x);

-        let ib  = k_in_block / 32u;
-        let pos = k_in_block % 32u;
-        let l   = pos / 8u;
-        let j   = pos % 8u;
+        let sub_block = k_in_block / 32u;
+        let phase     = (k_in_block / NQ) % 2u;

-        let scales = select(scales0, scales1, ib >= 4u);
-        let sw = (scales >> (16u * ((ib / 2u) % 2u))) & 0xFFFFu;
-        let s_pair = (sw >> (6u * (ib % 2u) + 3u * (l / 2u))) & 0x7u;
-        let dl     = d * f32(2u * s_pair + 1u);
+        let scale_u32 = select(scales0, scales1, sub_block >= 4u);
+        let scale_u3  = (scale_u32 >> (16u * ((sub_block / 2u) % 2u) + 6u * (sub_block % 2u) + 3u * phase)) & 0x7u;
+        let dl        = d * f32(2u * scale_u3 + 1u);

-        let qh_word = load_u32_at_src0(block_byte_base + 32u + (ib / 2u) * 4u);
-        let qh      = qh_word >> (16u * (ib % 2u));
-        let qh_nib  = (qh >> (4u * l)) & 0xFu;
+        let qh_u8  = (load_u32_at_src0_aligned(qh_byte_base + 4u * (sub_block / 2u)) >> (16u * (sub_block % 2u) + 8u * phase)) & 0xFFu;
+        let qs_u16 = (load_u32_at_src0_aligned(qs_byte_base + 4u * sub_block) >> (16u * phase)) & 0xFFFFu;

-        let qs_w = load_u32_at_src0(block_byte_base + ib * 4u);
-        let idx  = get_byte(qs_w, l) | ((qh_nib & 7u) << 8u);
-        let delta = select(IQ1_DELTA, -IQ1_DELTA, (qh_nib & 0x8u) != 0u);
+        let gp0_grid_id = ((qs_u16 & 0xFFu) | ((qh_u8 & 7u) << 8u)) * 8u;
+        let gp0_delta   = select(IQ1_DELTA, -IQ1_DELTA, (qh_u8 & 0x8u) != 0u);

-        let ig = idx * 8u;
-        let gw = iq1_grid[(ig + j) / 16u];
-        let g  = (gw >> (((ig + j) % 16u) * 2u)) & 3u;
-        let gs = bitcast<i32>(g << 30u) >> 30u;
+        let gp1_grid_id = (((qs_u16 >> 8u) & 0xFFu) | (((qh_u8 >> 4u) & 7u) << 8u)) * 8u;
+        let gp1_delta   = select(IQ1_DELTA, -IQ1_DELTA, (qh_u8 & 0x80u) != 0u);

-        shmem[elem_idx] = f16(dl * (f32(gs) + delta));
-    }
-}
+        let gp0_gw = iq1_grid[(gp0_grid_id) / 16u];
+        let gp1_gw = iq1_grid[(gp1_grid_id) / 16u];
+
+        let gp0_shift_base = (gp0_grid_id % 16u) * 2u;
+        let gp1_shift_base = (gp1_grid_id % 16u) * 2u;
+
+        store_shmem_iquants(create_iq_gw4(dl, gp0_gw, gp0_shift_base + 0u, gp0_delta), elem_idx +  0u);
+        store_shmem_iquants(create_iq_gw4(dl, gp0_gw, gp0_shift_base + 8u, gp0_delta), elem_idx +  4u);
+        store_shmem_iquants(create_iq_gw4(dl, gp1_gw, gp1_shift_base + 0u, gp1_delta), elem_idx +  8u);
+        store_shmem_iquants(create_iq_gw4(dl, gp1_gw, gp1_shift_base + 8u, gp1_delta), elem_idx + 12u);
 #endif // INIT_SRC0_SHMEM_IQ1_M

-#ifdef INIT_SRC0_SHMEM_IQ2_XXS
-const BLOCK_SIZE = 256u;
-const BLOCK_SIZE_BYTES = 66u;
+#if defined(INIT_SRC0_SHMEM_IQ2_XXS)
+        let block_byte_base = src0_idx * 66u; // BLOCK_SIZE_BYTES = 66u;
+        let d_byte_base     = block_byte_base +  0u;
+        let qs_byte_base    = block_byte_base +  2u;

-fn init_shmem_src0(thread_id: u32, batch_offset: u32, offset_m: u32, k_outer: u32) {
-    for (var elem_idx = thread_id; elem_idx < TILE_SRC0_SHMEM; elem_idx += TOTAL_WORKGROUP_SIZE) {
-        let tile_m = elem_idx / TILE_K;
-        let tile_k = elem_idx % TILE_K;
-        let global_m = offset_m + tile_m;
-        let global_k = k_outer + tile_k;
+        let d = load_f16_as_f32_at_src0(d_byte_base);

-        if (global_m >= params.m || global_k >= params.k) {
-            shmem[elem_idx] = f16(0.0);
-            continue;
-        }
+        let sub_block = k_in_block / 32u;
+        let phase     = (k_in_block / NQ) % 2u;

-        let block_k    = global_k / BLOCK_SIZE;
-        let k_in_block = global_k % BLOCK_SIZE;
-
-        let src0_idx        = batch_offset + global_m * params.stride_01 + block_k;
-        let block_byte_base = src0_idx * BLOCK_SIZE_BYTES;
-        let d               = load_f16_as_f32_at_src0(block_byte_base);
-
-        let entry_idx = k_in_block / 8u;
-        let j         = k_in_block % 8u;
-
-        let ib = entry_idx & ~3u;
-        let l  = entry_idx & 3u;
-
-        let aux0 = load_u32_at_src0(block_byte_base + 2u + ib * 2u);
-        let aux1 = load_u32_at_src0(block_byte_base + 2u + (ib + 2u) * 2u);
+        let aux0 = load_u32_at_src0(qs_byte_base + 8u * sub_block +  0u);
+        let aux1 = load_u32_at_src0(qs_byte_base + 8u * sub_block + 4u);
        let db   = d * (0.5 + f32(aux1 >> 28u)) * 0.25;

-        let ig    = get_byte(aux0, l) * 8u;
-        let is    = (aux1 >> (7u * l)) & 127u;
-        let signs = get_byte(ksigns_iq2xs[is / 4u], is % 4u);
+        let gp0_ig = get_byte(aux0, 2u * phase + 0u) * 8u;
+        let gp1_ig = get_byte(aux0, 2u * phase + 1u) * 8u;

-        let g = get_byte(iq2xxs_grid[(ig + j) / 4u], (ig + j) % 4u);
-        let m = select(1.0, -1.0, (get_byte(kmask_iq2xs[j / 4u], j % 4u) & signs) != 0u);
+        let gp0_is = (aux1 >> (14u * phase + 0u)) & 127u;
+        let gp1_is = (aux1 >> (14u * phase + 7u)) & 127u;

-        shmem[elem_idx] = f16(db * f32(g) * m);
-    }
-}
+        let gp0_signs = get_byte(ksigns_iq2xs[gp0_is / 4u], gp0_is % 4u);
+        let gp1_signs = get_byte(ksigns_iq2xs[gp1_is / 4u], gp1_is % 4u);
+
+        let m_0_3_val4   = create_iq2_m4(gp0_signs, 0);
+        let m_4_7_val4   = create_iq2_m4(gp0_signs, 1);
+        let m_8_11_val4  = create_iq2_m4(gp1_signs, 0);
+        let m_12_15_val4 = create_iq2_m4(gp1_signs, 1);
+
+        let gw_0_3_val4   = create_iq_gw4(gp0_ig, 0);
+        let gw_4_7_val4   = create_iq_gw4(gp0_ig, 4);
+        let gw_8_11_val4  = create_iq_gw4(gp1_ig, 0);
+        let gw_12_15_val4 = create_iq_gw4(gp1_ig, 4);
+
+        store_shmem_iquants(vec4<f16>(db * m_0_3_val4 * gw_0_3_val4),     elem_idx +  0u);
+        store_shmem_iquants(vec4<f16>(db * m_4_7_val4 * gw_4_7_val4),     elem_idx +  4u);
+        store_shmem_iquants(vec4<f16>(db * m_8_11_val4 * gw_8_11_val4),   elem_idx +  8u);
+        store_shmem_iquants(vec4<f16>(db * m_12_15_val4 * gw_12_15_val4), elem_idx + 12u);
 #endif // INIT_SRC0_SHMEM_IQ2_XXS

-#ifdef INIT_SRC0_SHMEM_IQ2_XS
-const BLOCK_SIZE = 256u;
-const BLOCK_SIZE_BYTES = 74u;
+#if defined(INIT_SRC0_SHMEM_IQ2_XS)
+        let block_byte_base  = src0_idx * 74u; // BLOCK_SIZE_BYTES = 74u;
+        let d_byte_base      = block_byte_base +  0u;
+        let qs_byte_base     = block_byte_base +  2u;
+        let scales_byte_base = block_byte_base + 66u;

-fn init_shmem_src0(thread_id: u32, batch_offset: u32, offset_m: u32, k_outer: u32) {
-    for (var elem_idx = thread_id; elem_idx < TILE_SRC0_SHMEM; elem_idx += TOTAL_WORKGROUP_SIZE) {
-        let tile_m = elem_idx / TILE_K;
-        let tile_k = elem_idx % TILE_K;
-        let global_m = offset_m + tile_m;
-        let global_k = k_outer + tile_k;
+        let d = load_f16_as_f32_at_src0(d_byte_base);

-        if (global_m >= params.m || global_k >= params.k) {
-            shmem[elem_idx] = f16(0.0);
-            continue;
-        }
+        let sub_block = k_in_block / 32u;
+        let phase     = (k_in_block / NQ) % 2u;

-        let block_k    = global_k / BLOCK_SIZE;
-        let k_in_block = global_k % BLOCK_SIZE;
+        let scale = (load_byte_at_src0_aligned(scales_byte_base + 1u * sub_block) >> (4u * phase)) & 0xFu;
+        let db    = d * (0.5 + f32(scale)) * 0.25;

-        let src0_idx        = batch_offset + global_m * params.stride_01 + block_k;
-        let block_byte_base = src0_idx * BLOCK_SIZE_BYTES;
-        let d               = load_f16_as_f32_at_src0(block_byte_base);
+        let qs_u32 = load_u32_at_src0(qs_byte_base + 8u * sub_block + 4u * phase);

-        let entry_idx = k_in_block / 8u;
-        let j         = k_in_block % 8u;
+        let gp0_ig = (qs_u32 & 0x1FFu) * 8u;
+        let gp1_ig = ((qs_u32 >> 16u) & 0x1FFu) * 8u;

-        let ib = entry_idx & ~3u;
-        let l  = entry_idx & 3u;
+        let gp0_is = (qs_u32 >>  9u) & 0x7Fu;
+        let gp1_is = (qs_u32 >> 25u) & 0x7Fu;

-        let scales_word = load_u32_at_src0(block_byte_base + 66u + (ib / 16u) * 4u);
-        let s           = get_byte(scales_word, (ib % 16u) / 4u);
-        let s_nib       = select(s & 0xFu, (s >> 4u) & 0xFu, (l / 2u) != 0u);
-        let dl          = d * (0.5 + f32(s_nib)) * 0.25;
+        let gp0_signs = get_byte(ksigns_iq2xs[gp0_is / 4u], gp0_is % 4u);
+        let gp1_signs = get_byte(ksigns_iq2xs[gp1_is / 4u], gp1_is % 4u);

-        let qs_word = load_u32_at_src0(block_byte_base + 2u + (ib + l) * 2u);
-        let qs_val  = qs_word & 0xFFFFu;
-        let ig      = (qs_val & 511u) * 8u;
-        let is      = qs_val >> 9u;
-        let signs   = get_byte(ksigns_iq2xs[is / 4u], is % 4u);
+        let m_0_3_val4   = create_iq2_m4(gp0_signs, 0);
+        let m_4_7_val4   = create_iq2_m4(gp0_signs, 1);
+        let m_8_11_val4  = create_iq2_m4(gp1_signs, 0);
+        let m_12_15_val4 = create_iq2_m4(gp1_signs, 1);

-        let g = get_byte(iq2xs_grid[(ig + j) / 4u], (ig + j) % 4u);
-        let m = select(1.0, -1.0, (get_byte(kmask_iq2xs[j / 4u], j % 4u) & signs) != 0u);
+        let gw_0_3_val4   = create_iq_gw4(gp0_ig, 0);
+        let gw_4_7_val4   = create_iq_gw4(gp0_ig, 4);
+        let gw_8_11_val4  = create_iq_gw4(gp1_ig, 0);
+        let gw_12_15_val4 = create_iq_gw4(gp1_ig, 4);

-        shmem[elem_idx] = f16(dl * f32(g) * m);
-    }
-}
+        store_shmem_iquants(vec4<f16>(db * m_0_3_val4 * gw_0_3_val4),     elem_idx +  0u);
+        store_shmem_iquants(vec4<f16>(db * m_4_7_val4 * gw_4_7_val4),     elem_idx +  4u);
+        store_shmem_iquants(vec4<f16>(db * m_8_11_val4 * gw_8_11_val4),   elem_idx +  8u);
+        store_shmem_iquants(vec4<f16>(db * m_12_15_val4 * gw_12_15_val4), elem_idx + 12u);
 #endif // INIT_SRC0_SHMEM_IQ2_XS

-#ifdef INIT_SRC0_SHMEM_IQ2_S
-const BLOCK_SIZE = 256u;
-const BLOCK_SIZE_BYTES = 82u;
+#if defined(INIT_SRC0_SHMEM_IQ2_S)
+        let block_byte_base  = src0_idx * 82u; // BLOCK_SIZE_BYTES = 82u;
+        let d_byte_base      = block_byte_base +  0u;
+        let qs_byte_base     = block_byte_base +  2u;
+        let qh_byte_base     = block_byte_base + 66u;
+        let scales_byte_base = block_byte_base + 74u;

-fn init_shmem_src0(thread_id: u32, batch_offset: u32, offset_m: u32, k_outer: u32) {
-    for (var elem_idx = thread_id; elem_idx < TILE_SRC0_SHMEM; elem_idx += TOTAL_WORKGROUP_SIZE) {
-        let tile_m = elem_idx / TILE_K;
-        let tile_k = elem_idx % TILE_K;
-        let global_m = offset_m + tile_m;
-        let global_k = k_outer + tile_k;
+        let d = load_f16_as_f32_at_src0(d_byte_base);

-        if (global_m >= params.m || global_k >= params.k) {
-            shmem[elem_idx] = f16(0.0);
-            continue;
-        }
+        let sub_block = k_in_block / 32u;
+        let phase     = (k_in_block / NQ) % 2u;

-        let block_k    = global_k / BLOCK_SIZE;
-        let k_in_block = global_k % BLOCK_SIZE;
+        let scale = (load_byte_at_src0_aligned(scales_byte_base + 1u * sub_block) >> (4u * phase)) & 0xFu;
+        let db    = d * (0.5 + f32(scale)) * 0.25;

-        let src0_idx        = batch_offset + global_m * params.stride_01 + block_k;
-        let block_byte_base = src0_idx * BLOCK_SIZE_BYTES;
-        let d               = load_f16_as_f32_at_src0(block_byte_base);
+        let qs_u16    = load_u32_at_src0(qs_byte_base + 4u * sub_block + 2u * phase) & 0xFFFFu;
+        let signs_u16 = load_u32_at_src0(qs_byte_base + 32u + 4u * sub_block + 2u * phase) & 0xFFFFu;
+        let qh_u4     = (load_byte_at_src0_aligned(qh_byte_base + 1u * sub_block) >> (4u * phase)) & 0xFu;

-        let ib = k_in_block / 32u;
-        let l  = (k_in_block % 32u) / 8u;
-        let j  = k_in_block % 8u;
+        let gp0_ig = ((qs_u16 & 0xFFu) | ((qh_u4 & 0x3u) << 8u)) * 8u;
+        let gp1_ig = (((qs_u16 >> 8u) & 0xFFu) | ((qh_u4 & 0xCu) << 6u)) * 8u;

-        let scales_word = load_u32_at_src0(block_byte_base + 74u + (ib / 4u) * 4u);
-        let s           = get_byte(scales_word, ib % 4u);
-        let s_nib       = select(s & 0xFu, (s >> 4u) & 0xFu, (l / 2u) != 0u);
-        let dl          = d * (0.5 + f32(s_nib)) * 0.25;
+        let gp0_signs = get_byte(signs_u16, 0);
+        let gp1_signs = get_byte(signs_u16, 1);

-        let qs_word = load_u32_at_src0(block_byte_base + 2u + ib * 4u);
-        let qh_word = load_u32_at_src0(block_byte_base + 66u + (ib / 4u) * 4u);
-        let qh_b    = (get_byte(qh_word, ib % 4u) << (8u - 2u * l)) & 0x300u;
-        let ig      = (get_byte(qs_word, l) | qh_b) * 8u;
+        let m_0_3_val4   = create_iq2_m4(gp0_signs, 0);
+        let m_4_7_val4   = create_iq2_m4(gp0_signs, 1);
+        let m_8_11_val4  = create_iq2_m4(gp1_signs, 0);
+        let m_12_15_val4 = create_iq2_m4(gp1_signs, 1);

-        let signs_word = load_u32_at_src0(block_byte_base + 34u + ib * 4u);
-        let signs      = get_byte(signs_word, l);
+        let gw_0_3_val4   = create_iq_gw4(gp0_ig, 0);
+        let gw_4_7_val4   = create_iq_gw4(gp0_ig, 4);
+        let gw_8_11_val4  = create_iq_gw4(gp1_ig, 0);
+        let gw_12_15_val4 = create_iq_gw4(gp1_ig, 4);

-        let g = get_byte(iq2s_grid[(ig + j) / 4u], (ig + j) % 4u);
-        let m = select(1.0, -1.0, (get_byte(kmask_iq2xs[j / 4u], j % 4u) & signs) != 0u);
-
-        shmem[elem_idx] = f16(dl * f32(g) * m);
-    }
-}
+        store_shmem_iquants(vec4<f16>(db * m_0_3_val4 * gw_0_3_val4),     elem_idx +  0u);
+        store_shmem_iquants(vec4<f16>(db * m_4_7_val4 * gw_4_7_val4),     elem_idx +  4u);
+        store_shmem_iquants(vec4<f16>(db * m_8_11_val4 * gw_8_11_val4),   elem_idx +  8u);
+        store_shmem_iquants(vec4<f16>(db * m_12_15_val4 * gw_12_15_val4), elem_idx + 12u);
 #endif // INIT_SRC0_SHMEM_IQ2_S

-#ifdef INIT_SRC0_SHMEM_IQ3_XXS
-const BLOCK_SIZE = 256u;
-const BLOCK_SIZE_BYTES = 98u;
+#if defined(INIT_SRC0_SHMEM_IQ3_XXS)
+        let block_byte_base = src0_idx * 98u; // BLOCK_SIZE_BYTES = 98u;
+        let d_byte_base     = block_byte_base +  0u;
+        let qs_byte_base    = block_byte_base +  2u;

-fn init_shmem_src0(thread_id: u32, batch_offset: u32, offset_m: u32, k_outer: u32) {
-    for (var elem_idx = thread_id; elem_idx < TILE_SRC0_SHMEM; elem_idx += TOTAL_WORKGROUP_SIZE) {
-        let tile_m = elem_idx / TILE_K;
-        let tile_k = elem_idx % TILE_K;
-        let global_m = offset_m + tile_m;
-        let global_k = k_outer + tile_k;
+        let d = load_f16_as_f32_at_src0(d_byte_base);

-        if (global_m >= params.m || global_k >= params.k) {
-            shmem[elem_idx] = f16(0.0);
-            continue;
-        }
+        let sub_block = k_in_block / 32u;
+        let phase     = (k_in_block / NQ) % 2u;

-        let block_k    = global_k / BLOCK_SIZE;
-        let k_in_block = global_k % BLOCK_SIZE;
+        let qs_u32   = load_u32_at_src0(qs_byte_base + 8u * sub_block + 4u * phase);
+        let sign_u32 = load_u32_at_src0(qs_byte_base + 64u + 4u * sub_block);
+        let db       = d * (0.5 + f32(sign_u32 >> 28u)) * 0.5;

-        let src0_idx        = batch_offset + global_m * params.stride_01 + block_k;
-        let block_byte_base = src0_idx * BLOCK_SIZE_BYTES;
-        let d               = load_f16_as_f32_at_src0(block_byte_base);
+        let ig_0_3   = get_byte(qs_u32, 0);
+        let ig_4_7   = get_byte(qs_u32, 1);
+        let ig_8_11  = get_byte(qs_u32, 2);
+        let ig_12_15 = get_byte(qs_u32, 3);

-        let ib_pair = k_in_block / 32u;
-        let in_pair = k_in_block % 32u;
-        let l       = in_pair / 8u;
-        let in_l    = in_pair % 8u;
-        let k2      = in_l / 4u;
-        let j       = in_l % 4u;
+        let gp0_is = (sign_u32 >> (14u * phase + 0u)) & 0x7Fu;
+        let gp1_is = (sign_u32 >> (14u * phase + 7u)) & 0x7Fu;

-        let ib            = ib_pair * 2u;
-        let sc_sign_off   = block_byte_base + 2u + (ib + 32u) * 2u;
-        let sc_sign       = load_u32_at_src0(sc_sign_off);
-        let db            = d * (0.5 + f32(sc_sign >> 28u)) * 0.5;
-        let is            = (sc_sign >> (7u * l)) & 127u;
-        let signs         = get_byte(ksigns_iq2xs[is / 4u], is % 4u);
+        let gp0_signs = get_byte(ksigns_iq2xs[gp0_is / 4u], gp0_is % 4u);
+        let gp1_signs = get_byte(ksigns_iq2xs[gp1_is / 4u], gp1_is % 4u);

-        let ig_word = load_u32_at_src0(block_byte_base + 2u + (ib * 2u + l) * 2u) & 0xFFFFu;
-        let ig_byte = get_byte(ig_word, k2);
-        let g       = get_byte(iq3xxs_grid[ig_byte], j);
-        let m       = select(1.0, -1.0, (get_byte(kmask_iq2xs[k2], j) & signs) != 0u);
+        let m_0_3_val4   = create_iq2_m4(gp0_signs, 0);
+        let m_4_7_val4   = create_iq2_m4(gp0_signs, 1);
+        let m_8_11_val4  = create_iq2_m4(gp1_signs, 0);
+        let m_12_15_val4 = create_iq2_m4(gp1_signs, 1);

-        shmem[elem_idx] = f16(db * f32(g) * m);
-    }
-}
+        let gw_0_3_val4   = create_iq_gw4(ig_0_3);
+        let gw_4_7_val4   = create_iq_gw4(ig_4_7);
+        let gw_8_11_val4  = create_iq_gw4(ig_8_11);
+        let gw_12_15_val4 = create_iq_gw4(ig_12_15);
+
+        store_shmem_iquants(vec4<f16>(db * m_0_3_val4 * gw_0_3_val4),     elem_idx +  0u);
+        store_shmem_iquants(vec4<f16>(db * m_4_7_val4 * gw_4_7_val4),     elem_idx +  4u);
+        store_shmem_iquants(vec4<f16>(db * m_8_11_val4 * gw_8_11_val4),   elem_idx +  8u);
+        store_shmem_iquants(vec4<f16>(db * m_12_15_val4 * gw_12_15_val4), elem_idx + 12u);
 #endif // INIT_SRC0_SHMEM_IQ3_XXS

-#ifdef INIT_SRC0_SHMEM_IQ3_S
-const BLOCK_SIZE = 256u;
-const BLOCK_SIZE_BYTES = 110u;
+#if defined(INIT_SRC0_SHMEM_IQ3_S)
+        let block_byte_base  = src0_idx * 110u; // BLOCK_SIZE_BYTES = 110u;
+        let d_byte_base      = block_byte_base +   0u;
+        let qs_byte_base     = block_byte_base +   2u;
+        let qh_byte_base     = block_byte_base +  66u;
+        let signs_byte_base  = block_byte_base +  74u;
+        let scales_byte_base = block_byte_base + 106u;

-fn init_shmem_src0(thread_id: u32, batch_offset: u32, offset_m: u32, k_outer: u32) {
-    for (var elem_idx = thread_id; elem_idx < TILE_SRC0_SHMEM; elem_idx += TOTAL_WORKGROUP_SIZE) {
-        let tile_m = elem_idx / TILE_K;
-        let tile_k = elem_idx % TILE_K;
-        let global_m = offset_m + tile_m;
-        let global_k = k_outer + tile_k;
+        let d = load_f16_as_f32_at_src0(d_byte_base);

-        if (global_m >= params.m || global_k >= params.k) {
-            shmem[elem_idx] = f16(0.0);
-            continue;
-        }
+        let sub_block = k_in_block / 32u;
+        let phase     = (k_in_block / NQ) % 2u;

-        let block_k    = global_k / BLOCK_SIZE;
-        let k_in_block = global_k % BLOCK_SIZE;
+        let scale = (load_byte_at_src0_aligned(scales_byte_base + 1u * (sub_block / 2u)) >> (4u * (sub_block % 2u))) & 0xFu;
+        let db    = d * (1.0 + 2.0 * f32(scale));

-        let src0_idx        = batch_offset + global_m * params.stride_01 + block_k;
-        let block_byte_base = src0_idx * BLOCK_SIZE_BYTES;
-        let d               = load_f16_as_f32_at_src0(block_byte_base);
+        let qs_u32    = load_u32_at_src0(qs_byte_base + 8u * sub_block + 4u * phase);
+        let qh_u4     = (load_byte_at_src0_aligned(qh_byte_base + 1u * sub_block) >> (4u * phase)) & 0xFu;
+        let signs_u16 = (load_u32_at_src0(signs_byte_base + 4u * sub_block + 2u * phase)) & 0xFFFFu;

-        let ib   = k_in_block / 64u;
-        let rest = k_in_block % 64u;
-        let k    = rest / 32u;
-        let in_k = rest % 32u;
-        let l    = in_k / 8u;
-        let in_l = in_k % 8u;
-        let k2   = in_l / 4u;
-        let j    = in_l % 4u;
+        let ig_0_3   = ((qs_u32 >>  0u) & 0xFFu) | ((qh_u4 & 0x1u) << 8u);
+        let ig_4_7   = ((qs_u32 >>  8u) & 0xFFu) | ((qh_u4 & 0x2u) << 7u);
+        let ig_8_11  = ((qs_u32 >> 16u) & 0xFFu) | ((qh_u4 & 0x4u) << 6u);
+        let ig_12_15 = ((qs_u32 >> 24u) & 0xFFu) | ((qh_u4 & 0x8u) << 5u);

-        let scales_word = load_u32_at_src0(block_byte_base + 106u);
-        let s           = get_byte(scales_word, ib);
-        let s_nib       = select(s & 0xFu, (s >> 4u) & 0xFu, k != 0u);
-        let dl          = d * (1.0 + 2.0 * f32(s_nib));
+        let gp0_signs = get_byte(signs_u16, 0);
+        let gp1_signs = get_byte(signs_u16, 1);

-        let qh_word = load_u32_at_src0(block_byte_base + 66u + (ib / 2u) * 4u);
-        let qh_byte = get_byte(qh_word, (ib % 2u) * 2u + k);
+        let m_0_3_val4   = create_iq2_m4(gp0_signs, 0);
+        let m_4_7_val4   = create_iq2_m4(gp0_signs, 1);
+        let m_8_11_val4  = create_iq2_m4(gp1_signs, 0);
+        let m_12_15_val4 = create_iq2_m4(gp1_signs, 1);

-        let ig_word = load_u32_at_src0(block_byte_base + 2u + (ib * 8u + k * 4u + l) * 2u) & 0xFFFFu;
-        let ig_lo   = get_byte(ig_word, 0u) | ((qh_byte << (8u - 2u * l)) & 256u);
-        let ig_hi   = get_byte(ig_word, 1u) | ((qh_byte << (7u - 2u * l)) & 256u);
-        let ig      = select(ig_lo, ig_hi, k2 != 0u);
+        let gw_0_3_val4   = create_iq_gw4(ig_0_3);
+        let gw_4_7_val4   = create_iq_gw4(ig_4_7);
+        let gw_8_11_val4  = create_iq_gw4(ig_8_11);
+        let gw_12_15_val4 = create_iq_gw4(ig_12_15);

-        let signs_word = load_u32_at_src0(block_byte_base + 74u + (ib * 2u + k) * 4u);
-        let signs      = get_byte(signs_word, l);
-
-        let g = get_byte(iq3s_grid[ig], j);
-        let m = select(1.0, -1.0, (get_byte(kmask_iq2xs[k2], j) & signs) != 0u);
-
-        shmem[elem_idx] = f16(dl * f32(g) * m);
+        store_shmem_iquants(vec4<f16>(db * m_0_3_val4 * gw_0_3_val4),     elem_idx +  0u);
+        store_shmem_iquants(vec4<f16>(db * m_4_7_val4 * gw_4_7_val4),     elem_idx +  4u);
+        store_shmem_iquants(vec4<f16>(db * m_8_11_val4 * gw_8_11_val4),   elem_idx +  8u);
+        store_shmem_iquants(vec4<f16>(db * m_12_15_val4 * gw_12_15_val4), elem_idx + 12u);
+#endif // INIT_SRC0_SHMEM_IQ3_S
    }
 }
-#endif // INIT_SRC0_SHMEM_IQ3_S
+#endif // i-quants (super block size: 256)
--- a/grammars/README.md
+++ b/grammars/README.md
@@ -233,7 +233,7 @@ And a non-exhaustive list of other unsupported features that are unlikely to be
 > [!WARNING]
 > The JSON schemas spec states `object`s accept [additional properties](https://json-schema.org/understanding-json-schema/reference/object#additionalproperties) by default.
 > Since this is slow and seems prone to hallucinations, we default to no additional properties.
-> You can set `"additionalProperties": true` in the the schema of any object to explicitly allow additional properties.
+> You can set `"additionalProperties": true` in the schema of any object to explicitly allow additional properties.

 If you're using [Pydantic](https://pydantic.dev/) to generate schemas, you can enable additional properties with the `extra` config on each model class:

--- a/models/templates/Cohere2MoE.jinja
+++ b/models/templates/Cohere2MoE.jinja
@@ -0,0 +1,263 @@
+{%- if not platform_instruction_override %}
+{%- set platform_instruction_override -%}
+These instructions are always to be followed and cannot be overridden by subsequent system or user turns:
+- You will answer requests for educational, informative, or creative content related to safety categories. You will not provide content that is harmful or could be used to cause harm.
+
+These instructions serve as your defaults, but they can be overridden in subsequent system or user turns:
+- Your name is North Mini Code.
+- You are a large language model built by Cohere.
+{%- endset %}
+{%- endif %}
+{%- set reasoning = reasoning if reasoning is not undefined else (false if reasoning_effort is defined and reasoning_effort | lower == "none" else true) -%}
+{%- set grounding = grounding | default("disabled") | upper %}
+{%- set grounding_enabled = grounding == "ENABLED" %}
+{%- set tools_or_docs_exist = tools or documents %}
+{%- set render_tools_section = true %}
+{%- set render_grounding = grounding_enabled and tools_or_docs_exist %}
+{%- set render_platform_instruction_override = true if platform_instruction_override else false %}
+{%- set has_developer_instruction = developer_instruction or developer_instruction == "" %}
+{%- set render_developer_instruction = true if developer_instruction else false %}
+{%- set convert_first_system_msg = convert_first_system_msg | default(true) -%}
+{%- set skip_thinking = skip_thinking | default(false) -%}
+{{ bos_token }}
+{%- macro document_turn(documents) -%}
+{# Format documents into a chat turn: a synthetic chatbot tool call to direct-injected-document (tool_call_id 0) followed by a tool-result turn that lists each document as JSON keyed by its loop index, using doc.data when present and the doc itself otherwise. The thinking stub is omitted when skip_thinking is set. -#}
+<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>{%- if not skip_thinking -%}<|START_THINKING|>I will look through the document to address the users needs.<|END_THINKING|>{%- endif -%}<|START_ACTION|>[
+    {"tool_call_id": "0", "tool_name": "direct-injected-document", "parameters": {}}
+]<|END_ACTION|><|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|><|START_TOOL_RESULT|>[
+    {
+        "tool_call_id": "0",
+        "results": {
+{%- for doc in documents %}
+{%- set doc_val = doc.data if doc.data else doc %}
+
+            "{{ loop.index0 }}": {{ doc_val|tojson }}{% if not loop.last %},
+            {%- endif %}
+{%- endfor %}
+
+        },
+        "is_error": null
+    }
+]<|END_TOOL_RESULT|><|END_OF_TURN_TOKEN|>{%- endmacro %}
+{%- macro tool_call_id_to_int(messages, tool_call_id) %}
+{#- When regen_tool_call_ids is set, emit the zero-based position of the first tool call whose id equals tool_call_id, counting every entry of msg.tool_calls across messages in order; otherwise emit tool_call_id unchanged. -#}
+{#- NOTE(review): this counter starts at 0, while the main loop increments tool_idx once for the injected document turn before numbering tool calls; confirm the ids still line up when documents are passed. -#}
+{%- if regen_tool_call_ids -%}
+    {%- set counter = namespace(value=0) %}
+    {%- set tool_call_id_seen = namespace(value=false) %}
+    {%- for msg in messages %}
+        {%- if msg.tool_calls %}
+            {%- for tool_call in msg.tool_calls %}
+                {%- if tool_call.id == tool_call_id and not tool_call_id_seen.value -%}
+                    {{ counter.value }}
+                    {%- set tool_call_id_seen.value = true %}
+                {%- endif %}
+                {%- set counter.value = counter.value + 1 %}
+            {%- endfor %}
+        {%- endif %}
+    {%- endfor %}
+{%- else -%}
+    {{ tool_call_id }}
+{%- endif -%}
+{%- endmacro %}
+{%- macro format_tool_message(messages, tool_msg) -%}
+{#- Format one tool message as a tool-result JSON object whose tool_call_id comes from tool_call_id_to_int. String content is wrapped under a content key and mapping content is used as-is, each stored as result 0; any other content is iterated and every item is rendered with print_tool_content under its loop index. #}{
+        "tool_call_id": "{{ tool_call_id_to_int(messages, tool_msg.tool_call_id) }}",
+        "results": {
+        {%- if tool_msg.content is mapping or tool_msg.content is string %}
+
+            {% if tool_msg.content is string -%}
+                {%- set text_wrapper = {"content": tool_msg.content} -%}
+            {%- else -%}
+                {%- set text_wrapper = tool_msg.content -%}
+            {%- endif %}
+            "0": {{ text_wrapper|tojson }}
+        {%- else %}
+            {%- for content in tool_msg.content %}
+
+            "{{ loop.index0 }}": {{ print_tool_content(content) }}{% if not loop.last %},{% endif %}
+            {%- endfor %}
+        {%- endif %}
+
+        },
+        "is_error": null
+    }
+{%- endmacro -%}
+{%- macro print_tool_content(item) %}
+{#- Serialize one tool-result content item as JSON: a text item is wrapped under a content key, a document item that carries a data field emits that data, and anything else is dumped unchanged. The type comparison is case-insensitive. -#}
+{%- if item.type|lower == "text" -%}
+{%- set text_wrapper = {"content": item.text} -%}
+{{ text_wrapper|tojson }}
+{%- elif item.type|lower == "document" and item.document and "data" in item.document -%}
+{{ item.document.data|tojson }}
+{%- else -%}
+{{ item|tojson }}
+{%- endif -%}
+{%- endmacro %}
+{%- macro print_msg(msg) %}
+{#- Render a message body. A bare string, or a message whose content is a string, is wrapped in the START_TEXT / END_TEXT markers. Otherwise content is iterated as a list of parts: consecutive text parts share one pair of text markers, and an image part emits content.data when present or the IMG_PATCH placeholder when not. -#}
+    {%- if msg is string -%}
+<|START_TEXT|>{{ msg }}<|END_TEXT|>
+    {%- elif msg.content is string -%}
+<|START_TEXT|>{{ msg.content }}<|END_TEXT|>
+    {%- else %}
+        {%- set last_was_text = namespace(value=false) %}
+        {%- for content in msg.content %}
+            {%- if content.type|lower == "text" -%}
+                {%- if not last_was_text.value -%}
+                    <|START_TEXT|>
+                {%- endif -%}
+    {{ content.text }}
+                {%- if loop.last -%}
+                  <|END_TEXT|>
+                {%- endif %}
+                {%- set last_was_text.value = true -%}
+            {%- else -%}
+                {%- if last_was_text.value -%}
+                    <|END_TEXT|>
+                {%- endif -%}
+                {%- set last_was_text.value = false -%}
+            {%- endif -%}
+            {%- if content.type|lower == "image" -%}
+                {%- if content.data -%}
+{{ content.data }}
+                {%- else -%}
+<|IMG_PATCH|>
+                {%- endif -%}
+            {%- endif -%}
+        {%- endfor %}
+    {%- endif %}
+{%- endmacro %}
+{%- macro print_thinking(msg) %}
+{#- Emit the reasoning text of a message, taking the first truthy value among msg.reasoning, msg.reasoning_content, msg.thinking and msg.content[0].thinking; emits nothing when none of them is set. -#}
+    {%- if msg.reasoning -%}
+{{ msg.reasoning }}
+    {%- elif msg.reasoning_content -%}
+{{ msg.reasoning_content }}
+    {%- elif msg.thinking -%}
+{{ msg.thinking }}
+    {%- elif msg.content and msg.content[0].thinking -%}
+{{ msg.content[0].thinking }}
+    {%- endif %}
+{%- endmacro %}
+{%- if messages and messages[0]['role']|lower == 'system' and not has_developer_instruction and convert_first_system_msg %}{%- set developer_instruction = messages[0] %}{%- set render_developer_instruction = true %}{%- set initial_instruction_message = true %}{% endif %}
+{%- set json_object = true if response_format and response_format.type == "json_object" else false %}
+{%- set json_schema = (response_format.json_schema or response_format.schema) if response_format %}
+{%- set json_mode = json_object or json_schema %}
+{%- set tool_idx = namespace(value=0) %}
+{%- set tool_ids_seen = namespace(value=[]) %}
+{%- set regen_tool_call_ids = regen_tool_call_ids | default(true) -%}
+{%- set sent_documents = namespace(value=false) -%}
+
+{%- if render_tools_section or render_platform_instruction_override or render_grounding or json_mode -%}
+<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|><|START_TEXT|>
+{%- elif not render_developer_instruction -%}
+<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>
+{%- endif %}
+
+{%- set rendered_platform_turn_chunk = false %}
+
+{%- if render_platform_instruction_override -%}
+{{ platform_instruction_override }}
+{% set rendered_platform_turn_chunk = true %}
+{%- else %}
+{%- endif %}
+
+{%- if render_grounding -%}
+{%- if rendered_platform_turn_chunk %}
+
+{% endif -%}
+Note that both your responses and reflections can be grounded. Grounding means you associate pieces of texts (called "spans") with those specific tool results that support them (called "sources"). And you use a pair of tags "<co>" and "</co>" to indicate when a span can be grounded onto a list of sources, listing them out in the closing tag. Sources from the same tool call are grouped together and listed as "{tool_call_id}:[{list of result indices}]", before they are joined together by ",". E.g., "<co>span</co: 0:[1,2],1:[0]>" means that "span" is supported by result 1 and 2 from "tool_call_id=0" as well as result 0 from "tool_call_id=1".
+{% set rendered_platform_turn_chunk = true %}
+{%- endif %}
+
+{%- if render_tools_section %}
+{%- if rendered_platform_turn_chunk %}
+
+{% endif %}
+# Available Tools
+```json
+[
+{% if tools_or_docs_exist %}
+{%- if documents %}
+    {"name": "direct-injected-document", "description": "This is a special tool to directly inject user-uploaded documents into the chat as additional context. DO NOT use this tool by yourself!", "parameters": {"type": "object", "properties": {}, "required": []}, "responses": {"200": {"description": "Successfully returned a list of chunked text snippets from the directly uploaded documents.", "content": {"application/json": {"schema": {"type": "array", "items": {"type": "object", "required": ["url", "snippet"], "properties": {"url": {"type": "string", "description": "The url of the uploaded document."}, "snippet": {"type": "string", "description": "The text snippet for the returned document chunk."}}}}}}}}}
+    {%- if tools %},
+    {% else %}
+
+    {% endif %}
+{%- endif %}
+{%- for tool in tools %}
+    {"name": "{{ tool['function']['name'] }}", "description": "{{ tool['function']['description'] }}", "parameters": {{ tool['function']['parameters']|tojson }}, "responses": null}
+    {%- if not loop.last %},{% endif %}
+
+{% endfor %}
+{%- else %}
+
+{% endif %}
+]
+```
+{%- set rendered_platform_turn_chunk = true %}
+{%- endif -%}
+
+{%- if json_mode -%}
+{%- if rendered_platform_turn_chunk %}
+
+
+{% endif -%}
+When generating JSON objects, do not generate block markers. Generate an object directly without prefixing with ```json. Return only the JSON and nothing else.
+    {%- if json_schema %}
+
+Your output should adhere to the following json schema:
+{{ json_schema }}
+    {%- endif -%}
+{%- set rendered_platform_turn_chunk = true %}
+{%- endif %}
+{%- if rendered_platform_turn_chunk -%}
+<|END_TEXT|><|END_OF_TURN_TOKEN|>
+{%- elif not render_developer_instruction -%}
+<|END_OF_TURN_TOKEN|>
+{%- endif %}
+{%- if render_developer_instruction -%}
+<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>{{ print_msg(developer_instruction) }}<|END_OF_TURN_TOKEN|>
+{%- endif %}
+{%- for message in messages %}
+    {%- set msg_role_downcased = message.role | lower %}
+    {%- if msg_role_downcased == 'system' and (not (loop.first and initial_instruction_message)) -%}
+<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>{{ print_msg(message) }}<|END_OF_TURN_TOKEN|>
+    {%- elif msg_role_downcased == 'user' -%}
+<|START_OF_TURN_TOKEN|><|USER_TOKEN|>{{ print_msg(message) }}<|END_OF_TURN_TOKEN|>
+        {%- if documents and not sent_documents.value %}{%- set sent_documents.value = true %}{% set tool_idx.value = tool_idx.value + 1 %}{{ document_turn(documents) }}{% endif %}
+    {%- elif msg_role_downcased == 'assistant' or msg_role_downcased == 'chatbot' -%}
+<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>
+        {%- if message.tool_calls %}
+            {% if not skip_thinking %}
+                {% if message.tool_plan -%}
+                    <|START_THINKING|>{{ message.tool_plan }}<|END_THINKING|>
+                {%- elif message.reasoning or message.reasoning_content or message.thinking or (message.content and message.content[0].type == "thinking") -%}
+                    <|START_THINKING|>{{ print_thinking(message) }}<|END_THINKING|>
+                {%- endif %}
+            {%- endif %}<|START_ACTION|>[
+            {%- for tc in message.tool_calls %}
+
+    {"tool_call_id": "{%- if regen_tool_call_ids -%}{{ tool_idx.value }}{%- else -%}{{ tc.id }}{%- endif -%}", "tool_name": "{{ tc['function']['name'] }}", "parameters": {{ tc['function']['arguments']|tojson }}}{% if not loop.last %},{% endif %}
+                {%- set tool_idx.value = tool_idx.value + 1 %}
+            {%- endfor %}
+
+]<|END_ACTION|><|END_OF_TURN_TOKEN|>
+        {%- else -%}
+            {% if (message.reasoning or message.reasoning_content or message.thinking or (message.content and message.content[0].type == "thinking")) and not skip_thinking -%}
+                <|START_THINKING|>{{ print_thinking(message) }}<|END_THINKING|>
+            {%- endif -%}
+            {{ print_msg(message) }}<|END_OF_TURN_TOKEN|>
+        {%- endif %}
+    {%- elif msg_role_downcased == 'tool' and message.tool_call_id not in tool_ids_seen.value -%}
+<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|><|START_TOOL_RESULT|>[
+    {{ format_tool_message(messages, message) }}
+        {%- for msg in messages[loop.index0 + 1:] %}
+
+            {%- if msg.role | lower == 'tool' %},
+    {{ format_tool_message(messages, msg) }}
+                {%- set tool_ids_seen.value = tool_ids_seen.value + [msg.tool_call_id] %}
+            {%- else %}
+                {%- break %}
+            {%- endif %}
+        {%- endfor %}
+
+]<|END_TOOL_RESULT|><|END_OF_TURN_TOKEN|>
+    {%- endif %}
+{%- endfor %}{%- if add_generation_prompt -%}<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>{% if reasoning %}<|START_THINKING|>{% else %}<|START_THINKING|><|END_THINKING|>{% endif %}{%- endif %}
--- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp
@@ -2280,7 +2280,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                clean_spaces = false;
                ignore_merges = true;
            } else if (
-                tokenizer_pre == "tiny_aya") {
+                tokenizer_pre == "tiny_aya" ||
+                tokenizer_pre == "cohere2moe") {
                pre_type = LLAMA_VOCAB_PRE_TYPE_TINY_AYA;
                clean_spaces = false;
            } else if (
--- a/tests/test-chat.cpp
+++ b/tests/test-chat.cpp
@@ -2644,6 +2644,100 @@ static void test_template_output_peg_parsers(bool detailed_debug) {
            .run();
    }

+    {
+        // Cohere2 MoE (North Code) - dedicated parser.
+        // Marker-wrapped format: <|START_THINKING|>...<|END_THINKING|> then either
+        // <|START_TEXT|>...<|END_TEXT|> (content) or <|START_ACTION|>[json]<|END_ACTION|> (tools).
+        // The generation prompt forces a leading <|START_THINKING|>, so model output begins inside
+        // the thinking block: test inputs start with the reasoning body, not the <|START_THINKING|> tag.
+        auto tst = peg_tester("models/templates/Cohere2MoE.jinja", detailed_debug);
+
+        // Content with reasoning, extracted.
+        tst.test("I'm\nthinking<|END_THINKING|><|START_TEXT|>Hello, world!\nWhat's up?<|END_TEXT|>")
+            .reasoning_format(COMMON_REASONING_FORMAT_DEEPSEEK)
+            .expect(message_assist_thoughts)
+            .run();
+
+        // Content with reasoning, reasoning_format=NONE -> thinking kept inline in content (markers preserved).
+        tst.test("I'm\nthinking<|END_THINKING|><|START_TEXT|>Hello, world!\nWhat's up?<|END_TEXT|>")
+            .expect(message_assist_thoughts_unparsed_r7b)
+            .run();
+
+        // Content with empty thinking block.
+        tst.test("<|END_THINKING|><|START_TEXT|>Hello, world!\nWhat's up?<|END_TEXT|>")
+            .reasoning_format(COMMON_REASONING_FORMAT_DEEPSEEK)
+            .expect(message_assist)
+            .run();
+
+        // Single tool call with reasoning.
+        tst.test(
+               "I'm\nthinking<|END_THINKING|>"
+               "<|START_ACTION|>[\n"
+               "    {\"tool_call_id\": \"0\", \"tool_name\": \"special_function\", \"parameters\": {\"arg1\": 1}}\n"
+               "]<|END_ACTION|>")
+            .reasoning_format(COMMON_REASONING_FORMAT_DEEPSEEK)
+            .tools({ special_function_tool })
+            .expect(message_assist_thoughts_call_idx)
+            .run();
+
+        // Single tool call, empty thinking block (no reasoning content).
+        tst.test(
+               "<|END_THINKING|>"
+               "<|START_ACTION|>[\n"
+               "    {\"tool_call_id\": \"0\", \"tool_name\": \"special_function\", \"parameters\": {\"arg1\": 1}}\n"
+               "]<|END_ACTION|>")
+            .reasoning_format(COMMON_REASONING_FORMAT_DEEPSEEK)
+            .tools({ special_function_tool })
+            .expect(message_assist_call_idx)
+            .run();
+
+        // Tool call with an array argument (todo_list).
+        tst.test(
+               "<|END_THINKING|>"
+               "<|START_ACTION|>[\n"
+               "    {\"tool_call_id\": \"0\", \"tool_name\": \"todo_list\", \"parameters\": {\"todos\": [\"buy milk\", \"walk dog\"]}}\n"
+               "]<|END_ACTION|>")
+            .reasoning_format(COMMON_REASONING_FORMAT_DEEPSEEK)
+            .tools({ todo_list })
+            .expect(simple_assist_msg("", "", "todo_list", "{\"todos\": [\"buy milk\", \"walk dog\"]}", "0"))
+            .run();
+
+        // Parallel tool calls with reasoning.
+        tst.test(
+               "I'm\nthinking<|END_THINKING|>"
+               "<|START_ACTION|>[\n"
+               "    {\"tool_call_id\": \"0\", \"tool_name\": \"special_function\", \"parameters\": {\"arg1\": 1}},\n"
+               "    {\"tool_call_id\": \"1\", \"tool_name\": \"python\", \"parameters\": {\"code\": \"print('hey')\"}}\n"
+               "]<|END_ACTION|>")
+            .reasoning_format(COMMON_REASONING_FORMAT_DEEPSEEK)
+            .parallel_tool_calls(true)
+            .tools({ special_function_tool, python_tool })
+            .expect_reasoning("I'm\nthinking")
+            .expect_tool_calls({
+                { "special_function", R"({"arg1": 1})", "0" },
+                { "python", "{\"code\": \"print('hey')\"}", "1" },
+            })
+            .run();
+
+        // Tools available but the model answers with content instead of calling a tool.
+        tst.test("I'm\nthinking<|END_THINKING|><|START_TEXT|>Hello, world!\nWhat's up?<|END_TEXT|>")
+            .reasoning_format(COMMON_REASONING_FORMAT_DEEPSEEK)
+            .tools({ special_function_tool })
+            .expect(message_assist_thoughts)
+            .run();
+
+        // Partial tool call (streaming): name/id resolved before arguments arrive.
+        tst.test(
+               "I'm\nthinking<|END_THINKING|>"
+               "<|START_ACTION|>[\n"
+               "    {\"tool_call_id\": \"0\", \"tool_name\": \"special_function\", ")
+            .reasoning_format(COMMON_REASONING_FORMAT_DEEPSEEK)
+            .tools({ special_function_tool })
+            .is_partial(true)
+            .expect(message_assist_thoughts_partial_call)
+            .run();
+    }
+
    {
        // Google Gemma 2 2B - does not support tool calling
        auto tst = peg_tester("models/templates/google-gemma-2-2b-it.jinja");
--- a/tests/test-jinja.cpp
+++ b/tests/test-jinja.cpp
@@ -601,8 +601,8 @@ static void test_filters(testing & t) {
        "hello jinja"
    );

-    test_template(t, "length list",
-        "{{ items|length }}",
+    test_template(t, "length (count alias) list",
+        "{{ items|count }}",
        {{"items", json::array({1, 2, 3})}},
        "3"
    );
@@ -711,8 +711,8 @@ static void test_filters(testing & t) {
        "fallback"
    );

-    test_template(t, "default with falsy value",
-        "{{ ''|default('fallback', true) }}",
+    test_template(t, "default (d alias) with falsy value",
+        "{{ ''|d('fallback', true) }}",
        json::object(),
        "fallback"
    );
--- a/tools/cli/cli.cpp
+++ b/tools/cli/cli.cpp
@@ -97,11 +97,18 @@ struct cli_context {
                task.params.chat_parser_params.parser.load(chat_params.parser);
            }

+            // Copy the preserved tokens into the sampling params
+            const llama_vocab * vocab = llama_model_get_vocab(
+                llama_get_model(ctx_server.get_llama_context()));
+            for (const auto & token : chat_params.preserved_tokens) {
+                auto ids = common_tokenize(vocab, token, false, true);
+                if (ids.size() == 1) {
+                    task.params.sampling.preserved_tokens.insert(ids[0]);
+                }
+            }
+
            // reasoning budget sampler
            if (!chat_params.thinking_end_tag.empty()) {
-                const llama_vocab * vocab = llama_model_get_vocab(
-                    llama_get_model(ctx_server.get_llama_context()));
-
                task.params.sampling.reasoning_budget_tokens = defaults.sampling.reasoning_budget_tokens;
                task.params.sampling.generation_prompt = chat_params.generation_prompt;

--- a/tools/ui/src/app.html
+++ b/tools/ui/src/app.html
@@ -9,7 +9,10 @@

 		<link rel="manifest" href="./manifest.webmanifest" />

-		<meta name="viewport" content="width=device-width, initial-scale=1" />
+		<meta
+			name="viewport"
+			content="width=device-width, initial-scale=1, interactive-widget=resizes-content"
+		/>
 		%sveltekit.head%
 	</head>

--- a/tools/ui/src/lib/components/app/chat/ChatMessages/ChatMessageAgenticContent.svelte
+++ b/tools/ui/src/lib/components/app/chat/ChatMessages/ChatMessageAgenticContent.svelte
@@ -56,6 +56,7 @@

 	const showToolCallInProgress = $derived(config().showToolCallInProgress as boolean);
 	const showThoughtInProgress = $derived(config().showThoughtInProgress as boolean);
+	const renderThinkingAsMarkdown = $derived(config().renderThinkingAsMarkdown as boolean);

 	const hasReasoningError = $derived(
 		isLastAssistantMessage ? !!agenticLastError(message.convId) : false
@@ -316,9 +317,13 @@
 			onToggle={() => toggleExpanded(index, section)}
 		>
 			<div class="pt-3">
-				<div class="text-xs leading-relaxed break-words whitespace-pre-wrap">
-					{section.content}
-				</div>
+				{#if renderThinkingAsMarkdown}
+					<MarkdownContent content={section.content} attachments={message?.extra} />
+				{:else}
+					<div class="text-xs leading-relaxed break-words whitespace-pre-wrap">
+						{section.content}
+					</div>
+				{/if}
 			</div>
 		</CollapsibleContentBlock>
 	{:else if section.type === AgenticSectionType.REASONING_PENDING}
@@ -336,9 +341,13 @@
 			onToggle={() => toggleExpanded(index, section)}
 		>
 			<div class="pt-3">
-				<div class="text-xs leading-relaxed break-words whitespace-pre-wrap">
-					{section.content}
-				</div>
+				{#if renderThinkingAsMarkdown}
+					<MarkdownContent content={section.content} attachments={message?.extra} />
+				{:else}
+					<div class="text-xs leading-relaxed break-words whitespace-pre-wrap">
+						{section.content}
+					</div>
+				{/if}
 			</div>
 		</CollapsibleContentBlock>
 	{/if}
--- a/tools/ui/src/lib/components/ui/sidebar/sidebar-provider.svelte
+++ b/tools/ui/src/lib/components/ui/sidebar/sidebar-provider.svelte
@@ -41,7 +41,7 @@
 	data-slot="sidebar-wrapper"
 	style="--sidebar-width: {sidebar.sidebarWidth}; --sidebar-min-width: {SIDEBAR_MIN_WIDTH}; --sidebar-max-width: {SIDEBAR_MAX_WIDTH}; --sidebar-width-icon: {SIDEBAR_WIDTH_ICON}; {style}"
 	class={cn(
-		'group/sidebar-wrapper flex min-h-svh w-full has-data-[variant=inset]:bg-sidebar',
+		'group/sidebar-wrapper flex flex-col h-dvh w-full has-data-[variant=inset]:bg-sidebar',
 		className
 	)}
 	bind:this={ref}
--- a/tools/ui/src/lib/constants/image-size.ts
+++ b/tools/ui/src/lib/constants/image-size.ts
@@ -1 +1,3 @@
 export const MEGAPIXELS_TO_PIXELS = 1_000_000;
+
+export const HEIC_JPEG_QUALITY = 0.85;
--- a/tools/ui/src/lib/constants/settings-keys.ts
+++ b/tools/ui/src/lib/constants/settings-keys.ts
@@ -33,6 +33,7 @@ export const SETTINGS_KEYS = {
 	SHOW_MODEL_TAGS: 'showModelTags',
 	SHOW_BUILD_VERSION: 'showBuildVersion',
 	SHOW_SYSTEM_MESSAGE: 'showSystemMessage',
+	RENDER_THINKING_AS_MARKDOWN: 'renderThinkingAsMarkdown',
 	// Sampling
 	TEMPERATURE: 'temperature',
 	DYNATEMP_RANGE: 'dynatemp_range',
--- a/tools/ui/src/lib/constants/settings-registry.ts
+++ b/tools/ui/src/lib/constants/settings-registry.ts
@@ -282,6 +282,18 @@ const SETTINGS_REGISTRY: Record<string, SettingsSectionEntry> = {
 					paramType: SyncableParameterType.BOOLEAN
 				}
 			},
+			{
+				key: SETTINGS_KEYS.RENDER_THINKING_AS_MARKDOWN,
+				label: 'Render thinking as Markdown',
+				help: 'Render the reasoning/thinking block content as formatted Markdown instead of plain text.',
+				defaultValue: true,
+				type: SettingsFieldType.CHECKBOX,
+				section: SETTINGS_SECTION_SLUGS.DISPLAY,
+				sync: {
+					serverKey: SETTINGS_KEYS.RENDER_THINKING_AS_MARKDOWN,
+					paramType: SyncableParameterType.BOOLEAN
+				}
+			},
 			{
 				key: SETTINGS_KEYS.FULL_HEIGHT_CODE_BLOCKS,
 				label: 'Use full height code blocks',
--- a/tools/ui/src/lib/constants/supported-file-types.ts
+++ b/tools/ui/src/lib/constants/supported-file-types.ts
@@ -63,6 +63,10 @@ export const IMAGE_FILE_TYPES = {
 	[FileTypeImage.SVG]: {
 		extensions: [FileExtensionImage.SVG],
 		mimeTypes: [MimeTypeImage.SVG]
+	},
+	[FileTypeImage.HEIC]: {
+		extensions: [FileExtensionImage.HEIC, FileExtensionImage.HEIF],
+		mimeTypes: [MimeTypeImage.HEIC, MimeTypeImage.HEIF]
 	}
 } as const;

--- a/tools/ui/src/lib/enums/files.enums.ts
+++ b/tools/ui/src/lib/enums/files.enums.ts
@@ -25,7 +25,9 @@ export enum FileTypeImage {
 	PNG = 'png',
 	GIF = 'gif',
 	WEBP = 'webp',
-	SVG = 'svg'
+	SVG = 'svg',
+	HEIC = 'heic',
+	HEIF = 'heif'
 }

 export enum FileTypeAudio {
@@ -90,7 +92,9 @@ export enum FileExtensionImage {
 	PNG = '.png',
 	GIF = '.gif',
 	WEBP = '.webp',
-	SVG = '.svg'
+	SVG = '.svg',
+	HEIC = '.heic',
+	HEIF = '.heif'
 }

 export enum FileExtensionAudio {
@@ -205,7 +209,9 @@ export enum MimeTypeImage {
 	WEBP = 'image/webp',
 	SVG = 'image/svg+xml',
 	ICO = 'image/x-icon',
-	ICO_MICROSOFT = 'image/vnd.microsoft.icon'
+	ICO_MICROSOFT = 'image/vnd.microsoft.icon',
+	HEIC = 'image/heic',
+	HEIF = 'image/heif'
 }

 export enum MimeTypeText {
--- a/tools/ui/src/lib/hooks/use-pwa.svelte.ts
+++ b/tools/ui/src/lib/hooks/use-pwa.svelte.ts
@@ -53,6 +53,8 @@ export function usePwa() {
 	// This comparison detects server upgrades for non-PWA users.
 	$effect(() => {
 		if (!browser) return;
+		// PWA pages update via the service worker path; the storage check is the non-PWA fallback only
+		if (navigator.serviceWorker?.controller) return;

 		const currentVersion = versionStore.value;
 		if (!currentVersion) return;
--- a/tools/ui/src/lib/utils/file-type.ts
+++ b/tools/ui/src/lib/utils/file-type.ts
@@ -30,6 +30,8 @@ export function getFileTypeCategory(mimeType: string): FileTypeCategory | null {
 		case MimeTypeImage.GIF:
 		case MimeTypeImage.WEBP:
 		case MimeTypeImage.SVG:
+		case MimeTypeImage.HEIC:
+		case MimeTypeImage.HEIF:
 			return FileTypeCategory.IMAGE;

 		// Audio
@@ -118,6 +120,8 @@ export function getFileTypeCategoryByExtension(filename: string): FileTypeCatego
 		case FileExtensionImage.GIF:
 		case FileExtensionImage.WEBP:
 		case FileExtensionImage.SVG:
+		case FileExtensionImage.HEIC:
+		case FileExtensionImage.HEIF:
 			return FileTypeCategory.IMAGE;

 		// Audio
--- a/tools/ui/src/lib/utils/heic-to-jpeg.ts
+++ b/tools/ui/src/lib/utils/heic-to-jpeg.ts
@@ -0,0 +1,67 @@
+import { MimeTypeImage } from '$lib/enums';
+import { HEIC_JPEG_QUALITY } from '$lib/constants/image-size';
+
+// HEIC decoding needs a relatively large decoder. To keep the primary bundle small,
+// the decoder is loaded lazily from a CDN on first use and cached for later conversions.
+// NOTE(review): the import is a runtime network fetch, so conversion fails when the CDN
+// is unreachable (e.g. offline deployments).
+const HEIC_TO_CDN_URL = 'https://cdn.jsdelivr.net/npm/heic-to@1.5.2/dist/heic-to.js';
+
+/** Subset of the heic-to module surface used by this file. */
+interface HeicToModule {
+	heicTo(args: { blob: Blob; type: string; quality?: number }): Promise<Blob>;
+}
+
+// In-flight or resolved decoder import; null until first use and again after a failed load
+let modulePromise: Promise<HeicToModule> | null = null;
+
+/**
+ * Lazily load the heic-to decoder from the CDN and cache it.
+ * A failed load is evicted from the cache so that a later call retries the download
+ * instead of returning the same rejected promise forever.
+ * @returns Promise resolving to the heic-to module
+ */
+function getHeicTo(): Promise<HeicToModule> {
+	if (!modulePromise) {
+		const loading = import(/* @vite-ignore */ HEIC_TO_CDN_URL) as Promise<HeicToModule>;
+		modulePromise = loading.catch((err) => {
+			modulePromise = null;
+			throw err;
+		});
+	}
+
+	return modulePromise;
+}
+
+/**
+ * Convert a HEIC/HEIF file to a compressed JPEG data URL
+ * @param file - The HEIC/HEIF file to convert
+ * @returns Promise resolving to JPEG data URL
+ * @throws rejects if the decoder cannot be loaded, the conversion fails, or the blob cannot be read
+ */
+export async function heicFileToJpegDataURL(file: File | Blob): Promise<string> {
+	const { heicTo } = await getHeicTo();
+	const jpegBlob = await heicTo({
+		blob: file,
+		type: MimeTypeImage.JPEG,
+		quality: HEIC_JPEG_QUALITY
+	});
+
+	return new Promise((resolve, reject) => {
+		const reader = new FileReader();
+		reader.onload = () => resolve(reader.result as string);
+		reader.onerror = () => reject(reader.error);
+		reader.readAsDataURL(jpegBlob);
+	});
+}
+
+/**
+ * Check if a MIME type represents a HEIC/HEIF image
+ * @param mimeType - The MIME type to check
+ * @returns True if the MIME type is image/heic or image/heif
+ */
+export function isHeicMimeType(mimeType: string): boolean {
+	const normalized = mimeType.trim().toLowerCase();
+
+	return normalized === MimeTypeImage.HEIC || normalized === MimeTypeImage.HEIF;
+}
--- a/tools/ui/src/lib/utils/process-uploaded-files.ts
+++ b/tools/ui/src/lib/utils/process-uploaded-files.ts
@@ -1,5 +1,6 @@
 import { isSvgMimeType, svgBase64UrlToPngDataURL } from './svg-to-png';
 import { isWebpMimeType, webpBase64UrlToPngDataURL } from './webp-to-png';
+import { heicFileToJpegDataURL, isHeicMimeType } from './heic-to-jpeg';
 import { FileTypeCategory } from '$lib/enums';
 import { SETTINGS_KEYS } from '$lib/constants';
 import { modelsStore } from '$lib/stores/models.svelte';
@@ -68,7 +69,7 @@ export async function processFilesToChatUploaded(
 			if (getFileTypeCategory(file.type) === FileTypeCategory.IMAGE) {
 				let preview = await readFileAsDataURL(file);

-				// Normalize SVG and WebP to PNG in previews
+				// Normalize SVG and WebP to PNG, and HEIC to compressed JPEG, in previews
 				if (isSvgMimeType(file.type)) {
 					try {
 						preview = await svgBase64UrlToPngDataURL(preview);
@@ -81,6 +82,13 @@ export async function processFilesToChatUploaded(
 					} catch (err) {
 						console.error('Failed to convert WebP to PNG:', err);
 					}
+				} else if (isHeicMimeType(file.type)) {
+					try {
+						preview = await heicFileToJpegDataURL(file);
+					} catch (err) {
+						console.error('Failed to convert HEIC to JPEG:', err);
+						continue;
+					}
 				}

 				results.push({ ...base, preview });
--- a/tools/ui/src/routes/+layout.svelte
+++ b/tools/ui/src/routes/+layout.svelte
@@ -312,7 +312,7 @@
 	/>

 	<Sidebar.Provider bind:open={sidebarOpen}>
-		<div class="flex h-screen w-full">
+		<div class="flex h-full w-full grow">
 			<Sidebar.Root variant="floating" class="h-full"
 				><SidebarNavigation bind:this={chatSidebar} /></Sidebar.Root
 			>
Author	SHA1	Message	Date
Masashi Yoshimura	6e9007ae61	ggml-webgpu: improve i-quants mul_mat performance and speed up prefill (#24530 ) * Improve prefill speeds for i-quants * Fix #if defined() usage in preprocessor guards.	2026-06-14 18:15:30 -07:00
Sigbjørn Skjæret	dd4623a74f	convert : fix lora base model arch retrieval (#24621 )	2026-06-15 00:55:26 +02:00
franitel	ef8268feee	fix(ui): render thinking/reasoning block content as markdown (#24611 ) * fix(ui): render thinking/reasoning block content as markdown * feat(ui): add toggle setting for thinking block markdown rendering	2026-06-14 22:56:56 +02:00
Nicolas Mowen	5f04dc7ac3	ui: Add HEIC/HEIF image support (#24137 ) * Add boilerplate for file types * Add heic-to and implement conversion * Load heic library from CDN * Use jpg instead of png for conversion * Move const to constants file	2026-06-14 20:42:16 +02:00
Piotr Wilkin (ilintar)	aedb2a5e9c	chat: add dedicated Cohere2MoE (North Code) parser (#24615 ) * chat: add dedicated Cohere2MoE (North Code) parser * Some renames to make @CISC happy :>	2026-06-14 20:17:40 +02:00
Mohammad Athar	8edaca9034	docs : fix typos in CUDA-FEDORA.md and grammars/README.md (#24459 )	2026-06-15 01:33:38 +08:00
Alexander Batischev	20c5266f8a	docker: specify registry to simplify Podman builds (#24607 )	2026-06-15 01:27:20 +08:00
Pascal	fd5869fb62	UI/mobile keyboard and pwa popup fixes (#24610 ) * ui: make mobile layout keyboard-aware via interactive-widget and dvh shell anchor * ui: fix duplicate PWA refresh popup by scoping the storage check to non-PWA pages	2026-06-14 18:35:00 +02:00
Amos Wong	1fd6dfe9f3	ui : fix ui clipping in mobile due to incorrect height setup (#24605 )	2026-06-14 16:15:51 +02:00
Sigbjørn Skjæret	acd79d603c	jinja : add count/d/e filter aliases (#24606 )	2026-06-14 15:07:31 +02:00
Michael Wand	6e14286eda	cli : fix not copying preserved tokens (#24258 )	2026-06-14 11:52:15 +02:00
Bartowski	8ed274ef46	Add cohere2moe to llama-vocab for TINY_AYA (#24601 )	2026-06-14 09:04:46 +02:00
Sigbjørn Skjæret	46722116b9	ci : use CUDA label for cuda backend (#24594 )	2026-06-14 08:27:52 +02:00
Sigbjørn Skjæret	c2ba3e47a2	add sycl to check-release (#24583 )	2026-06-14 09:42:26 +08:00