ggml-webgpu: improve i-quants mul_mat performance and speed up prefill (#24530)

* Improve prefill speeds for i-quants
* Fix #if defined() usage in preprocessor guards.
convert : fix lora base model arch retrieval (#24621)
2026-06-15 07:15:45 +02:00 · 2026-06-14 18:15:30 -07:00 · 2026-06-15 00:55:26 +02:00 · 2026-06-14 22:56:56 +02:00 · 2026-06-14 20:42:16 +02:00 · 2026-06-14 20:17:40 +02:00
51 changed files with 1736 additions and 382 deletions
--- a/.devops/cpu.Dockerfile
+++ b/.devops/cpu.Dockerfile
@@ -3,7 +3,7 @@ ARG BUILD_DATE=N/A
 ARG APP_VERSION=N/A
 ARG APP_REVISION=N/A

-FROM ubuntu:$UBUNTU_VERSION AS build
+FROM docker.io/ubuntu:$UBUNTU_VERSION AS build

 ARG TARGETARCH

@@ -37,7 +37,7 @@ RUN mkdir -p /app/full \
    && cp .devops/tools.sh /app/full/tools.sh

 ## Base image
-FROM ubuntu:$UBUNTU_VERSION AS base
+FROM docker.io/ubuntu:$UBUNTU_VERSION AS base

 ARG BUILD_DATE=N/A
 ARG APP_VERSION=N/A
--- a/.devops/cuda.Dockerfile
+++ b/.devops/cuda.Dockerfile
@@ -3,9 +3,9 @@ ARG UBUNTU_VERSION=24.04
 ARG CUDA_VERSION=12.8.1
 ARG GCC_VERSION=14
 # Target the CUDA build image
-ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
+ARG BASE_CUDA_DEV_CONTAINER=docker.io/nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}

-ARG BASE_CUDA_RUN_CONTAINER=nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}
+ARG BASE_CUDA_RUN_CONTAINER=docker.io/nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}

 ARG BUILD_DATE=N/A
 ARG APP_VERSION=N/A
--- a/.devops/intel.Dockerfile
+++ b/.devops/intel.Dockerfile
@@ -5,7 +5,7 @@ ARG APP_REVISION=N/A

 ## Build Image

-FROM intel/deep-learning-essentials:$ONEAPI_VERSION AS build
+FROM docker.io/intel/deep-learning-essentials:$ONEAPI_VERSION AS build

 ARG GGML_SYCL_F16=OFF
 ARG LEVEL_ZERO_VERSION=1.28.2
@@ -42,7 +42,7 @@ RUN mkdir -p /app/full \
    && cp requirements.txt /app/full \
    && cp .devops/tools.sh /app/full/tools.sh

-FROM intel/deep-learning-essentials:$ONEAPI_VERSION AS base
+FROM docker.io/intel/deep-learning-essentials:$ONEAPI_VERSION AS base

 ARG BUILD_DATE=N/A
 ARG APP_VERSION=N/A
--- a/.devops/llama-cli-cann.Dockerfile
+++ b/.devops/llama-cli-cann.Dockerfile
@@ -3,7 +3,7 @@ ARG BUILD_DATE=N/A
 ARG APP_VERSION=N/A
 ARG APP_REVISION=N/A

-FROM ascendai/cann:$ASCEND_VERSION AS build
+FROM docker.io/ascendai/cann:$ASCEND_VERSION AS build

 WORKDIR /app

@@ -30,7 +30,7 @@ RUN echo "Building with static libs" && \
    cmake --build build --config Release --target llama-completion

 # TODO: use image with NNRT
-FROM ascendai/cann:$ASCEND_VERSION AS runtime
+FROM docker.io/ascendai/cann:$ASCEND_VERSION AS runtime

 ARG BUILD_DATE=N/A
 ARG APP_VERSION=N/A
--- a/.devops/musa.Dockerfile
+++ b/.devops/musa.Dockerfile
@@ -2,9 +2,9 @@ ARG UBUNTU_VERSION=22.04
 # This needs to generally match the container host's environment.
 ARG MUSA_VERSION=rc4.3.0
 # Target the MUSA build image
-ARG BASE_MUSA_DEV_CONTAINER=mthreads/musa:${MUSA_VERSION}-devel-ubuntu${UBUNTU_VERSION}-amd64
+ARG BASE_MUSA_DEV_CONTAINER=docker.io/mthreads/musa:${MUSA_VERSION}-devel-ubuntu${UBUNTU_VERSION}-amd64

-ARG BASE_MUSA_RUN_CONTAINER=mthreads/musa:${MUSA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}-amd64
+ARG BASE_MUSA_RUN_CONTAINER=docker.io/mthreads/musa:${MUSA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}-amd64

 ARG BUILD_DATE=N/A
 ARG APP_VERSION=N/A
--- a/.devops/openvino.Dockerfile
+++ b/.devops/openvino.Dockerfile
@@ -23,7 +23,7 @@ ARG APP_VERSION=N/A
 ARG APP_REVISION=N/A

 ## Build Image
-FROM ubuntu:${UBUNTU_VERSION} AS build
+FROM docker.io/ubuntu:${UBUNTU_VERSION} AS build

 # Pass proxy args to build stage
 ARG http_proxy
@@ -88,7 +88,7 @@ RUN mkdir -p /app/full \
    && cp .devops/tools.sh /app/full/tools.sh

 ## Base Runtime Image
-FROM ubuntu:${UBUNTU_VERSION} AS base
+FROM docker.io/ubuntu:${UBUNTU_VERSION} AS base

 # Pass proxy args to runtime stage
 ARG http_proxy
--- a/.devops/rocm.Dockerfile
+++ b/.devops/rocm.Dockerfile
@@ -5,7 +5,7 @@ ARG ROCM_VERSION=7.2.1
 ARG AMDGPU_VERSION=7.2.1

 # Target the ROCm build image
-ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete
+ARG BASE_ROCM_DEV_CONTAINER=docker.io/rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete

 ARG BUILD_DATE=N/A
 ARG APP_VERSION=N/A
--- a/.devops/s390x.Dockerfile
+++ b/.devops/s390x.Dockerfile
@@ -5,7 +5,7 @@ ARG APP_VERSION=N/A
 ARG APP_REVISION=N/A

 ### Build Llama.cpp stage
-FROM gcc:${GCC_VERSION} AS build
+FROM docker.io/gcc:${GCC_VERSION} AS build

 RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
    --mount=type=cache,target=/var/lib/apt/lists,sharing=locked \
@@ -55,7 +55,7 @@ COPY --from=build /opt/llama.cpp/conversion /llama.cpp/conversion


 ### Base image
-FROM ubuntu:${UBUNTU_VERSION} AS base
+FROM docker.io/ubuntu:${UBUNTU_VERSION} AS base

 ARG BUILD_DATE=N/A
 ARG APP_VERSION=N/A
--- a/.devops/vulkan.Dockerfile
+++ b/.devops/vulkan.Dockerfile
@@ -3,7 +3,7 @@ ARG BUILD_DATE=N/A
 ARG APP_VERSION=N/A
 ARG APP_REVISION=N/A

-FROM ubuntu:$UBUNTU_VERSION AS build
+FROM docker.io/ubuntu:$UBUNTU_VERSION AS build

 # Install build tools
 RUN apt update && apt install -y git build-essential cmake wget xz-utils
@@ -33,7 +33,7 @@ RUN mkdir -p /app/full \
    && cp .devops/tools.sh /app/full/tools.sh

 ## Base image
-FROM ubuntu:$UBUNTU_VERSION AS base
+FROM docker.io/ubuntu:$UBUNTU_VERSION AS base

 ARG BUILD_DATE=N/A
 ARG APP_VERSION=N/A
--- a/.devops/zendnn.Dockerfile
+++ b/.devops/zendnn.Dockerfile
@@ -3,7 +3,7 @@ ARG BUILD_DATE=N/A
 ARG APP_VERSION=N/A
 ARG APP_REVISION=N/A

-FROM ubuntu:$UBUNTU_VERSION AS build
+FROM docker.io/ubuntu:$UBUNTU_VERSION AS build

 RUN apt-get update && \
    apt-get install -y gcc-13 g++-13 build-essential git cmake libssl-dev libomp-dev libnuma-dev python3 ca-certificates
@@ -30,7 +30,7 @@ RUN mkdir -p /app/full \
    && cp .devops/tools.sh /app/full/tools.sh

 ## Base image
-FROM ubuntu:$UBUNTU_VERSION AS base
+FROM docker.io/ubuntu:$UBUNTU_VERSION AS base

 ARG BUILD_DATE=N/A
 ARG APP_VERSION=N/A
--- a/.github/labeler.yml
+++ b/.github/labeler.yml
@@ -12,7 +12,7 @@ SYCL:
            - ggml/src/ggml-sycl/**
            - docs/backend/SYCL.md
            - examples/sycl/**
-Nvidia GPU:
+CUDA:
    - changed-files:
        - any-glob-to-any-file:
            - ggml/include/ggml-cuda.h
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -783,6 +783,8 @@ jobs:
          name: cudart-llama-bin-win-cuda-${{ matrix.cuda }}-x64.zip

  windows-sycl:
+    needs: [check-release]
+    if: ${{ needs.check-release.outputs.should_release == 'true' }}

    runs-on: windows-2022

@@ -891,6 +893,8 @@ jobs:
          name: llama-bin-win-sycl-x64.zip

  ubuntu-24-sycl:
+    needs: [check-release]
+    if: ${{ needs.check-release.outputs.should_release == 'true' }}

    strategy:
      matrix:
--- a/common/chat.cpp
+++ b/common/chat.cpp
@@ -1979,6 +1979,146 @@ static common_chat_params common_chat_params_init_deepseek_v3_2(const common_cha
    return data;
 }

+// Cohere2 MoE (a.k.a. "North Code") parser.
+//
+// The assistant turn is fully marker-wrapped:
+//   <|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>
+//     <|START_THINKING|>{reasoning}<|END_THINKING|>
+//     then EITHER content:    <|START_TEXT|>{content}<|END_TEXT|>
+//          OR     tool calls: <|START_ACTION|>[
+//                                 {"tool_call_id": "0", "tool_name": "f", "parameters": {...}}, ...
+//                             ]<|END_ACTION|>
+//   <|END_OF_TURN_TOKEN|>
+//
+// The generation prompt forces a leading <|START_THINKING|> (when reasoning is enabled, which is
+// the template default), so the model's output continues from *inside* the thinking block. The
+// parser literal therefore only covers the stable <|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|> prefix
+// and the reasoning rule consumes the <|START_THINKING|> ... <|END_THINKING|> markers itself,
+// regardless of whether they came from the generation prompt or the generated text.
+static common_chat_params common_chat_params_init_cohere2moe(const common_chat_template &          tmpl,
+                                                              const autoparser::generation_params & inputs) {
+    common_chat_params data;
+
+    const std::string TURN_START    = "<|START_OF_TURN_TOKEN|>";
+    const std::string TURN_END      = "<|END_OF_TURN_TOKEN|>";
+    const std::string CHATBOT       = "<|CHATBOT_TOKEN|>";
+    const std::string USER          = "<|USER_TOKEN|>";
+    const std::string SYSTEM        = "<|SYSTEM_TOKEN|>";
+    const std::string THINK_START   = "<|START_THINKING|>";
+    const std::string THINK_END     = "<|END_THINKING|>";
+    const std::string TEXT_START    = "<|START_TEXT|>";
+    const std::string TEXT_END      = "<|END_TEXT|>";
+    const std::string ACTION_START  = "<|START_ACTION|>";
+    const std::string ACTION_END    = "<|END_ACTION|>";
+    const std::string RESULT_START  = "<|START_TOOL_RESULT|>";
+    const std::string RESULT_END    = "<|END_TOOL_RESULT|>";
+
+    // Stable prefix of the generation prompt that precedes the (forced) <|START_THINKING|> marker.
+    const std::string GEN_PREFIX = TURN_START + CHATBOT;
+
+    data.prompt             = common_chat_template_direct_apply_impl(tmpl, inputs);
+    data.generation_prompt  = common_chat_template_generation_prompt_impl(tmpl, inputs);
+    data.format             = COMMON_CHAT_FORMAT_PEG_NATIVE;
+    data.supports_thinking  = true;
+    data.thinking_start_tag = THINK_START;
+    data.thinking_end_tag   = THINK_END;
+    data.preserved_tokens   = {
+        TURN_START, TURN_END, CHATBOT, USER, SYSTEM,
+        THINK_START, THINK_END,
+        TEXT_START, TEXT_END,
+        ACTION_START, ACTION_END,
+        RESULT_START, RESULT_END,
+    };
+
+    // Split the rendered prompt into per-role message spans. Tool results are rendered with the
+    // system token followed by <|START_TOOL_RESULT|>, so the "tool" delimiter must be listed before
+    // the plain "system" one (it is a strict superset, and the role split tries delimiters in order).
+    data.message_spans = common_chat_split_by_role(data.prompt, {
+        { "assistant", GEN_PREFIX },
+        { "user",      TURN_START + USER },
+        { "tool",      TURN_START + SYSTEM + RESULT_START },
+        { "system",    TURN_START + SYSTEM },
+    });
+
+    auto has_tools         = inputs.tools.is_array() && !inputs.tools.empty();
+    auto extract_reasoning = inputs.reasoning_format != COMMON_REASONING_FORMAT_NONE;
+    auto include_grammar   = has_tools && inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_NONE; // same condition that enables the tool-call branch of the parser below
+
+    if (inputs.has_continuation()) { // continuation: rebuild the generation prompt from continue_msg and append it to the prompt
+        const auto & msg = inputs.continue_msg;
+
+        data.generation_prompt = GEN_PREFIX + THINK_START + msg.reasoning_content;
+        if (inputs.continue_final_message == COMMON_CHAT_CONTINUATION_CONTENT) { // continuing content: close the thinking block and open the text block
+            data.generation_prompt += THINK_END + TEXT_START + msg.render_content();
+        }
+
+        data.prompt += data.generation_prompt;
+    }
+
+    auto parser = build_chat_peg_parser([&](common_chat_peg_builder & p) {
+        auto generation_prompt = p.literal(GEN_PREFIX);
+        auto end               = p.end();
+
+        // The thinking block is normally present (the generation prompt forces <|START_THINKING|>
+        // when reasoning is enabled) but is parsed as optional. When extracting reasoning, capture its
+        // body; otherwise keep the whole block (markers included) inline as content (reasoning_format=NONE).
+        common_peg_parser reasoning = p.eps();
+        if (extract_reasoning) {
+            reasoning = p.optional(p.literal(THINK_START) +
+                                   p.reasoning(p.until_one_of({ THINK_END, TEXT_START, ACTION_START })) +
+                                   p.optional(p.literal(THINK_END)));
+        } else {
+            reasoning = p.optional(p.content(p.literal(THINK_START) +
+                                             p.until_one_of({ THINK_END, TEXT_START, ACTION_START }) +
+                                             p.optional(p.literal(THINK_END))));
+        }
+
+        auto text_content = p.literal(TEXT_START) + p.content(p.until(TEXT_END)) + p.optional(p.literal(TEXT_END));
+
+        if (!has_tools || inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_NONE) { // no usable tools: the body can only be a text block
+            return generation_prompt + reasoning + text_content + p.optional(p.literal(TURN_END)) + end;
+        }
+
+        auto require_tools = inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_REQUIRED;
+
+        // <|START_ACTION|>[ {"tool_call_id": "0", "tool_name": "f", "parameters": {...}}, ... ]<|END_ACTION|>
+        auto tool_calls = p.standard_json_tools(ACTION_START, ACTION_END, inputs.tools, inputs.parallel_tool_calls,
+                                                /* force_tool_calls = */ true,
+                                                /* name_key         = */ "tool_name",
+                                                /* args_key         = */ "parameters",
+                                                /* array_wrapped    = */ true,
+                                                /* function_is_key  = */ false,
+                                                /* call_id_key      = */ "",
+                                                /* gen_call_id_key  = */ "tool_call_id",
+                                                /* parameters_order = */ { "tool_call_id", "tool_name", "parameters" });
+
+        // Content and tool calls are mutually exclusive in this format.
+        common_peg_parser body = require_tools ? tool_calls : p.choice({ tool_calls, text_content });
+
+        return generation_prompt + reasoning + body + p.optional(p.literal(TURN_END)) + end;
+    });
+
+    data.parser = parser.save();
+
+    if (include_grammar) {
+        data.grammar_lazy = inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_AUTO; // lazy only when tool_choice is auto
+        data.grammar      = build_grammar([&](const common_grammar_builder & builder) {
+            foreach_function(inputs.tools, [&](const json & tool) {
+                const auto & function = tool.at("function");
+                auto         schema   = function.at("parameters");
+                builder.resolve_refs(schema);
+            });
+            parser.build_grammar(builder, data.grammar_lazy);
+        });
+
+        data.grammar_triggers = {
+            { COMMON_GRAMMAR_TRIGGER_TYPE_WORD, ACTION_START } // trigger word is the marker that opens the tool-call block
+        };
+    }
+
+    return data;
+}
+
 namespace workaround {

 static void map_developer_role_to_system(json & messages) {
@@ -2227,6 +2367,15 @@ std::optional<common_chat_params> common_chat_try_specialized_template(
        return common_chat_params_init_kimi_k2(tmpl, params);
    }

+    // Cohere2 MoE / North Code - marker-wrapped format with <|START_TEXT|> content and
+    // <|START_ACTION|> JSON tool calls. <|START_TEXT|> is unique to this template (the older
+    // Command-R templates use <|START_RESPONSE|>).
+    if (src.find("<|START_TEXT|>") != std::string::npos &&
+        src.find("<|START_ACTION|>") != std::string::npos) {
+        LOG_DBG("Using specialized template: Cohere2 MoE\n");
+        return common_chat_params_init_cohere2moe(tmpl, params);
+    }
+
    if (is_lfm2_template(src)) {
        LOG_DBG("Using specialized template: LFM2\n");
        return common_chat_params_init_lfm2(tmpl, params, /* tool_list_tokens = */ true);
--- a/common/jinja/runtime.cpp
+++ b/common/jinja/runtime.cpp
@@ -316,12 +316,22 @@ value filter_expression::execute_impl(context & ctx) {

    JJ_DEBUG("Applying filter to %s", input->type().c_str());

+    auto set_filter_alias = [](auto & filter_id) { // rewrites a known alias in place to its canonical filter name; other names are left untouched
+        if (filter_id == "count") {
+            filter_id = "length";
+        } else if (filter_id == "d") {
+            filter_id = "default";
+        } else if (filter_id == "e") {
+            filter_id = "escape";
+        } else if (filter_id == "trim") {
+            filter_id = "strip";
+        }
+    };
+
    if (is_stmt<identifier>(filter)) {
        auto filter_id = cast_stmt<identifier>(filter)->val;

-        if (filter_id == "trim") {
-            filter_id = "strip"; // alias
-        }
+        set_filter_alias(filter_id);
        JJ_DEBUG("Applying filter '%s' to %s", filter_id.c_str(), input->type().c_str());
        // TODO: Refactor filters so this coercion can be done automatically
        if (!input->is_undefined() && !is_val<value_string>(input) && (
@@ -345,9 +355,7 @@ value filter_expression::execute_impl(context & ctx) {
        }
        auto filter_id = cast_stmt<identifier>(call->callee)->val;

-        if (filter_id == "trim") {
-            filter_id = "strip"; // alias
-        }
+        set_filter_alias(filter_id);
        JJ_DEBUG("Applying filter '%s' with arguments to %s", filter_id.c_str(), input->type().c_str());
        func_args args(ctx);
        for (const auto & arg_expr : call->args) {
@@ -761,9 +769,9 @@ value member_expression::execute_impl(context & ctx) {

        if (is_stmt<slice_expression>(this->property)) {
            auto s = cast_stmt<slice_expression>(this->property);
-            value start_val = s->start_expr ? s->start_expr->execute(ctx) : mk_val<value_int>(0);
-            value stop_val  = s->stop_expr  ? s->stop_expr->execute(ctx)  : mk_val<value_int>(arr_size);
            value step_val  = s->step_expr  ? s->step_expr->execute(ctx)  : mk_val<value_int>(1);
+            value start_val = s->start_expr ? s->start_expr->execute(ctx) : (step_val->as_int() < 0 ? mk_val<value_int>(arr_size - 1) : mk_val<value_int>(0));
+            value stop_val  = s->stop_expr  ? s->stop_expr->execute(ctx)  : (step_val->as_int() < 0 ? mk_val<value_int>(-1) : mk_val<value_int>(arr_size));

            // translate to function call: obj.slice(start, stop, step)
            JJ_DEBUG("Member expression is a slice: start %s, stop %s, step %s",
--- a/common/jinja/value.cpp
+++ b/common/jinja/value.cpp
@@ -90,14 +90,14 @@ static T slice(const T & array, int64_t start, int64_t stop, int64_t step = 1) {
            stop_val = std::min(stop_val, len);
        }
    } else {
-        start_val = len - 1;
+        start_val = start;
        if (start_val < 0) {
-            start_val = std::max(len + start_val, (int64_t)-1);
+            start_val = std::max(len + start_val, (int64_t)0);
        } else {
            start_val = std::min(start_val, len - 1);
        }

-        stop_val = -1;
+        stop_val = stop;
        if (stop_val < -1) {
            stop_val = std::max(len + stop_val, (int64_t)-1);
        } else {
--- a/conversion/init.py
+++ b/conversion/init.py
@@ -40,6 +40,7 @@ TEXT_MODEL_MAP: dict[str, str] = {
    "ChatGLMModel": "chatglm",
    "CodeShellForCausalLM": "codeshell",
    "CogVLMForCausalLM": "cogvlm",
+    "Cohere2MoeForCausalLM": "command_r",
    "Cohere2ForCausalLM": "command_r",
    "CohereForCausalLM": "command_r",
    "DbrxForCausalLM": "dbrx",
--- a/conversion/base.py
+++ b/conversion/base.py
@@ -1195,7 +1195,7 @@ class TextModel(ModelBase):
            self.gguf_writer.add_embedding_length(n_embd)
            logger.info(f"gguf: embedding length = {n_embd}")

-        if (n_ff := self.find_hparam(["intermediate_size", "n_inner", "hidden_dim"], optional=True)) is not None:
+        if (n_ff := self.find_hparam(["prefix_dense_intermediate_size", "intermediate_size", "n_inner", "hidden_dim"], optional=True)) is not None:
            self.gguf_writer.add_feed_forward_length(n_ff)
            logger.info(f"gguf: feed forward length = {n_ff}")

@@ -1280,7 +1280,7 @@ class TextModel(ModelBase):
            self.gguf_writer.add_expert_group_used_count(n_group_used)
            logger.info(f"gguf: expert groups used count = {n_group_used}")

-        if (score_func := self.find_hparam(["score_function", "scoring_func", "score_func", "moe_router_activation", "moe_router_activation_func"], optional=True)) is not None:
+        if (score_func := self.find_hparam(["score_function", "scoring_func", "score_func", "moe_router_activation", "moe_router_activation_func", "expert_selection_fn"], optional=True)) is not None:
            if score_func == "sigmoid":
                self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SIGMOID)
            elif score_func == "softmax":
@@ -1495,6 +1495,9 @@ class TextModel(ModelBase):
        if chkhsh == "d772b220ace2baec124bed8cfafce0ead7d6c38a4b65ef11261cf9d5d62246d1":
            # ref: https://huggingface.co/CohereLabs/tiny-aya-base
            res = "tiny_aya"
+        if chkhsh == "52df12b4c8d4176e7481aab4b6e8454d1fd0a210a04a574f6d4e067d10e23c3e":
+            # ref: https://huggingface.co/CohereLabs/North-Mini-Code-1.0
+            res = "cohere2moe"
        if chkhsh == "e636dc30a262dcc0d8c323492e32ae2b70728f4df7dfe9737d9f920a282b8aea":
            # ref: https://huggingface.co/Qwen/Qwen1.5-7B
            res = "qwen2"
--- a/conversion/command_r.py
+++ b/conversion/command_r.py
@@ -1,5 +1,6 @@
 from __future__ import annotations

+import re
 from typing import Iterable, TYPE_CHECKING

 import torch
@@ -55,3 +56,122 @@ class Cohere2Model(TextModel):
            return

        yield from super().modify_tensors(data_torch, name, bid)
+
+
+@ModelBase.register("Cohere2MoeForCausalLM")
+class Cohere2MoeModel(TextModel):
+    model_arch = gguf.MODEL_ARCH.COHERE2MOE
+    _n_main_layers: int | None = None
+    _expert_tensor_re = re.compile(
+        r"model\.layers\.(\d+)\.mlp\.experts\.(\d+)\.(down_proj|gate_proj|up_proj)\.weight"
+    )
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        if (n_nextn := int(self.hparams.get("num_nextn_predict_layers", 0) or 0)) > 0 and not self.no_mtp:
+            self.block_count += n_nextn
+            self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)
+        self._experts: list[dict[str, Tensor]] = [{} for _ in range(self.block_count)]
+
+    def _set_vocab_gpt2(self) -> None:
+        tokens, toktypes, tokpre = self.get_vocab_base()
+        self.gguf_writer.add_tokenizer_model("gpt2")
+        self.gguf_writer.add_tokenizer_pre(tokpre)
+        self.gguf_writer.add_token_list(tokens)
+        self.gguf_writer.add_token_types(toktypes)
+
+        special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True)
+        special_vocab.add_to_gguf(self.gguf_writer)
+
+    def set_gguf_parameters(self):
+        hparams = self.hparams
+        expert_intermediate_size = hparams["intermediate_size"]
+        mlp_layer_types = hparams.get("mlp_layer_types")
+        n_dense_lead = hparams.get("first_k_dense_replace", 0)
+        if mlp_layer_types is not None:
+            n_dense_lead = next((i for i, t in enumerate(mlp_layer_types) if t != "dense"), len(mlp_layer_types))
+
+        super().set_gguf_parameters()
+
+        self.gguf_writer.add_logit_scale(hparams["logit_scale"])
+        self.gguf_writer.add_sliding_window(hparams["sliding_window"])
+        self.gguf_writer.add_sliding_window_pattern([t == "sliding_attention" for t in hparams["layer_types"]])
+        self.gguf_writer.add_vocab_size(hparams["vocab_size"])
+        self.gguf_writer.add_expert_feed_forward_length(expert_intermediate_size)
+        self.gguf_writer.add_leading_dense_block_count(n_dense_lead)
+        self.gguf_writer.add_expert_weights_norm(hparams.get("norm_topk_prob", False))
+        if (num_shared_experts := hparams.get("num_shared_experts", 0)) > 0:
+            if hparams.get("shared_expert_combination_strategy", "average") != "average":
+                raise ValueError("Cohere2 MoE only supports average shared expert combination")
+            self.gguf_writer.add_expert_shared_count(num_shared_experts)
+            self.gguf_writer.add_expert_shared_feed_forward_length(expert_intermediate_size * num_shared_experts)
+        if (n_nextn := hparams.get("num_nextn_predict_layers", 0)) > 0 and not self.no_mtp:
+            self.gguf_writer.add_nextn_predict_layers(n_nextn)
+        self.gguf_writer.add_rope_dimension_count(hparams["head_dim"])
+        self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
+
+    def index_tensors(self, remote_hf_model_id: str | None = None):
+        hparams = {**self.hparams, **self.hparams.get("text_config", {})}
+        self._n_main_layers = hparams.get("num_hidden_layers")
+        type(self)._n_main_layers = self._n_main_layers
+        return super().index_tensors(remote_hf_model_id=remote_hf_model_id)
+
+    @classmethod
+    def filter_tensors(cls, item):
+        if (titem := super().filter_tensors(item)) is None:
+            return None
+        name, gen = titem
+
+        if cls._n_main_layers is not None:
+            is_mtp = (m := re.match(r"model\.layers\.(\d+)\.", name)) is not None and int(m.group(1)) >= cls._n_main_layers
+            if is_mtp and cls.no_mtp:
+                return None
+            if cls.mtp_only and not is_mtp and name not in (
+                "model.embed_tokens.weight", "model.norm.weight", "lm_head.weight",
+            ):
+                return None
+
+        return name, gen
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        if name.endswith(".bias"):
+            if torch.any(data_torch != 0):
+                raise ValueError(f"Bias tensor {name!r} is not zero.")
+            logger.debug(f"Skipping bias tensor {name!r}.")
+            return
+
+        if (m := self._expert_tensor_re.fullmatch(name)) is not None:
+            n_experts = self.hparams["num_experts"]
+            layer_idx = int(m.group(1))
+            assert bid is None or bid == layer_idx
+
+            self._experts[layer_idx][name] = data_torch
+
+            expected = {
+                f"model.layers.{layer_idx}.mlp.experts.{xid}.{w_name}.weight"
+                for xid in range(n_experts)
+                for w_name in ("down_proj", "gate_proj", "up_proj")
+            }
+            if expected.issubset(self._experts[layer_idx]):
+                for w_name in ["down_proj", "gate_proj", "up_proj"]:
+                    datas: list[Tensor] = []
+
+                    for xid in range(n_experts):
+                        ename = f"model.layers.{layer_idx}.mlp.experts.{xid}.{w_name}.weight"
+                        datas.append(self._experts[layer_idx][ename])
+                        del self._experts[layer_idx][ename]
+
+                    data_torch = torch.stack(datas, dim=0)
+                    merged_name = f"model.layers.{layer_idx}.mlp.experts.{w_name}.weight"
+
+                    yield from super().modify_tensors(data_torch, merged_name, layer_idx)
+            return
+
+        yield from super().modify_tensors(data_torch, name, bid)
+
+    def prepare_tensors(self):
+        super().prepare_tensors()
+
+        experts = [k for d in self._experts for k in d.keys()]
+        if len(experts) > 0:
+            raise ValueError(f"Unprocessed experts: {experts}")
--- a/convert_hf_to_gguf_update.py
+++ b/convert_hf_to_gguf_update.py
@@ -100,6 +100,7 @@ models = [
    {"name": "refact",           "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/smallcloudai/Refact-1_6-base", },
    {"name": "command-r",        "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/CohereForAI/c4ai-command-r-v01", },
    {"name": "tiny_aya",         "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/CohereLabs/tiny-aya-base", },
+    {"name": "cohere2moe",       "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/CohereLabs/North-Mini-Code-1.0", },
    {"name": "qwen2",            "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Qwen/Qwen1.5-7B", },
    {"name": "olmo",             "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/allenai/OLMo-1.7-7B-hf", },
    {"name": "dbrx",             "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/databricks/dbrx-base", },
--- a/convert_lora_to_gguf.py
+++ b/convert_lora_to_gguf.py
@@ -25,7 +25,7 @@ import gguf
 from gguf.constants import GGUFValueType

 # reuse model definitions from the conversion/ package
-from conversion import LazyTorchTensor, ModelBase, get_model_class
+from conversion import LazyTorchTensor, ModelBase, get_model_class, ModelType, get_model_architecture

 logger = logging.getLogger("lora-to-gguf")

@@ -396,12 +396,12 @@ if __name__ == '__main__':
        hparams = ModelBase.load_hparams(dir_base_model, False)

    with torch.inference_mode():
+        model_arch = get_model_architecture(hparams, ModelType.TEXT)
        try:
-            model_arch = hparams.get("text_config", {}).get("architectures", hparams["architectures"])[0]
-            logger.info("Using model architecture: %s", model_arch)
            model_class = get_model_class(model_arch)
+            logger.info("Using model architecture: %s", model_arch)
        except NotImplementedError:
-            logger.error(f"Model {hparams['architectures'][0]} is not supported")
+            logger.error(f"Model {model_arch} is not supported")
            sys.exit(1)

        class LoraModel(model_class):  # ty: ignore[unsupported-base]
--- a/docs/backend/CUDA-FEDORA.md
+++ b/docs/backend/CUDA-FEDORA.md
@@ -270,7 +270,7 @@ You have successfully set up CUDA on Fedora within a toolbox environment using t

 ---

-**Disclaimer:** Manually installing and modifying system packages can lead to instability of the container. The above steps are provided as a guideline and may need adjustments based on your specific system configuration. Always back up important data before making significant system changes, especially as your home folder is writable and shared with he toolbox.
+**Disclaimer:** Manually installing and modifying system packages can lead to instability of the container. The above steps are provided as a guideline and may need adjustments based on your specific system configuration. Always back up important data before making significant system changes, especially as your home folder is writable and shared with the toolbox.

 **Acknowledgments:** Special thanks to the Fedora community and NVIDIA documentation for providing resources that assisted in creating this guide.

--- a/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_decls.tmpl
+++ b/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_decls.tmpl
@@ -98,6 +98,7 @@ fn init_shmem_src0(thread_id: u32, batch_offset: u32, offset_m: u32, k_outer: u3
 }
 #endif // INIT_SRC0_SHMEM_Q1_0

+// legacy-quants
 #if defined(INIT_SRC0_SHMEM_Q4_0) || defined(INIT_SRC0_SHMEM_Q4_1) || defined(INIT_SRC0_SHMEM_Q5_0) || defined(INIT_SRC0_SHMEM_Q5_1) || defined(INIT_SRC0_SHMEM_Q8_0) || defined(INIT_SRC0_SHMEM_Q8_1) || defined(INIT_SRC0_SHMEM_MXFP4)
 const BLOCK_SIZE = 32u;
 // the number of blocks per k-tile. Note that this currently only works if TILE_K is a multiple of BLOCK_SIZE, which may need to be rethought for larger quantized types.
@@ -124,7 +125,7 @@ fn init_shmem_src0(thread_id: u32, batch_offset: u32, offset_m: u32, k_outer: u3
        if (global_m < params.m && global_block_k < params.k / BLOCK_SIZE) {
            let src0_idx = batch_offset + global_m * params.stride_01 + global_block_k;

-#ifdef INIT_SRC0_SHMEM_Q4_0
+#if defined(INIT_SRC0_SHMEM_Q4_0)
            let block_byte_base = src0_idx * 18u; // BLOCK_SIZE_BYTES = 18u;
            let d = load_f16_at_src0(block_byte_base);

@@ -134,7 +135,9 @@ fn init_shmem_src0(thread_id: u32, batch_offset: u32, offset_m: u32, k_outer: u3
                let q_packed = load_u32_at_src0(q_byte_offset);
                dequant_q4_0_packed_to_shmem(q_packed, d, shmem_idx + j * BYTES_PER_INNER_LOOP);
            }
-#elif INIT_SRC0_SHMEM_Q4_1
+#endif // INIT_SRC0_SHMEM_Q4_0
+
+#if defined(INIT_SRC0_SHMEM_Q4_1)
            let block_byte_base = src0_idx * 20u; // BLOCK_SIZE_BYTES = 20u;
            let dm = unpack2x16float(load_u32_at_src0_aligned(block_byte_base));
            let d = f16(dm[0]);
@@ -153,7 +156,9 @@ fn init_shmem_src0(thread_id: u32, batch_offset: u32, offset_m: u32, k_outer: u3
                    shmem[shmem_idx + j * BYTES_PER_INNER_LOOP + k + 16u] = q_hi;
                }
            }
-#elif INIT_SRC0_SHMEM_Q5_0
+#endif // INIT_SRC0_SHMEM_Q4_1
+
+#if defined(INIT_SRC0_SHMEM_Q5_0)
            let block_byte_base = src0_idx * 22u; // BLOCK_SIZE_BYTES = 22u;

            let d  = load_f16_at_src0(block_byte_base);
@@ -176,7 +181,9 @@ fn init_shmem_src0(thread_id: u32, batch_offset: u32, offset_m: u32, k_outer: u3
                    shmem[shmem_idx + j * BYTES_PER_INNER_LOOP + k + 16u] = q_hi;
                }
            }
-#elif INIT_SRC0_SHMEM_Q5_1
+#endif // INIT_SRC0_SHMEM_Q5_0
+
+#if defined(INIT_SRC0_SHMEM_Q5_1)
            let block_byte_base = src0_idx * 24u; // BLOCK_SIZE_BYTES = 24u;

            let dm = unpack2x16float(load_u32_at_src0_aligned(block_byte_base));
@@ -201,7 +208,9 @@ fn init_shmem_src0(thread_id: u32, batch_offset: u32, offset_m: u32, k_outer: u3
                    shmem[shmem_idx + j * BYTES_PER_INNER_LOOP + k + 16u] = q_hi;
                }
            }
-#elif INIT_SRC0_SHMEM_Q8_0
+#endif // INIT_SRC0_SHMEM_Q5_1
+
+#if defined(INIT_SRC0_SHMEM_Q8_0)
            let block_byte_base = src0_idx * 34u; // BLOCK_SIZE_BYTES = 34u;
            let d = load_f16_at_src0(block_byte_base);

@@ -211,7 +220,9 @@ fn init_shmem_src0(thread_id: u32, batch_offset: u32, offset_m: u32, k_outer: u3
                let q_packed = load_u32_at_src0(q_byte_offset);
                dequant_q8_0_packed_to_shmem(q_packed, d, shmem_idx + j * BYTES_PER_INNER_LOOP);
            }
-#elif INIT_SRC0_SHMEM_Q8_1
+#endif // INIT_SRC0_SHMEM_Q8_0
+
+#if defined(INIT_SRC0_SHMEM_Q8_1)
            let block_byte_base = src0_idx * 36u; // BLOCK_SIZE_BYTES = 36u;
            let dm = unpack2x16float(load_u32_at_src0_aligned(block_byte_base));
            let d = f16(dm[0]);
@@ -227,7 +238,9 @@ fn init_shmem_src0(thread_id: u32, batch_offset: u32, offset_m: u32, k_outer: u3
                    shmem[shmem_idx + j * BYTES_PER_INNER_LOOP + k] = q_val;
                }
            }
-#elif INIT_SRC0_SHMEM_MXFP4
+#endif // INIT_SRC0_SHMEM_Q8_1
+
+#if defined(INIT_SRC0_SHMEM_MXFP4)
            let block_byte_base = src0_idx * 17u;
            let eu8 = get_byte(load_u32_at_src0_aligned(block_byte_base), block_byte_base & 3u);
            let e = ldexp(1.0, i32(eu8) - 128);
@@ -244,11 +257,11 @@ fn init_shmem_src0(thread_id: u32, batch_offset: u32, offset_m: u32, k_outer: u3
                    shmem[shmem_idx + j * BYTES_PER_INNER_LOOP + k + 16u] = f16(q_hi);
                }
            }
-#endif
+#endif // INIT_SRC0_SHMEM_MXFP4
        }
    }
 }
-#endif
+#endif // legacy-quants

 // k-quants
 #if defined(INIT_SRC0_SHMEM_Q2_K) || defined(INIT_SRC0_SHMEM_Q3_K) || defined(INIT_SRC0_SHMEM_Q4_K) || defined(INIT_SRC0_SHMEM_Q5_K) || defined(INIT_SRC0_SHMEM_Q6_K)
@@ -284,7 +297,7 @@ fn init_shmem_src0(thread_id: u32, batch_offset: u32, offset_m: u32, k_outer: u3

        let src0_idx = batch_offset + global_m * params.stride_01 + block_k;

-#ifdef INIT_SRC0_SHMEM_Q2_K
+#if defined(INIT_SRC0_SHMEM_Q2_K)
        let block_byte_base  = src0_idx * 84u; // BLOCK_SIZE_BYTES =  84u;
        let scales_byte_base = block_byte_base;
        let qs_byte_base     = block_byte_base + 16u;
@@ -314,7 +327,9 @@ fn init_shmem_src0(thread_id: u32, batch_offset: u32, offset_m: u32, k_outer: u3
        let ml = dmin * f16(scale >> 4u);

        store_shmem_kquants(qs_vec4 * dl - ml, elem_idx);
-#elif INIT_SRC0_SHMEM_Q3_K
+#endif // INIT_SRC0_SHMEM_Q2_K
+
+#if defined(INIT_SRC0_SHMEM_Q3_K)
        let block_byte_base  = src0_idx * 110u; // BLOCK_SIZE_BYTES = 110u;
        let hmask_byte_base  = block_byte_base +  0u;
        let qs_byte_base     = block_byte_base + 32u;
@@ -355,7 +370,9 @@ fn init_shmem_src0(thread_id: u32, batch_offset: u32, offset_m: u32, k_outer: u3
        let dl         = d_all * (f16((scale_hi2 << 4u) | scale_low4) - 32.0);

        store_shmem_kquants(dl * q_vec4, elem_idx);
-#elif INIT_SRC0_SHMEM_Q4_K
+#endif // INIT_SRC0_SHMEM_Q3_K
+
+#if defined(INIT_SRC0_SHMEM_Q4_K)
        let block_byte_base = src0_idx * 144u; // BLOCK_SIZE_BYTES = 144u;
        let dm_byte_base    = block_byte_base +  0u;
        let scale_byte_base = block_byte_base +  4u;
@@ -399,7 +416,9 @@ fn init_shmem_src0(thread_id: u32, batch_offset: u32, offset_m: u32, k_outer: u3
        let ml = dmin * f16(mn);

        store_shmem_kquants(dl * qs_vec4 - vec4(ml, ml, ml, ml), elem_idx);
-#elif INIT_SRC0_SHMEM_Q5_K
+#endif // INIT_SRC0_SHMEM_Q4_K
+
+#if defined(INIT_SRC0_SHMEM_Q5_K)
        let block_byte_base = src0_idx * 176u; // BLOCK_SIZE_BYTES = 176u;
        let dm_byte_base    = block_byte_base +  0u;
        let scale_byte_base = block_byte_base +  4u;
@@ -456,7 +475,9 @@ fn init_shmem_src0(thread_id: u32, batch_offset: u32, offset_m: u32, k_outer: u3
        let ml = dmin * f16(mn);

        store_shmem_kquants((qh_vec4 + qs_lo4_vec4) * dl - vec4<f16>(ml, ml, ml, ml), elem_idx);
-#elif INIT_SRC0_SHMEM_Q6_K
+#endif // INIT_SRC0_SHMEM_Q5_K
+
+#if defined(INIT_SRC0_SHMEM_Q6_K)
        let block_byte_base  = src0_idx * 210u; // BLOCK_SIZE_BYTES = 210u;
        let ql_byte_base     = block_byte_base;
        let qh_byte_base     = block_byte_base + 128u;
@@ -497,17 +518,18 @@ fn init_shmem_src0(thread_id: u32, batch_offset: u32, offset_m: u32, k_outer: u3
        let scale      = get_byte_i32(scale_word, scale_byte & 3u);

        store_shmem_kquants(d * q_vec4 * f16(scale), elem_idx);
-#endif
+#endif // INIT_SRC0_SHMEM_Q6_K
    }
 }
 #endif // k-quants

-#ifdef INIT_SRC0_SHMEM_IQ4_NL
+#if defined(INIT_SRC0_SHMEM_IQ4_NL)
 const BLOCK_SIZE = 32u;
 const BLOCK_SIZE_BYTES = 18u;
+const NQ = 4u;

 fn init_shmem_src0(thread_id: u32, batch_offset: u32, offset_m: u32, k_outer: u32) {
-    for (var elem_idx = thread_id; elem_idx < TILE_SRC0_SHMEM; elem_idx += TOTAL_WORKGROUP_SIZE) {
+    for (var elem_idx = thread_id * NQ; elem_idx < TILE_SRC0_SHMEM; elem_idx += NQ * TOTAL_WORKGROUP_SIZE) {
        let tile_m = elem_idx / TILE_K;
        let tile_k = elem_idx % TILE_K;
        let global_m = offset_m + tile_m;
@@ -519,408 +541,464 @@ fn init_shmem_src0(thread_id: u32, batch_offset: u32, offset_m: u32, k_outer: u3
        }

        let block_k    = global_k / BLOCK_SIZE;
-        let k_in_block = global_k % BLOCK_SIZE;
+        let k_in_block = global_k % BLOCK_SIZE; // k_in_block % 4 == 0;
+
+        let src0_idx = batch_offset + global_m * params.stride_01 + block_k;

-        let src0_idx        = batch_offset + global_m * params.stride_01 + block_k;
        let block_byte_base = src0_idx * BLOCK_SIZE_BYTES;
-        let d               = load_f16_at_src0(block_byte_base);
+        let d_byte_base     = block_byte_base +  0u;
+        let qs_byte_base    = block_byte_base +  2u;

-        let pos       = k_in_block % 16u;
-        let nib_shift = (k_in_block / 16u) * 4u;
-        let q_packed  = load_u32_at_src0(block_byte_base + 2u + (pos / 4u) * 4u);
-        let nib       = (get_byte(q_packed, pos % 4u) >> nib_shift) & 0xFu;
+        let d = load_f16_at_src0(d_byte_base);

-        shmem[elem_idx] = d * f16(kvalues_iq4nl[nib]);
+        let id_qtr      = (k_in_block % 16u) / 4u;
+        let shift_phase = k_in_block / 16u;
+
+        let qs_u32    = load_u32_at_src0(qs_byte_base + 4u * id_qtr);
+
+        shmem[elem_idx + 0u] = d * f16(kvalues_iq4nl[(qs_u32 >> ( 0u + 4u * shift_phase)) & 0xFu]);
+        shmem[elem_idx + 1u] = d * f16(kvalues_iq4nl[(qs_u32 >> ( 8u + 4u * shift_phase)) & 0xFu]);
+        shmem[elem_idx + 2u] = d * f16(kvalues_iq4nl[(qs_u32 >> (16u + 4u * shift_phase)) & 0xFu]);
+        shmem[elem_idx + 3u] = d * f16(kvalues_iq4nl[(qs_u32 >> (24u + 4u * shift_phase)) & 0xFu]);
    }
 }
 #endif // INIT_SRC0_SHMEM_IQ4_NL

-#ifdef INIT_SRC0_SHMEM_IQ4_XS
+// i-quants (super block size: 256)
+#if defined(INIT_SRC0_SHMEM_IQ4_XS) || defined(INIT_SRC0_SHMEM_IQ1_S) || defined(INIT_SRC0_SHMEM_IQ1_M) || defined(INIT_SRC0_SHMEM_IQ2_XXS) \
+|| defined(INIT_SRC0_SHMEM_IQ2_XS) || defined(INIT_SRC0_SHMEM_IQ2_S) || defined(INIT_SRC0_SHMEM_IQ3_XXS) || defined(INIT_SRC0_SHMEM_IQ3_S)
 const BLOCK_SIZE = 256u;
-const BLOCK_SIZE_BYTES = 136u;
+const NQ = 16u;
+
+// Scatters the four lanes of `val` into four consecutive shmem slots:
+// shmem[idx] <- x, shmem[idx + 1] <- y, shmem[idx + 2] <- z, shmem[idx + 3] <- w.
+fn store_shmem_iquants(val: vec4<f16>, idx: u32) {
+    shmem[idx + 0u] = val[0];
+    shmem[idx + 1u] = val[1];
+    shmem[idx + 2u] = val[2];
+    shmem[idx + 3u] = val[3];
+}
+
+// Returns a single byte of src0 as a u32: the word comes from load_u32_at_src0_aligned(byte_offset)
+// and the byte is selected with get_byte at position (byte_offset % 4u).
+// NOTE(review): presumably load_u32_at_src0_aligned reads the 4-byte word containing byte_offset - confirm against its definition.
+fn load_byte_at_src0_aligned(byte_offset: u32) -> u32 {
+    let word        = load_u32_at_src0_aligned(byte_offset);
+    let byte_in_u32 = byte_offset % 4u;
+    return get_byte(word, byte_in_u32);
+}
+
+#if defined(INIT_SRC0_SHMEM_IQ1_M) || defined(INIT_SRC0_SHMEM_IQ1_S)
+// Builds four f16 weights from four consecutive 2-bit fields of `gw`.
+// Lane i takes the field at bit (shift_base + 2 * i), sign-extends it by moving it to the top
+// two bits (<< 30u) and shifting back as an i32 (>> 30u), then evaluates dl * (field + delta)
+// in f32 before narrowing the result to f16.
+fn create_iq_gw4(dl: f32, gw: u32, shift_base: u32, delta: f32) -> vec4<f16> {
+    let g0 = bitcast<i32>(((gw >> (shift_base + 0u)) & 3u) << 30u) >> 30u;
+    let g1 = bitcast<i32>(((gw >> (shift_base + 2u)) & 3u) << 30u) >> 30u;
+    let g2 = bitcast<i32>(((gw >> (shift_base + 4u)) & 3u) << 30u) >> 30u;
+    let g3 = bitcast<i32>(((gw >> (shift_base + 6u)) & 3u) << 30u) >> 30u;
+
+    return vec4<f16>(
+            f16(dl * (f32(g0) + delta)),
+            f16(dl * (f32(g1) + delta)),
+            f16(dl * (f32(g2) + delta)),
+            f16(dl * (f32(g3) + delta))
+        );
+}
+#endif
+
+#if defined(INIT_SRC0_SHMEM_IQ4_XS)
+// Dequantizes four 4-bit codes packed in `qs_u32`.
+// Lane i reads the nibble at bit (4 * shift_phase + 8 * i), i.e. one nibble from each byte of
+// the word, maps it through kvalues_iq4nl and scales the looked-up value by `dl`.
+fn create_iq_gw4(dl: f16, qs_u32: u32, shift_phase: u32) -> vec4<f16> {
+    let nib_shift = 4u * shift_phase;
+
+    let code0 = (qs_u32 >> (nib_shift +  0u)) & 0xFu;
+    let code1 = (qs_u32 >> (nib_shift +  8u)) & 0xFu;
+    let code2 = (qs_u32 >> (nib_shift + 16u)) & 0xFu;
+    let code3 = (qs_u32 >> (nib_shift + 24u)) & 0xFu;
+
+    return vec4<f16>(
+            dl * f16(kvalues_iq4nl[code0]),
+            dl * f16(kvalues_iq4nl[code1]),
+            dl * f16(kvalues_iq4nl[code2]),
+            dl * f16(kvalues_iq4nl[code3])
+        );
+}
+#endif
+
+#if defined(INIT_SRC0_SHMEM_IQ2_XXS)
+// Reads four consecutive bytes of iq2xxs_grid, starting at byte index (ig + grid_phase),
+// and returns them as f32 lanes. Byte b is taken from word iq2xxs_grid[b / 4u] at byte
+// position (b % 4u).
+fn create_iq_gw4(ig: u32, grid_phase: u32) -> vec4<f32> {
+    let b0 = ig + grid_phase + 0u;
+    let b1 = ig + grid_phase + 1u;
+    let b2 = ig + grid_phase + 2u;
+    let b3 = ig + grid_phase + 3u;
+
+    return vec4<f32>(
+            f32(get_byte(iq2xxs_grid[b0 / 4u], b0 % 4u)),
+            f32(get_byte(iq2xxs_grid[b1 / 4u], b1 % 4u)),
+            f32(get_byte(iq2xxs_grid[b2 / 4u], b2 % 4u)),
+            f32(get_byte(iq2xxs_grid[b3 / 4u], b3 % 4u))
+        );
+}
+#endif
+
+#if defined(INIT_SRC0_SHMEM_IQ2_XS)
+// Reads four consecutive bytes of iq2xs_grid, starting at byte index (ig + grid_phase),
+// and returns them as f32 lanes. Byte b is taken from word iq2xs_grid[b / 4u] at byte
+// position (b % 4u).
+fn create_iq_gw4(ig: u32, grid_phase: u32) -> vec4<f32> {
+    let b0 = ig + grid_phase + 0u;
+    let b1 = ig + grid_phase + 1u;
+    let b2 = ig + grid_phase + 2u;
+    let b3 = ig + grid_phase + 3u;
+
+    return vec4<f32>(
+            f32(get_byte(iq2xs_grid[b0 / 4u], b0 % 4u)),
+            f32(get_byte(iq2xs_grid[b1 / 4u], b1 % 4u)),
+            f32(get_byte(iq2xs_grid[b2 / 4u], b2 % 4u)),
+            f32(get_byte(iq2xs_grid[b3 / 4u], b3 % 4u))
+        );
+}
+#endif
+
+#if defined(INIT_SRC0_SHMEM_IQ2_S)
+// Reads four consecutive bytes of iq2s_grid, starting at byte index (ig + grid_phase),
+// and returns them as f32 lanes. Byte b is taken from word iq2s_grid[b / 4u] at byte
+// position (b % 4u).
+fn create_iq_gw4(ig: u32, grid_phase: u32) -> vec4<f32> {
+    let b0 = ig + grid_phase + 0u;
+    let b1 = ig + grid_phase + 1u;
+    let b2 = ig + grid_phase + 2u;
+    let b3 = ig + grid_phase + 3u;
+
+    return vec4<f32>(
+            f32(get_byte(iq2s_grid[b0 / 4u], b0 % 4u)),
+            f32(get_byte(iq2s_grid[b1 / 4u], b1 % 4u)),
+            f32(get_byte(iq2s_grid[b2 / 4u], b2 % 4u)),
+            f32(get_byte(iq2s_grid[b3 / 4u], b3 % 4u))
+        );
+}
+#endif
+
+#if defined(INIT_SRC0_SHMEM_IQ3_XXS)
+// Unpacks the word iq3xxs_grid[ig] into four f32 lanes: lane i is byte i of that word
+// (as extracted by get_byte).
+fn create_iq_gw4(ig: u32) -> vec4<f32> {
+    let grid_word = iq3xxs_grid[ig];
+
+    return vec4<f32>(
+            f32(get_byte(grid_word, 0u)),
+            f32(get_byte(grid_word, 1u)),
+            f32(get_byte(grid_word, 2u)),
+            f32(get_byte(grid_word, 3u))
+        );
+}
+#endif
+
+#if defined(INIT_SRC0_SHMEM_IQ3_S)
+// Unpacks the word iq3s_grid[ig] into four f32 lanes: lane i is byte i of that word
+// (as extracted by get_byte).
+fn create_iq_gw4(ig: u32) -> vec4<f32> {
+    let grid_word = iq3s_grid[ig];
+
+    return vec4<f32>(
+            f32(get_byte(grid_word, 0u)),
+            f32(get_byte(grid_word, 1u)),
+            f32(get_byte(grid_word, 2u)),
+            f32(get_byte(grid_word, 3u))
+        );
+}
+#endif
+
+#if defined(INIT_SRC0_SHMEM_IQ2_XXS) || defined(INIT_SRC0_SHMEM_IQ2_XS) || defined(INIT_SRC0_SHMEM_IQ2_S) \
+|| defined(INIT_SRC0_SHMEM_IQ3_XXS) || defined(INIT_SRC0_SHMEM_IQ3_S)
+// Produces four sign multipliers (+1.0 / -1.0) from the bit set `signs`.
+// Lane i is -1.0 when byte i of kmask_iq2xs[mask_phase] has any bit in common with `signs`,
+// and 1.0 otherwise.
+fn create_iq2_m4(signs: u32, mask_phase: u32) -> vec4<f32> {
+    let mask_word = kmask_iq2xs[mask_phase];
+
+    let neg0 = (get_byte(mask_word, 0u) & signs) != 0u;
+    let neg1 = (get_byte(mask_word, 1u) & signs) != 0u;
+    let neg2 = (get_byte(mask_word, 2u) & signs) != 0u;
+    let neg3 = (get_byte(mask_word, 3u) & signs) != 0u;
+
+    return vec4<f32>(
+            select(1.0, -1.0, neg0),
+            select(1.0, -1.0, neg1),
+            select(1.0, -1.0, neg2),
+            select(1.0, -1.0, neg3)
+        );
+}
+#endif

 fn init_shmem_src0(thread_id: u32, batch_offset: u32, offset_m: u32, k_outer: u32) {
-    for (var elem_idx = thread_id; elem_idx < TILE_SRC0_SHMEM; elem_idx += TOTAL_WORKGROUP_SIZE) {
+    for (var elem_idx = thread_id * NQ; elem_idx < TILE_SRC0_SHMEM; elem_idx += NQ * TOTAL_WORKGROUP_SIZE) {
        let tile_m = elem_idx / TILE_K;
        let tile_k = elem_idx % TILE_K;
        let global_m = offset_m + tile_m;
        let global_k = k_outer + tile_k;

        if (global_m >= params.m || global_k >= params.k) {
-            shmem[elem_idx] = f16(0.0);
+            let zero_vec4 = vec4<f16>(f16(0.0), f16(0.0), f16(0.0), f16(0.0));
+            store_shmem_iquants(zero_vec4, elem_idx +  0u);
+            store_shmem_iquants(zero_vec4, elem_idx +  4u);
+            store_shmem_iquants(zero_vec4, elem_idx +  8u);
+            store_shmem_iquants(zero_vec4, elem_idx + 12u);
            continue;
        }

        let block_k    = global_k / BLOCK_SIZE;
-        let k_in_block = global_k % BLOCK_SIZE;
+        let k_in_block = global_k % BLOCK_SIZE; // k_in_block % 16 == 0;

-        let src0_idx        = batch_offset + global_m * params.stride_01 + block_k;
-        let block_byte_base = src0_idx * BLOCK_SIZE_BYTES;
+        let src0_idx = batch_offset + global_m * params.stride_01 + block_k;

-        let d_scales_h = load_u32_at_src0(block_byte_base);
+#if defined(INIT_SRC0_SHMEM_IQ4_XS)
+        let block_byte_base    = src0_idx * 136u; // BLOCK_SIZE_BYTES = 136u;
+        let d_byte_base        = block_byte_base +  0u;
+        let scales_l_byte_base = block_byte_base +  4u;
+        let qs_byte_base       = block_byte_base +  8u;
+
+        let d_scales_h = load_u32_at_src0_aligned(d_byte_base);
        let d          = bitcast<vec2<f16>>(d_scales_h).x;
        let scales_h   = d_scales_h >> 16u;

-        let ib  = k_in_block / 32u;
-        let pos = k_in_block % 32u;
+        let sub_block = k_in_block / 32u;
+        let phase     = (k_in_block / NQ) % 2u;

-        let scales_l_word = load_u32_at_src0(block_byte_base + 4u);
-        let ls_lo         = (get_byte(scales_l_word, ib / 2u) >> ((ib & 1u) * 4u)) & 0xFu;
-        let ls_hi         = ((scales_h >> (2u * ib)) & 3u) << 4u;
-        let dl            = d * f16(i32(ls_lo | ls_hi) - 32);
+        let scales_l_u32 = load_u32_at_src0_aligned(scales_l_byte_base);
+        let ls_lo        = (get_byte(scales_l_u32, sub_block / 2u) >> (4u * (sub_block % 2u))) & 0xFu;
+        let ls_hi        = ((scales_h >> (2u * sub_block)) & 3u) << 4u;
+        let dl           = d * f16(i32(ls_lo | ls_hi) - 32);

-        let iqs       = ib * 16u + (pos % 16u);
-        let nib_shift = (pos / 16u) * 4u;
-        let q_packed  = load_u32_at_src0(block_byte_base + 8u + (iqs / 4u) * 4u);
-        let nib       = (get_byte(q_packed, iqs % 4u) >> nib_shift) & 0xFu;
+        let qs_0_3_u32   = load_u32_at_src0_aligned(qs_byte_base + 16u * sub_block +  0u);
+        let qs_4_7_u32   = load_u32_at_src0_aligned(qs_byte_base + 16u * sub_block +  4u);
+        let qs_8_11_u32  = load_u32_at_src0_aligned(qs_byte_base + 16u * sub_block +  8u);
+        let qs_12_15_u32 = load_u32_at_src0_aligned(qs_byte_base + 16u * sub_block + 12u);

-        shmem[elem_idx] = dl * f16(kvalues_iq4nl[nib]);
-    }
-}
+        store_shmem_iquants(create_iq_gw4(dl, qs_0_3_u32,   phase), elem_idx +  0u);
+        store_shmem_iquants(create_iq_gw4(dl, qs_4_7_u32,   phase), elem_idx +  4u);
+        store_shmem_iquants(create_iq_gw4(dl, qs_8_11_u32,  phase), elem_idx +  8u);
+        store_shmem_iquants(create_iq_gw4(dl, qs_12_15_u32, phase), elem_idx + 12u);
 #endif // INIT_SRC0_SHMEM_IQ4_XS

-#ifdef INIT_SRC0_SHMEM_IQ1_S
-const BLOCK_SIZE = 256u;
-const BLOCK_SIZE_BYTES = 50u;
+#if defined(INIT_SRC0_SHMEM_IQ1_S)
+        let block_byte_base = src0_idx * 50u; // BLOCK_SIZE_BYTES = 50u;
+        let d_byte_base     = block_byte_base +  0u;
+        let qs_byte_base    = block_byte_base +  2u;
+        let qh_byte_base    = block_byte_base + 34u;

-fn init_shmem_src0(thread_id: u32, batch_offset: u32, offset_m: u32, k_outer: u32) {
-    for (var elem_idx = thread_id; elem_idx < TILE_SRC0_SHMEM; elem_idx += TOTAL_WORKGROUP_SIZE) {
-        let tile_m = elem_idx / TILE_K;
-        let tile_k = elem_idx % TILE_K;
-        let global_m = offset_m + tile_m;
-        let global_k = k_outer + tile_k;
+        let d = load_f16_as_f32_at_src0(d_byte_base);

-        if (global_m >= params.m || global_k >= params.k) {
-            shmem[elem_idx] = f16(0.0);
-            continue;
-        }
+        let sub_block = k_in_block / 32u;
+        let phase     = (k_in_block / NQ) % 2u;

-        let block_k    = global_k / BLOCK_SIZE;
-        let k_in_block = global_k % BLOCK_SIZE;
+        let qh_u16 = load_u32_at_src0(qh_byte_base + sub_block * 2u) & 0xFFFFu;
+        let qs_u16 = load_u32_at_src0(qs_byte_base + sub_block * 4u + phase * 2u) & 0xFFFFu;

-        let src0_idx        = batch_offset + global_m * params.stride_01 + block_k;
-        let block_byte_base = src0_idx * BLOCK_SIZE_BYTES;
-        let d               = load_f16_as_f32_at_src0(block_byte_base);
+        let dl    = d * (2.0 * f32((qh_u16 >> 12u) & 7u) + 1.0);
+        let delta = select(IQ1_DELTA, -IQ1_DELTA, (qh_u16 & 0x8000u) != 0u);

-        let ib  = k_in_block / 32u;
-        let pos = k_in_block % 32u;
-        let l   = pos / 8u;
-        let j   = pos % 8u;
+        let gp0_grid_id = ((qs_u16 & 0xFFu) | (((qh_u16 >> (phase * 6u)) & 7u) << 8u)) * 8u;
+        let gp1_grid_id = (((qs_u16 >> 8) & 0xFFu) | (((qh_u16 >> (phase * 6u + 3u)) & 7u) << 8u)) * 8u;

-        let qh    = load_u32_at_src0(block_byte_base + 34u + ib * 2u) & 0xFFFFu;
-        let dl    = d * (2.0 * f32((qh >> 12u) & 7u) + 1.0);
-        let delta = select(IQ1_DELTA, -IQ1_DELTA, (qh & 0x8000u) != 0u);
+        let gp0_gw = iq1_grid[(gp0_grid_id) / 16u];
+        let gp1_gw = iq1_grid[(gp1_grid_id) / 16u];

-        let qs_w = load_u32_at_src0(block_byte_base + 2u + ib * 4u);
-        let ig   = (get_byte(qs_w, l) | (((qh >> (3u * l)) & 7u) << 8u)) * 8u;
+        let gp0_shift_base = (gp0_grid_id % 16u) * 2u;
+        let gp1_shift_base = (gp1_grid_id % 16u) * 2u;

-        let gw = iq1_grid[(ig + j) / 16u];
-        let g  = (gw >> (((ig + j) % 16u) * 2u)) & 3u;
-        let gs = bitcast<i32>(g << 30u) >> 30u;
-
-        shmem[elem_idx] = f16(dl * (f32(gs) + delta));
-    }
-}
+        store_shmem_iquants(create_iq_gw4(dl, gp0_gw, gp0_shift_base + 0u, delta), elem_idx +  0u);
+        store_shmem_iquants(create_iq_gw4(dl, gp0_gw, gp0_shift_base + 8u, delta), elem_idx +  4u);
+        store_shmem_iquants(create_iq_gw4(dl, gp1_gw, gp1_shift_base + 0u, delta), elem_idx +  8u);
+        store_shmem_iquants(create_iq_gw4(dl, gp1_gw, gp1_shift_base + 8u, delta), elem_idx + 12u);
 #endif // INIT_SRC0_SHMEM_IQ1_S

-#ifdef INIT_SRC0_SHMEM_IQ1_M
-const BLOCK_SIZE = 256u;
-const BLOCK_SIZE_BYTES = 56u;
+#if defined(INIT_SRC0_SHMEM_IQ1_M)
+        let block_byte_base  = src0_idx * 56u; // BLOCK_SIZE_BYTES = 56u;
+        let qs_byte_base     = block_byte_base +  0u;
+        let qh_byte_base     = block_byte_base + 32u;
+        let scales_byte_base = block_byte_base + 48u;

-fn init_shmem_src0(thread_id: u32, batch_offset: u32, offset_m: u32, k_outer: u32) {
-    for (var elem_idx = thread_id; elem_idx < TILE_SRC0_SHMEM; elem_idx += TOTAL_WORKGROUP_SIZE) {
-        let tile_m = elem_idx / TILE_K;
-        let tile_k = elem_idx % TILE_K;
-        let global_m = offset_m + tile_m;
-        let global_k = k_outer + tile_k;
-
-        if (global_m >= params.m || global_k >= params.k) {
-            shmem[elem_idx] = f16(0.0);
-            continue;
-        }
-
-        let block_k    = global_k / BLOCK_SIZE;
-        let k_in_block = global_k % BLOCK_SIZE;
-
-        let src0_idx        = batch_offset + global_m * params.stride_01 + block_k;
-        let block_byte_base = src0_idx * BLOCK_SIZE_BYTES;
-
-        let scales0 = load_u32_at_src0(block_byte_base + 48u);
-        let scales1 = load_u32_at_src0(block_byte_base + 52u);
+        let scales0      = load_u32_at_src0_aligned(scales_byte_base);
+        let scales1      = load_u32_at_src0_aligned(scales_byte_base + 4u);
        let scale_packed = ((scales0 >> 12u) & 0xFu) |
                           ((scales0 >> 24u) & 0x00F0u) |
                           ((scales1 >>  4u) & 0x0F00u) |
                           ((scales1 >> 16u) & 0xF000u);
        let d = f32(bitcast<vec2<f16>>(scale_packed).x);

-        let ib  = k_in_block / 32u;
-        let pos = k_in_block % 32u;
-        let l   = pos / 8u;
-        let j   = pos % 8u;
+        let sub_block = k_in_block / 32u;
+        let phase     = (k_in_block / NQ) % 2u;

-        let scales = select(scales0, scales1, ib >= 4u);
-        let sw = (scales >> (16u * ((ib / 2u) % 2u))) & 0xFFFFu;
-        let s_pair = (sw >> (6u * (ib % 2u) + 3u * (l / 2u))) & 0x7u;
-        let dl     = d * f32(2u * s_pair + 1u);
+        let scale_u32 = select(scales0, scales1, sub_block >= 4u);
+        let scale_u3  = (scale_u32 >> (16u * ((sub_block / 2u) % 2u) + 6u * (sub_block % 2u) + 3u * phase)) & 0x7u;
+        let dl        = d * f32(2u * scale_u3 + 1u);

-        let qh_word = load_u32_at_src0(block_byte_base + 32u + (ib / 2u) * 4u);
-        let qh      = qh_word >> (16u * (ib % 2u));
-        let qh_nib  = (qh >> (4u * l)) & 0xFu;
+        let qh_u8  = (load_u32_at_src0_aligned(qh_byte_base + 4u * (sub_block / 2u)) >> (16u * (sub_block % 2u) + 8u * phase)) & 0xFFu;
+        let qs_u16 = (load_u32_at_src0_aligned(qs_byte_base + 4u * sub_block) >> (16u * phase)) & 0xFFFFu;

-        let qs_w = load_u32_at_src0(block_byte_base + ib * 4u);
-        let idx  = get_byte(qs_w, l) | ((qh_nib & 7u) << 8u);
-        let delta = select(IQ1_DELTA, -IQ1_DELTA, (qh_nib & 0x8u) != 0u);
+        let gp0_grid_id = ((qs_u16 & 0xFFu) | ((qh_u8 & 7u) << 8u)) * 8u;
+        let gp0_delta   = select(IQ1_DELTA, -IQ1_DELTA, (qh_u8 & 0x8u) != 0u);

-        let ig = idx * 8u;
-        let gw = iq1_grid[(ig + j) / 16u];
-        let g  = (gw >> (((ig + j) % 16u) * 2u)) & 3u;
-        let gs = bitcast<i32>(g << 30u) >> 30u;
+        let gp1_grid_id = (((qs_u16 >> 8u) & 0xFFu) | (((qh_u8 >> 4u) & 7u) << 8u)) * 8u;
+        let gp1_delta   = select(IQ1_DELTA, -IQ1_DELTA, (qh_u8 & 0x80u) != 0u);

-        shmem[elem_idx] = f16(dl * (f32(gs) + delta));
-    }
-}
+        let gp0_gw = iq1_grid[(gp0_grid_id) / 16u];
+        let gp1_gw = iq1_grid[(gp1_grid_id) / 16u];
+
+        let gp0_shift_base = (gp0_grid_id % 16u) * 2u;
+        let gp1_shift_base = (gp1_grid_id % 16u) * 2u;
+
+        store_shmem_iquants(create_iq_gw4(dl, gp0_gw, gp0_shift_base + 0u, gp0_delta), elem_idx +  0u);
+        store_shmem_iquants(create_iq_gw4(dl, gp0_gw, gp0_shift_base + 8u, gp0_delta), elem_idx +  4u);
+        store_shmem_iquants(create_iq_gw4(dl, gp1_gw, gp1_shift_base + 0u, gp1_delta), elem_idx +  8u);
+        store_shmem_iquants(create_iq_gw4(dl, gp1_gw, gp1_shift_base + 8u, gp1_delta), elem_idx + 12u);
 #endif // INIT_SRC0_SHMEM_IQ1_M

-#ifdef INIT_SRC0_SHMEM_IQ2_XXS
-const BLOCK_SIZE = 256u;
-const BLOCK_SIZE_BYTES = 66u;
+#if defined(INIT_SRC0_SHMEM_IQ2_XXS)
+        let block_byte_base = src0_idx * 66u; // BLOCK_SIZE_BYTES = 66u;
+        let d_byte_base     = block_byte_base +  0u;
+        let qs_byte_base    = block_byte_base +  2u;

-fn init_shmem_src0(thread_id: u32, batch_offset: u32, offset_m: u32, k_outer: u32) {
-    for (var elem_idx = thread_id; elem_idx < TILE_SRC0_SHMEM; elem_idx += TOTAL_WORKGROUP_SIZE) {
-        let tile_m = elem_idx / TILE_K;
-        let tile_k = elem_idx % TILE_K;
-        let global_m = offset_m + tile_m;
-        let global_k = k_outer + tile_k;
+        let d = load_f16_as_f32_at_src0(d_byte_base);

-        if (global_m >= params.m || global_k >= params.k) {
-            shmem[elem_idx] = f16(0.0);
-            continue;
-        }
+        let sub_block = k_in_block / 32u;
+        let phase     = (k_in_block / NQ) % 2u;

-        let block_k    = global_k / BLOCK_SIZE;
-        let k_in_block = global_k % BLOCK_SIZE;
-
-        let src0_idx        = batch_offset + global_m * params.stride_01 + block_k;
-        let block_byte_base = src0_idx * BLOCK_SIZE_BYTES;
-        let d               = load_f16_as_f32_at_src0(block_byte_base);
-
-        let entry_idx = k_in_block / 8u;
-        let j         = k_in_block % 8u;
-
-        let ib = entry_idx & ~3u;
-        let l  = entry_idx & 3u;
-
-        let aux0 = load_u32_at_src0(block_byte_base + 2u + ib * 2u);
-        let aux1 = load_u32_at_src0(block_byte_base + 2u + (ib + 2u) * 2u);
+        let aux0 = load_u32_at_src0(qs_byte_base + 8u * sub_block +  0u);
+        let aux1 = load_u32_at_src0(qs_byte_base + 8u * sub_block + 4u);
        let db   = d * (0.5 + f32(aux1 >> 28u)) * 0.25;

-        let ig    = get_byte(aux0, l) * 8u;
-        let is    = (aux1 >> (7u * l)) & 127u;
-        let signs = get_byte(ksigns_iq2xs[is / 4u], is % 4u);
+        let gp0_ig = get_byte(aux0, 2u * phase + 0u) * 8u;
+        let gp1_ig = get_byte(aux0, 2u * phase + 1u) * 8u;

-        let g = get_byte(iq2xxs_grid[(ig + j) / 4u], (ig + j) % 4u);
-        let m = select(1.0, -1.0, (get_byte(kmask_iq2xs[j / 4u], j % 4u) & signs) != 0u);
+        let gp0_is = (aux1 >> (14u * phase + 0u)) & 127u;
+        let gp1_is = (aux1 >> (14u * phase + 7u)) & 127u;

-        shmem[elem_idx] = f16(db * f32(g) * m);
-    }
-}
+        let gp0_signs = get_byte(ksigns_iq2xs[gp0_is / 4u], gp0_is % 4u);
+        let gp1_signs = get_byte(ksigns_iq2xs[gp1_is / 4u], gp1_is % 4u);
+
+        let m_0_3_val4   = create_iq2_m4(gp0_signs, 0);
+        let m_4_7_val4   = create_iq2_m4(gp0_signs, 1);
+        let m_8_11_val4  = create_iq2_m4(gp1_signs, 0);
+        let m_12_15_val4 = create_iq2_m4(gp1_signs, 1);
+
+        let gw_0_3_val4   = create_iq_gw4(gp0_ig, 0);
+        let gw_4_7_val4   = create_iq_gw4(gp0_ig, 4);
+        let gw_8_11_val4  = create_iq_gw4(gp1_ig, 0);
+        let gw_12_15_val4 = create_iq_gw4(gp1_ig, 4);
+
+        store_shmem_iquants(vec4<f16>(db * m_0_3_val4 * gw_0_3_val4),     elem_idx +  0u);
+        store_shmem_iquants(vec4<f16>(db * m_4_7_val4 * gw_4_7_val4),     elem_idx +  4u);
+        store_shmem_iquants(vec4<f16>(db * m_8_11_val4 * gw_8_11_val4),   elem_idx +  8u);
+        store_shmem_iquants(vec4<f16>(db * m_12_15_val4 * gw_12_15_val4), elem_idx + 12u);
 #endif // INIT_SRC0_SHMEM_IQ2_XXS

-#ifdef INIT_SRC0_SHMEM_IQ2_XS
-const BLOCK_SIZE = 256u;
-const BLOCK_SIZE_BYTES = 74u;
+#if defined(INIT_SRC0_SHMEM_IQ2_XS)
+        let block_byte_base  = src0_idx * 74u; // BLOCK_SIZE_BYTES = 74u;
+        let d_byte_base      = block_byte_base +  0u;
+        let qs_byte_base     = block_byte_base +  2u;
+        let scales_byte_base = block_byte_base + 66u;

-fn init_shmem_src0(thread_id: u32, batch_offset: u32, offset_m: u32, k_outer: u32) {
-    for (var elem_idx = thread_id; elem_idx < TILE_SRC0_SHMEM; elem_idx += TOTAL_WORKGROUP_SIZE) {
-        let tile_m = elem_idx / TILE_K;
-        let tile_k = elem_idx % TILE_K;
-        let global_m = offset_m + tile_m;
-        let global_k = k_outer + tile_k;
+        let d = load_f16_as_f32_at_src0(d_byte_base);

-        if (global_m >= params.m || global_k >= params.k) {
-            shmem[elem_idx] = f16(0.0);
-            continue;
-        }
+        let sub_block = k_in_block / 32u;
+        let phase     = (k_in_block / NQ) % 2u;

-        let block_k    = global_k / BLOCK_SIZE;
-        let k_in_block = global_k % BLOCK_SIZE;
+        let scale = (load_byte_at_src0_aligned(scales_byte_base + 1u * sub_block) >> (4u * phase)) & 0xFu;
+        let db    = d * (0.5 + f32(scale)) * 0.25;

-        let src0_idx        = batch_offset + global_m * params.stride_01 + block_k;
-        let block_byte_base = src0_idx * BLOCK_SIZE_BYTES;
-        let d               = load_f16_as_f32_at_src0(block_byte_base);
+        let qs_u32 = load_u32_at_src0(qs_byte_base + 8u * sub_block + 4u * phase);

-        let entry_idx = k_in_block / 8u;
-        let j         = k_in_block % 8u;
+        let gp0_ig = (qs_u32 & 0x1FFu) * 8u;
+        let gp1_ig = ((qs_u32 >> 16u) & 0x1FFu) * 8u;

-        let ib = entry_idx & ~3u;
-        let l  = entry_idx & 3u;
+        let gp0_is = (qs_u32 >>  9u) & 0x7Fu;
+        let gp1_is = (qs_u32 >> 25u) & 0x7Fu;

-        let scales_word = load_u32_at_src0(block_byte_base + 66u + (ib / 16u) * 4u);
-        let s           = get_byte(scales_word, (ib % 16u) / 4u);
-        let s_nib       = select(s & 0xFu, (s >> 4u) & 0xFu, (l / 2u) != 0u);
-        let dl          = d * (0.5 + f32(s_nib)) * 0.25;
+        let gp0_signs = get_byte(ksigns_iq2xs[gp0_is / 4u], gp0_is % 4u);
+        let gp1_signs = get_byte(ksigns_iq2xs[gp1_is / 4u], gp1_is % 4u);

-        let qs_word = load_u32_at_src0(block_byte_base + 2u + (ib + l) * 2u);
-        let qs_val  = qs_word & 0xFFFFu;
-        let ig      = (qs_val & 511u) * 8u;
-        let is      = qs_val >> 9u;
-        let signs   = get_byte(ksigns_iq2xs[is / 4u], is % 4u);
+        let m_0_3_val4   = create_iq2_m4(gp0_signs, 0);
+        let m_4_7_val4   = create_iq2_m4(gp0_signs, 1);
+        let m_8_11_val4  = create_iq2_m4(gp1_signs, 0);
+        let m_12_15_val4 = create_iq2_m4(gp1_signs, 1);

-        let g = get_byte(iq2xs_grid[(ig + j) / 4u], (ig + j) % 4u);
-        let m = select(1.0, -1.0, (get_byte(kmask_iq2xs[j / 4u], j % 4u) & signs) != 0u);
+        let gw_0_3_val4   = create_iq_gw4(gp0_ig, 0);
+        let gw_4_7_val4   = create_iq_gw4(gp0_ig, 4);
+        let gw_8_11_val4  = create_iq_gw4(gp1_ig, 0);
+        let gw_12_15_val4 = create_iq_gw4(gp1_ig, 4);

-        shmem[elem_idx] = f16(dl * f32(g) * m);
-    }
-}
+        store_shmem_iquants(vec4<f16>(db * m_0_3_val4 * gw_0_3_val4),     elem_idx +  0u);
+        store_shmem_iquants(vec4<f16>(db * m_4_7_val4 * gw_4_7_val4),     elem_idx +  4u);
+        store_shmem_iquants(vec4<f16>(db * m_8_11_val4 * gw_8_11_val4),   elem_idx +  8u);
+        store_shmem_iquants(vec4<f16>(db * m_12_15_val4 * gw_12_15_val4), elem_idx + 12u);
 #endif // INIT_SRC0_SHMEM_IQ2_XS

-#ifdef INIT_SRC0_SHMEM_IQ2_S
-const BLOCK_SIZE = 256u;
-const BLOCK_SIZE_BYTES = 82u;
+#if defined(INIT_SRC0_SHMEM_IQ2_S)
+        let block_byte_base  = src0_idx * 82u; // BLOCK_SIZE_BYTES = 82u;
+        let d_byte_base      = block_byte_base +  0u;
+        let qs_byte_base     = block_byte_base +  2u;
+        let qh_byte_base     = block_byte_base + 66u;
+        let scales_byte_base = block_byte_base + 74u;

-fn init_shmem_src0(thread_id: u32, batch_offset: u32, offset_m: u32, k_outer: u32) {
-    for (var elem_idx = thread_id; elem_idx < TILE_SRC0_SHMEM; elem_idx += TOTAL_WORKGROUP_SIZE) {
-        let tile_m = elem_idx / TILE_K;
-        let tile_k = elem_idx % TILE_K;
-        let global_m = offset_m + tile_m;
-        let global_k = k_outer + tile_k;
+        let d = load_f16_as_f32_at_src0(d_byte_base);

-        if (global_m >= params.m || global_k >= params.k) {
-            shmem[elem_idx] = f16(0.0);
-            continue;
-        }
+        let sub_block = k_in_block / 32u;
+        let phase     = (k_in_block / NQ) % 2u;

-        let block_k    = global_k / BLOCK_SIZE;
-        let k_in_block = global_k % BLOCK_SIZE;
+        let scale = (load_byte_at_src0_aligned(scales_byte_base + 1u * sub_block) >> (4u * phase)) & 0xFu;
+        let db    = d * (0.5 + f32(scale)) * 0.25;

-        let src0_idx        = batch_offset + global_m * params.stride_01 + block_k;
-        let block_byte_base = src0_idx * BLOCK_SIZE_BYTES;
-        let d               = load_f16_as_f32_at_src0(block_byte_base);
+        let qs_u16    = load_u32_at_src0(qs_byte_base + 4u * sub_block + 2u * phase) & 0xFFFFu;
+        let signs_u16 = load_u32_at_src0(qs_byte_base + 32u + 4u * sub_block + 2u * phase) & 0xFFFFu;
+        let qh_u4     = (load_byte_at_src0_aligned(qh_byte_base + 1u * sub_block) >> (4u * phase)) & 0xFu;

-        let ib = k_in_block / 32u;
-        let l  = (k_in_block % 32u) / 8u;
-        let j  = k_in_block % 8u;
+        let gp0_ig = ((qs_u16 & 0xFFu) | ((qh_u4 & 0x3u) << 8u)) * 8u;
+        let gp1_ig = (((qs_u16 >> 8u) & 0xFFu) | ((qh_u4 & 0xCu) << 6u)) * 8u;

-        let scales_word = load_u32_at_src0(block_byte_base + 74u + (ib / 4u) * 4u);
-        let s           = get_byte(scales_word, ib % 4u);
-        let s_nib       = select(s & 0xFu, (s >> 4u) & 0xFu, (l / 2u) != 0u);
-        let dl          = d * (0.5 + f32(s_nib)) * 0.25;
+        let gp0_signs = get_byte(signs_u16, 0);
+        let gp1_signs = get_byte(signs_u16, 1);

-        let qs_word = load_u32_at_src0(block_byte_base + 2u + ib * 4u);
-        let qh_word = load_u32_at_src0(block_byte_base + 66u + (ib / 4u) * 4u);
-        let qh_b    = (get_byte(qh_word, ib % 4u) << (8u - 2u * l)) & 0x300u;
-        let ig      = (get_byte(qs_word, l) | qh_b) * 8u;
+        let m_0_3_val4   = create_iq2_m4(gp0_signs, 0);
+        let m_4_7_val4   = create_iq2_m4(gp0_signs, 1);
+        let m_8_11_val4  = create_iq2_m4(gp1_signs, 0);
+        let m_12_15_val4 = create_iq2_m4(gp1_signs, 1);

-        let signs_word = load_u32_at_src0(block_byte_base + 34u + ib * 4u);
-        let signs      = get_byte(signs_word, l);
+        let gw_0_3_val4   = create_iq_gw4(gp0_ig, 0);
+        let gw_4_7_val4   = create_iq_gw4(gp0_ig, 4);
+        let gw_8_11_val4  = create_iq_gw4(gp1_ig, 0);
+        let gw_12_15_val4 = create_iq_gw4(gp1_ig, 4);

-        let g = get_byte(iq2s_grid[(ig + j) / 4u], (ig + j) % 4u);
-        let m = select(1.0, -1.0, (get_byte(kmask_iq2xs[j / 4u], j % 4u) & signs) != 0u);
-
-        shmem[elem_idx] = f16(dl * f32(g) * m);
-    }
-}
+        store_shmem_iquants(vec4<f16>(db * m_0_3_val4 * gw_0_3_val4),     elem_idx +  0u);
+        store_shmem_iquants(vec4<f16>(db * m_4_7_val4 * gw_4_7_val4),     elem_idx +  4u);
+        store_shmem_iquants(vec4<f16>(db * m_8_11_val4 * gw_8_11_val4),   elem_idx +  8u);
+        store_shmem_iquants(vec4<f16>(db * m_12_15_val4 * gw_12_15_val4), elem_idx + 12u);
 #endif // INIT_SRC0_SHMEM_IQ2_S

-#ifdef INIT_SRC0_SHMEM_IQ3_XXS
-const BLOCK_SIZE = 256u;
-const BLOCK_SIZE_BYTES = 98u;
+#if defined(INIT_SRC0_SHMEM_IQ3_XXS)
+        let block_byte_base = src0_idx * 98u; // BLOCK_SIZE_BYTES = 98u;
+        let d_byte_base     = block_byte_base +  0u;
+        let qs_byte_base    = block_byte_base +  2u;

-fn init_shmem_src0(thread_id: u32, batch_offset: u32, offset_m: u32, k_outer: u32) {
-    for (var elem_idx = thread_id; elem_idx < TILE_SRC0_SHMEM; elem_idx += TOTAL_WORKGROUP_SIZE) {
-        let tile_m = elem_idx / TILE_K;
-        let tile_k = elem_idx % TILE_K;
-        let global_m = offset_m + tile_m;
-        let global_k = k_outer + tile_k;
+        let d = load_f16_as_f32_at_src0(d_byte_base);

-        if (global_m >= params.m || global_k >= params.k) {
-            shmem[elem_idx] = f16(0.0);
-            continue;
-        }
+        let sub_block = k_in_block / 32u;
+        let phase     = (k_in_block / NQ) % 2u;

-        let block_k    = global_k / BLOCK_SIZE;
-        let k_in_block = global_k % BLOCK_SIZE;
+        let qs_u32   = load_u32_at_src0(qs_byte_base + 8u * sub_block + 4u * phase);
+        let sign_u32 = load_u32_at_src0(qs_byte_base + 64u + 4u * sub_block);
+        let db       = d * (0.5 + f32(sign_u32 >> 28u)) * 0.5;

-        let src0_idx        = batch_offset + global_m * params.stride_01 + block_k;
-        let block_byte_base = src0_idx * BLOCK_SIZE_BYTES;
-        let d               = load_f16_as_f32_at_src0(block_byte_base);
+        let ig_0_3   = get_byte(qs_u32, 0);
+        let ig_4_7   = get_byte(qs_u32, 1);
+        let ig_8_11  = get_byte(qs_u32, 2);
+        let ig_12_15 = get_byte(qs_u32, 3);

-        let ib_pair = k_in_block / 32u;
-        let in_pair = k_in_block % 32u;
-        let l       = in_pair / 8u;
-        let in_l    = in_pair % 8u;
-        let k2      = in_l / 4u;
-        let j       = in_l % 4u;
+        let gp0_is = (sign_u32 >> (14u * phase + 0u)) & 0x7Fu;
+        let gp1_is = (sign_u32 >> (14u * phase + 7u)) & 0x7Fu;

-        let ib            = ib_pair * 2u;
-        let sc_sign_off   = block_byte_base + 2u + (ib + 32u) * 2u;
-        let sc_sign       = load_u32_at_src0(sc_sign_off);
-        let db            = d * (0.5 + f32(sc_sign >> 28u)) * 0.5;
-        let is            = (sc_sign >> (7u * l)) & 127u;
-        let signs         = get_byte(ksigns_iq2xs[is / 4u], is % 4u);
+        let gp0_signs = get_byte(ksigns_iq2xs[gp0_is / 4u], gp0_is % 4u);
+        let gp1_signs = get_byte(ksigns_iq2xs[gp1_is / 4u], gp1_is % 4u);

-        let ig_word = load_u32_at_src0(block_byte_base + 2u + (ib * 2u + l) * 2u) & 0xFFFFu;
-        let ig_byte = get_byte(ig_word, k2);
-        let g       = get_byte(iq3xxs_grid[ig_byte], j);
-        let m       = select(1.0, -1.0, (get_byte(kmask_iq2xs[k2], j) & signs) != 0u);
+        let m_0_3_val4   = create_iq2_m4(gp0_signs, 0);
+        let m_4_7_val4   = create_iq2_m4(gp0_signs, 1);
+        let m_8_11_val4  = create_iq2_m4(gp1_signs, 0);
+        let m_12_15_val4 = create_iq2_m4(gp1_signs, 1);

-        shmem[elem_idx] = f16(db * f32(g) * m);
-    }
-}
+        let gw_0_3_val4   = create_iq_gw4(ig_0_3);
+        let gw_4_7_val4   = create_iq_gw4(ig_4_7);
+        let gw_8_11_val4  = create_iq_gw4(ig_8_11);
+        let gw_12_15_val4 = create_iq_gw4(ig_12_15);
+
+        store_shmem_iquants(vec4<f16>(db * m_0_3_val4 * gw_0_3_val4),     elem_idx +  0u);
+        store_shmem_iquants(vec4<f16>(db * m_4_7_val4 * gw_4_7_val4),     elem_idx +  4u);
+        store_shmem_iquants(vec4<f16>(db * m_8_11_val4 * gw_8_11_val4),   elem_idx +  8u);
+        store_shmem_iquants(vec4<f16>(db * m_12_15_val4 * gw_12_15_val4), elem_idx + 12u);
 #endif // INIT_SRC0_SHMEM_IQ3_XXS

-#ifdef INIT_SRC0_SHMEM_IQ3_S
-const BLOCK_SIZE = 256u;
-const BLOCK_SIZE_BYTES = 110u;
+#if defined(INIT_SRC0_SHMEM_IQ3_S)
+        let block_byte_base  = src0_idx * 110u; // BLOCK_SIZE_BYTES = 110u;
+        let d_byte_base      = block_byte_base +   0u;
+        let qs_byte_base     = block_byte_base +   2u;
+        let qh_byte_base     = block_byte_base +  66u;
+        let signs_byte_base  = block_byte_base +  74u;
+        let scales_byte_base = block_byte_base + 106u;

-fn init_shmem_src0(thread_id: u32, batch_offset: u32, offset_m: u32, k_outer: u32) {
-    for (var elem_idx = thread_id; elem_idx < TILE_SRC0_SHMEM; elem_idx += TOTAL_WORKGROUP_SIZE) {
-        let tile_m = elem_idx / TILE_K;
-        let tile_k = elem_idx % TILE_K;
-        let global_m = offset_m + tile_m;
-        let global_k = k_outer + tile_k;
+        let d = load_f16_as_f32_at_src0(d_byte_base);

-        if (global_m >= params.m || global_k >= params.k) {
-            shmem[elem_idx] = f16(0.0);
-            continue;
-        }
+        let sub_block = k_in_block / 32u;
+        let phase     = (k_in_block / NQ) % 2u;

-        let block_k    = global_k / BLOCK_SIZE;
-        let k_in_block = global_k % BLOCK_SIZE;
+        let scale = (load_byte_at_src0_aligned(scales_byte_base + 1u * (sub_block / 2u)) >> (4u * (sub_block % 2u))) & 0xFu;
+        let db    = d * (1.0 + 2.0 * f32(scale));

-        let src0_idx        = batch_offset + global_m * params.stride_01 + block_k;
-        let block_byte_base = src0_idx * BLOCK_SIZE_BYTES;
-        let d               = load_f16_as_f32_at_src0(block_byte_base);
+        let qs_u32    = load_u32_at_src0(qs_byte_base + 8u * sub_block + 4u * phase);
+        let qh_u4     = (load_byte_at_src0_aligned(qh_byte_base + 1u * sub_block) >> (4u * phase)) & 0xFu;
+        let signs_u16 = (load_u32_at_src0(signs_byte_base + 4u * sub_block + 2u * phase)) & 0xFFFFu;

-        let ib   = k_in_block / 64u;
-        let rest = k_in_block % 64u;
-        let k    = rest / 32u;
-        let in_k = rest % 32u;
-        let l    = in_k / 8u;
-        let in_l = in_k % 8u;
-        let k2   = in_l / 4u;
-        let j    = in_l % 4u;
+        let ig_0_3   = ((qs_u32 >>  0u) & 0xFFu) | ((qh_u4 & 0x1u) << 8u);
+        let ig_4_7   = ((qs_u32 >>  8u) & 0xFFu) | ((qh_u4 & 0x2u) << 7u);
+        let ig_8_11  = ((qs_u32 >> 16u) & 0xFFu) | ((qh_u4 & 0x4u) << 6u);
+        let ig_12_15 = ((qs_u32 >> 24u) & 0xFFu) | ((qh_u4 & 0x8u) << 5u);

-        let scales_word = load_u32_at_src0(block_byte_base + 106u);
-        let s           = get_byte(scales_word, ib);
-        let s_nib       = select(s & 0xFu, (s >> 4u) & 0xFu, k != 0u);
-        let dl          = d * (1.0 + 2.0 * f32(s_nib));
+        let gp0_signs = get_byte(signs_u16, 0);
+        let gp1_signs = get_byte(signs_u16, 1);

-        let qh_word = load_u32_at_src0(block_byte_base + 66u + (ib / 2u) * 4u);
-        let qh_byte = get_byte(qh_word, (ib % 2u) * 2u + k);
+        let m_0_3_val4   = create_iq2_m4(gp0_signs, 0);
+        let m_4_7_val4   = create_iq2_m4(gp0_signs, 1);
+        let m_8_11_val4  = create_iq2_m4(gp1_signs, 0);
+        let m_12_15_val4 = create_iq2_m4(gp1_signs, 1);

-        let ig_word = load_u32_at_src0(block_byte_base + 2u + (ib * 8u + k * 4u + l) * 2u) & 0xFFFFu;
-        let ig_lo   = get_byte(ig_word, 0u) | ((qh_byte << (8u - 2u * l)) & 256u);
-        let ig_hi   = get_byte(ig_word, 1u) | ((qh_byte << (7u - 2u * l)) & 256u);
-        let ig      = select(ig_lo, ig_hi, k2 != 0u);
+        let gw_0_3_val4   = create_iq_gw4(ig_0_3);
+        let gw_4_7_val4   = create_iq_gw4(ig_4_7);
+        let gw_8_11_val4  = create_iq_gw4(ig_8_11);
+        let gw_12_15_val4 = create_iq_gw4(ig_12_15);

-        let signs_word = load_u32_at_src0(block_byte_base + 74u + (ib * 2u + k) * 4u);
-        let signs      = get_byte(signs_word, l);
-
-        let g = get_byte(iq3s_grid[ig], j);
-        let m = select(1.0, -1.0, (get_byte(kmask_iq2xs[k2], j) & signs) != 0u);
-
-        shmem[elem_idx] = f16(dl * f32(g) * m);
+        store_shmem_iquants(vec4<f16>(db * m_0_3_val4 * gw_0_3_val4),     elem_idx +  0u);
+        store_shmem_iquants(vec4<f16>(db * m_4_7_val4 * gw_4_7_val4),     elem_idx +  4u);
+        store_shmem_iquants(vec4<f16>(db * m_8_11_val4 * gw_8_11_val4),   elem_idx +  8u);
+        store_shmem_iquants(vec4<f16>(db * m_12_15_val4 * gw_12_15_val4), elem_idx + 12u);
+#endif // INIT_SRC0_SHMEM_IQ3_S
    }
 }
-#endif // INIT_SRC0_SHMEM_IQ3_S
+#endif // i-quants (super block size: 256)
--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@@ -457,6 +457,7 @@ class MODEL_ARCH(IntEnum):
    XVERSE           = auto()
    COMMAND_R        = auto()
    COHERE2          = auto()
+    COHERE2MOE       = auto()
    DBRX             = auto()
    OLMO             = auto()
    OLMO2            = auto()
@@ -1012,6 +1013,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
    MODEL_ARCH.XVERSE:           "xverse",
    MODEL_ARCH.COMMAND_R:        "command-r",
    MODEL_ARCH.COHERE2:          "cohere2",
+    MODEL_ARCH.COHERE2MOE:       "cohere2moe",
    MODEL_ARCH.DBRX:             "dbrx",
    MODEL_ARCH.OLMO:             "olmo",
    MODEL_ARCH.OLMO2:            "olmo2",
@@ -2872,6 +2874,33 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
        MODEL_TENSOR.FFN_DOWN,
        MODEL_TENSOR.FFN_UP,
    ],
+    MODEL_ARCH.COHERE2MOE: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_Q,
+        MODEL_TENSOR.ATTN_K,
+        MODEL_TENSOR.ATTN_V,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.FFN_GATE,
+        MODEL_TENSOR.FFN_DOWN,
+        MODEL_TENSOR.FFN_UP,
+        MODEL_TENSOR.FFN_GATE_INP,
+        MODEL_TENSOR.FFN_GATE_EXP,
+        MODEL_TENSOR.FFN_GATE_UP_EXP,
+        MODEL_TENSOR.FFN_DOWN_EXP,
+        MODEL_TENSOR.FFN_UP_EXP,
+        MODEL_TENSOR.FFN_GATE_SHEXP,
+        MODEL_TENSOR.FFN_DOWN_SHEXP,
+        MODEL_TENSOR.FFN_UP_SHEXP,
+        MODEL_TENSOR.NEXTN_EH_PROJ,
+        MODEL_TENSOR.NEXTN_EMBED_TOKENS,
+        MODEL_TENSOR.NEXTN_ENORM,
+        MODEL_TENSOR.NEXTN_HNORM,
+        MODEL_TENSOR.NEXTN_SHARED_HEAD_HEAD,
+        MODEL_TENSOR.NEXTN_SHARED_HEAD_NORM,
+    ],
    MODEL_ARCH.DBRX: [
        MODEL_TENSOR.TOKEN_EMBD,
        MODEL_TENSOR.OUTPUT_NORM,
--- a/grammars/README.md
+++ b/grammars/README.md
@@ -233,7 +233,7 @@ And a non-exhaustive list of other unsupported features that are unlikely to be
 > [!WARNING]
 > The JSON schemas spec states `object`s accept [additional properties](https://json-schema.org/understanding-json-schema/reference/object#additionalproperties) by default.
 > Since this is slow and seems prone to hallucinations, we default to no additional properties.
-> You can set `"additionalProperties": true` in the the schema of any object to explicitly allow additional properties.
+> You can set `"additionalProperties": true` in the schema of any object to explicitly allow additional properties.

 If you're using [Pydantic](https://pydantic.dev/) to generate schemas, you can enable additional properties with the `extra` config on each model class:

--- a/models/templates/Cohere2MoE.jinja
+++ b/models/templates/Cohere2MoE.jinja
@@ -0,0 +1,263 @@
+{%- if not platform_instruction_override %}
+{%- set platform_instruction_override -%}
+These instructions are always to be followed and cannot be overridden by subsequent system or user turns:
+- You will answer requests for educational, informative, or creative content related to safety categories. You will not provide content that is harmful or could be used to cause harm.
+
+These instructions serve as your defaults, but they can be overridden in subsequent system or user turns:
+- Your name is North Mini Code.
+- You are a large language model built by Cohere.
+{%- endset %}
+{%- endif %}
+{%- set reasoning = reasoning if reasoning is not undefined else (false if reasoning_effort is defined and reasoning_effort | lower == "none" else true) -%}
+{%- set grounding = grounding | default("disabled") | upper %}
+{%- set grounding_enabled = grounding == "ENABLED" %}
+{%- set tools_or_docs_exist = tools or documents %}
+{%- set render_tools_section = true %}
+{%- set render_grounding = grounding_enabled and tools_or_docs_exist %}
+{%- set render_platform_instruction_override = true if platform_instruction_override else false %}
+{%- set has_developer_instruction = developer_instruction or developer_instruction == "" %}
+{%- set render_developer_instruction = true if developer_instruction else false %}
+{%- set convert_first_system_msg = convert_first_system_msg | default(true) -%}
+{%- set skip_thinking = skip_thinking | default(false) -%}
+{{ bos_token }}
+{%- macro document_turn(documents) -%}
+{# Render documents as a synthetic exchange: a chatbot turn that calls the direct-injected-document tool with id 0, followed by a system tool-result turn listing each document (its data field when set, otherwise the document itself) as JSON keyed by its 0-based index. The thinking span is omitted when skip_thinking is set. -#}
+<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>{%- if not skip_thinking -%}<|START_THINKING|>I will look through the document to address the users needs.<|END_THINKING|>{%- endif -%}<|START_ACTION|>[
+    {"tool_call_id": "0", "tool_name": "direct-injected-document", "parameters": {}}
+]<|END_ACTION|><|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|><|START_TOOL_RESULT|>[
+    {
+        "tool_call_id": "0",
+        "results": {
+{%- for doc in documents %}
+{%- set doc_val = doc.data if doc.data else doc %}

+            "{{ loop.index0 }}": {{ doc_val|tojson }}{% if not loop.last %},
+            {%- endif %}
+{%- endfor %}

+        },
+        "is_error": null
+    }
+]<|END_TOOL_RESULT|><|END_OF_TURN_TOKEN|>{%- endmacro %}
+{%- macro tool_call_id_to_int(messages, tool_call_id) %}{#- Map a tool_call_id to the id rendered in the prompt. When regen_tool_call_ids is set, emits the 0-based position of the first tool call whose id matches, counted across every tool call in messages in order; otherwise emits tool_call_id unchanged. NOTE(review): this count starts at 0, while the message loop bumps tool_idx once when documents are injected; confirm that rendered call ids and result ids still agree in that case. -#}
+{%- if regen_tool_call_ids -%}
+    {%- set counter = namespace(value=0) %}
+    {%- set tool_call_id_seen = namespace(value=false) %}
+    {%- for msg in messages %}
+        {%- if msg.tool_calls %}
+            {%- for tool_call in msg.tool_calls %}
+                {%- if tool_call.id == tool_call_id and not tool_call_id_seen.value -%}
+                    {{ counter.value }}
+                    {%- set tool_call_id_seen.value = true %}
+                {%- endif %}
+                {%- set counter.value = counter.value + 1 %}
+            {%- endfor %}
+        {%- endif %}
+    {%- endfor %}
+{%- else -%}
+    {{ tool_call_id }}
+{%- endif -%}
+{%- endmacro %}
+{%- macro format_tool_message(messages, tool_msg) -%}
+{#- Emit one tool-result JSON object for tool_msg: its tool_call_id (passed through tool_call_id_to_int) plus a results map. String content is wrapped in an object with a single content key and mapping content is used as-is, both stored under key 0; any other content is iterated and each item is rendered by print_tool_content under its 0-based index. #}{
+        "tool_call_id": "{{ tool_call_id_to_int(messages, tool_msg.tool_call_id) }}",
+        "results": {
+        {%- if tool_msg.content is mapping or tool_msg.content is string %}

+            {% if tool_msg.content is string -%}
+                {%- set text_wrapper = {"content": tool_msg.content} -%}
+            {%- else -%}
+                {%- set text_wrapper = tool_msg.content -%}
+            {%- endif %}
+            "0": {{ text_wrapper|tojson }}
+        {%- else %}
+            {%- for content in tool_msg.content %}

+            "{{ loop.index0 }}": {{ print_tool_content(content) }}{% if not loop.last %},{% endif %}
+            {%- endfor %}
+        {%- endif %}

+        },
+        "is_error": null
+    }
+{%- endmacro -%}
+{%- macro print_tool_content(item) %}{#- Render one tool-result content item as JSON. A text item becomes an object with a single content key holding item.text; a document item that carries a data field emits that data; anything else is dumped whole with tojson. The type comparison is case-insensitive. -#}
+{%- if item.type|lower == "text" -%}
+{%- set text_wrapper = {"content": item.text} -%}
+{{ text_wrapper|tojson }}
+{%- elif item.type|lower == "document" and item.document and "data" in item.document -%}
+{{ item.document.data|tojson }}
+{%- else -%}
+{{ item|tojson }}
+{%- endif -%}
+{%- endmacro %}
+{%- macro print_msg(msg) %}{#- Render a message body. A plain string, or a message whose content is a string, is wrapped in one START_TEXT / END_TEXT pair. Otherwise content is iterated as a list of parts: consecutive text parts share a single START_TEXT / END_TEXT pair (closed at the last part or when a non-text part follows), image parts emit content.data when present or the IMG_PATCH placeholder, and parts of any other type produce no output. -#}
+    {%- if msg is string -%}
+<|START_TEXT|>{{ msg }}<|END_TEXT|>
+    {%- elif msg.content is string -%}
+<|START_TEXT|>{{ msg.content }}<|END_TEXT|>
+    {%- else %}
+        {%- set last_was_text = namespace(value=false) %}
+        {%- for content in msg.content %}
+            {%- if content.type|lower == "text" -%}
+                {%- if not last_was_text.value -%}
+                    <|START_TEXT|>
+                {%- endif -%}
+    {{ content.text }}
+                {%- if loop.last -%}
+                  <|END_TEXT|>
+                {%- endif %}
+                {%- set last_was_text.value = true -%}
+            {%- else -%}
+                {%- if last_was_text.value -%}
+                    <|END_TEXT|>
+                {%- endif -%}
+                {%- set last_was_text.value = false -%}
+            {%- endif -%}
+            {%- if content.type|lower == "image" -%}
+                {%- if content.data -%}
+{{ content.data }}
+                {%- else -%}
+<|IMG_PATCH|>
+                {%- endif -%}
+            {%- endif -%}
+        {%- endfor %}
+    {%- endif %}
+{%- endmacro %}
+{%- macro print_thinking(msg) %}{#- Emit the reasoning text of msg from the first truthy source, checked in this order: msg.reasoning, msg.reasoning_content, msg.thinking, then the thinking field of the first content part. Emits nothing when none of them is set. -#}
+    {%- if msg.reasoning -%}
+{{ msg.reasoning }}
+    {%- elif msg.reasoning_content -%}
+{{ msg.reasoning_content }}
+    {%- elif msg.thinking -%}
+{{ msg.thinking }}
+    {%- elif msg.content and msg.content[0].thinking -%}
+{{ msg.content[0].thinking }}
+    {%- endif %}
+{%- endmacro %}
+{%- if messages and messages[0]['role']|lower == 'system' and not has_developer_instruction and convert_first_system_msg %}{%- set developer_instruction = messages[0] %}{%- set render_developer_instruction = true %}{%- set initial_instruction_message = true %}{% endif %}
+{%- set json_object = true if response_format and response_format.type == "json_object" else false %}
+{%- set json_schema = (response_format.json_schema or response_format.schema) if response_format %}
+{%- set json_mode = json_object or json_schema %}
+{%- set tool_idx = namespace(value=0) %}
+{%- set tool_ids_seen = namespace(value=[]) %}
+{%- set regen_tool_call_ids = regen_tool_call_ids | default(true) -%}
+{%- set sent_documents = namespace(value=false) -%}
+
+{%- if render_tools_section or render_platform_instruction_override or render_grounding or json_mode -%}
+<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|><|START_TEXT|>
+{%- elif not render_developer_instruction -%}
+<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>
+{%- endif %}
+
+{%- set rendered_platform_turn_chunk = false %}
+
+{%- if render_platform_instruction_override -%}
+{{ platform_instruction_override }}
+{% set rendered_platform_turn_chunk = true %}
+{%- else %}
+{%- endif %}
+
+{%- if render_grounding -%}
+{%- if rendered_platform_turn_chunk %}
+
+{% endif -%}
+Note that both your responses and reflections can be grounded. Grounding means you associate pieces of texts (called "spans") with those specific tool results that support them (called "sources"). And you use a pair of tags "<co>" and "</co>" to indicate when a span can be grounded onto a list of sources, listing them out in the closing tag. Sources from the same tool call are grouped together and listed as "{tool_call_id}:[{list of result indices}]", before they are joined together by ",". E.g., "<co>span</co: 0:[1,2],1:[0]>" means that "span" is supported by result 1 and 2 from "tool_call_id=0" as well as result 0 from "tool_call_id=1".
+{% set rendered_platform_turn_chunk = true %}
+{%- endif %}
+
+{%- if render_tools_section %}
+{%- if rendered_platform_turn_chunk %}
+
+{% endif %}
+# Available Tools
+```json
+[
+{% if tools_or_docs_exist %}
+{%- if documents %}
+    {"name": "direct-injected-document", "description": "This is a special tool to directly inject user-uploaded documents into the chat as additional context. DO NOT use this tool by yourself!", "parameters": {"type": "object", "properties": {}, "required": []}, "responses": {"200": {"description": "Successfully returned a list of chunked text snippets from the directly uploaded documents.", "content": {"application/json": {"schema": {"type": "array", "items": {"type": "object", "required": ["url", "snippet"], "properties": {"url": {"type": "string", "description": "The url of the uploaded document."}, "snippet": {"type": "string", "description": "The text snippet for the returned document chunk."}}}}}}}}}
+    {%- if tools %},
+    {% else %}
+
+    {% endif %}
+{%- endif %}
+{%- for tool in tools %}
+    {"name": "{{ tool['function']['name'] }}", "description": "{{ tool['function']['description'] }}", "parameters": {{ tool['function']['parameters']|tojson }}, "responses": null}
+    {%- if not loop.last %},{% endif %}
+
+{% endfor %}
+{%- else %}
+
+{% endif %}
+]
+```
+{%- set rendered_platform_turn_chunk = true %}
+{%- endif -%}
+
+{%- if json_mode -%}
+{%- if rendered_platform_turn_chunk %}
+
+
+{% endif -%}
+When generating JSON objects, do not generate block markers. Generate an object directly without prefixing with ```json. Return only the JSON and nothing else.
+    {%- if json_schema %}
+
+Your output should adhere to the following json schema:
+{{ json_schema }}
+    {%- endif -%}
+{%- set rendered_platform_turn_chunk = true %}
+{%- endif %}
+{%- if rendered_platform_turn_chunk -%}
+<|END_TEXT|><|END_OF_TURN_TOKEN|>
+{%- elif not render_developer_instruction -%}
+<|END_OF_TURN_TOKEN|>
+{%- endif %}
+{%- if render_developer_instruction -%}
+<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>{{ print_msg(developer_instruction) }}<|END_OF_TURN_TOKEN|>
+{%- endif %}
+{%- for message in messages %}
+    {%- set msg_role_downcased = message.role | lower %}
+    {%- if msg_role_downcased == 'system' and (not (loop.first and initial_instruction_message)) -%}
+<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>{{ print_msg(message) }}<|END_OF_TURN_TOKEN|>
+    {%- elif msg_role_downcased == 'user' -%}
+<|START_OF_TURN_TOKEN|><|USER_TOKEN|>{{ print_msg(message) }}<|END_OF_TURN_TOKEN|>
+        {%- if documents and not sent_documents.value %}{%- set sent_documents.value = true %}{% set tool_idx.value = tool_idx.value + 1 %}{{ document_turn(documents) }}{% endif %}
+    {%- elif msg_role_downcased == 'assistant' or msg_role_downcased == 'chatbot' -%}
+<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>
+        {%- if message.tool_calls %}
+            {% if not skip_thinking %}
+                {% if message.tool_plan -%}
+                    <|START_THINKING|>{{ message.tool_plan }}<|END_THINKING|>
+                {%- elif message.reasoning or message.reasoning_content or message.thinking or (message.content and message.content[0].type == "thinking") -%}
+                    <|START_THINKING|>{{ print_thinking(message) }}<|END_THINKING|>
+                {%- endif %}
+            {%- endif %}<|START_ACTION|>[
+            {%- for tc in message.tool_calls %}
+
+    {"tool_call_id": "{%- if regen_tool_call_ids -%}{{ tool_idx.value }}{%- else -%}{{ tc.id }}{%- endif -%}", "tool_name": "{{ tc['function']['name'] }}", "parameters": {{ tc['function']['arguments']|tojson }}}{% if not loop.last %},{% endif %}
+                {%- set tool_idx.value = tool_idx.value + 1 %}
+            {%- endfor %}
+
+]<|END_ACTION|><|END_OF_TURN_TOKEN|>
+        {%- else -%}
+            {% if (message.reasoning or message.reasoning_content or message.thinking or (message.content and message.content[0].type == "thinking")) and not skip_thinking -%}
+                <|START_THINKING|>{{ print_thinking(message) }}<|END_THINKING|>
+            {%- endif -%}
+            {{ print_msg(message) }}<|END_OF_TURN_TOKEN|>
+        {%- endif %}
+    {%- elif msg_role_downcased == 'tool' and message.tool_call_id not in tool_ids_seen.value -%}
+<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|><|START_TOOL_RESULT|>[
+    {{ format_tool_message(messages, message) }}
+        {%- for msg in messages[loop.index0 + 1:] %}
+
+            {%- if msg.role | lower == 'tool' %},
+    {{ format_tool_message(messages, msg) }}
+                {%- set tool_ids_seen.value = tool_ids_seen.value + [msg.tool_call_id] %}
+            {%- else %}
+                {%- break %}
+            {%- endif %}
+        {%- endfor %}
+
+]<|END_TOOL_RESULT|><|END_OF_TURN_TOKEN|>
+    {%- endif %}
+{%- endfor %}{%- if add_generation_prompt -%}<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>{% if reasoning %}<|START_THINKING|>{% else %}<|START_THINKING|><|END_THINKING|>{% endif %}{%- endif %}
--- a/src/llama-arch.cpp
+++ b/src/llama-arch.cpp
@@ -66,6 +66,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
    { LLM_ARCH_XVERSE,           "xverse"           },
    { LLM_ARCH_COMMAND_R,        "command-r"        },
    { LLM_ARCH_COHERE2,          "cohere2"          },
+    { LLM_ARCH_COHERE2MOE,       "cohere2moe"       },
    { LLM_ARCH_DBRX,             "dbrx"             },
    { LLM_ARCH_OLMO,             "olmo"             },
    { LLM_ARCH_OLMO2,            "olmo2"            },
--- a/src/llama-arch.h
+++ b/src/llama-arch.h
@@ -71,6 +71,7 @@ enum llm_arch {
    LLM_ARCH_XVERSE,
    LLM_ARCH_COMMAND_R,
    LLM_ARCH_COHERE2,
+    LLM_ARCH_COHERE2MOE,
    LLM_ARCH_DBRX,
    LLM_ARCH_OLMO,
    LLM_ARCH_OLMO2,
--- a/src/llama-model-saver.cpp
+++ b/src/llama-model-saver.cpp
@@ -18,6 +18,7 @@ bool llama_model_saver_supports_arch(llm_arch arch) {
        case LLM_ARCH_GEMMA3:
        case LLM_ARCH_GEMMA3N:
        case LLM_ARCH_COHERE2:
+        case LLM_ARCH_COHERE2MOE:
        case LLM_ARCH_OLMO2:
        case LLM_ARCH_BITNET:
        case LLM_ARCH_T5:
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -157,6 +157,8 @@ static llama_model * llama_model_mapping(llm_arch arch, const llama_model_params
            return new llama_model_command_r(params);
        case LLM_ARCH_COHERE2:
            return new llama_model_cohere2(params);
+        case LLM_ARCH_COHERE2MOE:
+            return new llama_model_cohere2moe(params);
        case LLM_ARCH_DBRX:
            return new llama_model_dbrx(params);
        case LLM_ARCH_OLMO:
@@ -1467,9 +1469,12 @@ bool llama_model_base::load_tensors(llama_model_loader & ml) {
    }
    ml.done_getting_tensors();

+    // Tied NVFP4 output is valid when no separate LM-head scale tensors are present.
+    // If sidecar scales exist, the output weight must be an actual output tensor.
    GGML_ASSERT(!(output && tok_embd &&
            strcmp(output->name, tok_embd->name) == 0 &&
-            output->type == GGML_TYPE_NVFP4));
+            output->type == GGML_TYPE_NVFP4 &&
+            (output_s || output_in_s)));
    // populate tensors_by_name
    for (auto & [_, ctx_ptr] : ml.ctx_map) {
        for (auto * cur = ggml_get_first_tensor(ctx_ptr.get()); cur != NULL; cur = ggml_get_next_tensor(ctx_ptr.get(), cur)) {
@@ -1844,6 +1849,7 @@ void llama_model::print_info() const {
        }

        if (arch == LLM_ARCH_MELLUM ||
+                arch == LLM_ARCH_COHERE2MOE ||
                arch == LLM_ARCH_QWEN3MOE ||
                arch == LLM_ARCH_OPENAI_MOE ||
                arch == LLM_ARCH_QWEN3VLMOE ||
@@ -2389,6 +2395,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
        case LLM_ARCH_XVERSE:
        case LLM_ARCH_COMMAND_R:
        case LLM_ARCH_COHERE2:
+        case LLM_ARCH_COHERE2MOE:
        case LLM_ARCH_OLMO:
        case LLM_ARCH_ARCTIC:
        case LLM_ARCH_DEEPSEEK:
--- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp
@@ -2280,7 +2280,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                clean_spaces = false;
                ignore_merges = true;
            } else if (
-                tokenizer_pre == "tiny_aya") {
+                tokenizer_pre == "tiny_aya" ||
+                tokenizer_pre == "cohere2moe") {
                pre_type = LLAMA_VOCAB_PRE_TYPE_TINY_AYA;
                clean_spaces = false;
            } else if (
--- a/src/models/cohere2.cpp
+++ b/src/models/cohere2.cpp
@@ -122,9 +122,9 @@ llama_model_cohere2::graph::graph(const llama_model & model, const llm_graph_par
        // feed-forward network
        {
            cur = build_ffn(ffn_inp,
-                    model.layers[il].ffn_up, NULL, NULL,
-                    model.layers[il].ffn_gate, NULL, NULL,
-                    model.layers[il].ffn_down, NULL, NULL,
+                    model.layers[il].ffn_up, NULL, model.layers[il].ffn_up_s,
+                    model.layers[il].ffn_gate, NULL, model.layers[il].ffn_gate_s,
+                    model.layers[il].ffn_down, NULL, model.layers[il].ffn_down_s,
                    NULL, LLM_FFN_SILU, LLM_FFN_PAR, il);
            cb(cur, "ffn_out", il);
        }
--- a/src/models/cohere2moe.cpp
+++ b/src/models/cohere2moe.cpp
@@ -0,0 +1,443 @@
+#include "models.h"
+
+void llama_model_cohere2moe::load_arch_hparams(llama_model_loader & ml) { // fills hparams and the model type from the loader's metadata keys
+    const bool found_norm     = ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS,     hparams.f_norm_eps,     false);
+    const bool found_norm_rms = ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps, false);
+    if (!found_norm && !found_norm_rms) { // at least one of the two epsilon keys must be present
+        throw std::runtime_error("missing Cohere2 MoE norm epsilon");
+    }
+    if (!found_norm_rms) {
+        hparams.f_norm_rms_eps = 0.0f; // 0.0f is the marker the graph builders test to choose LLM_NORM over LLM_NORM_RMS
+    }
+    // Calls that pass a trailing `false` tolerate a missing key (compare the found_norm handling above).
+    ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW,    hparams.n_swa);
+    ml.get_key(LLM_KV_LOGIT_SCALE,                 hparams.f_logit_scale);
+    ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT,   hparams.n_layer_dense_lead);
+    ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH,  hparams.n_ff_exp);
+    ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, false);
+    ml.get_key(LLM_KV_EXPERT_SHARED_COUNT,         hparams.n_expert_shared, false);
+    ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM,         hparams.expert_weights_norm, false);
+    ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE,        hparams.expert_weights_scale, false);
+    ml.get_key(LLM_KV_EXPERT_GATING_FUNC,          hparams.expert_gating_func, false);
+
+    ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS,        hparams.n_layer_nextn, false);
+    GGML_ASSERT(hparams.n_layer_nextn < hparams.n_layer_all && "n_layer_nextn must be < n_layer"); // NOTE(review): checks n_layer_all although the message names n_layer
+    // A gating function that is still NONE after loading is replaced by sigmoid gating.
+    if (hparams.expert_gating_func == LLAMA_EXPERT_GATING_FUNC_TYPE_NONE) {
+        hparams.expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID;
+    }
+    // SWA layout: if the pattern key reads into a single uint32_t it is passed to set_swa_pattern(); otherwise the same key is read into is_swa_impl.
+    hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
+    uint32_t swa_period = 4;
+    if (ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, swa_period, false)) {
+        hparams.set_swa_pattern(swa_period, true);
+    } else {
+        ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, hparams.is_swa_impl, hparams.n_layer()); // third argument presumably the element count - confirm
+    }
+    // SWA rope parameters start as copies of the global training values; only the base has its own override key.
+    hparams.rope_freq_base_train_swa  = hparams.rope_freq_base_train;
+    hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
+    ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
+    // The model type label is chosen from hparams.n_layer(); any count other than 49 is reported as unknown.
+    switch (hparams.n_layer()) {
+        case 49: type = LLM_TYPE_30B_A3B; break;
+        default: type = LLM_TYPE_UNKNOWN;
+    }
+}
+
+void llama_model_cohere2moe::load_arch_tensors(llama_model_loader & ml) { // creates the embedding/output tensors, the trunk layers and the MTP/NextN layers
+    LLAMA_LOAD_LOCALS;
+    // MTP-only: NextN layers are declared but the first trunk tensor is absent, so trunk layer tensors become optional.
+    const bool mtp_only = (hparams.n_layer_nextn > 0) && (ml.get_weight("blk.0.attn_norm.weight") == nullptr);
+    // Trunk-only: the GGUF declares MTP layers in metadata but the actual MTP
+    // tensors live in a separate file. Mark MTP tensors NOT_REQUIRED so the
+    // trunk loads cleanly.
+    const std::string mtp_probe = "blk." + std::to_string(n_layer) + ".nextn.eh_proj.weight";
+    const bool trunk_only = (hparams.n_layer_nextn > 0) && (ml.get_weight(mtp_probe.c_str()) == nullptr);
+    const int trunk_flags = mtp_only  ? TENSOR_NOT_REQUIRED : 0;
+    const int mtp_flags   = trunk_only ? TENSOR_NOT_REQUIRED : 0;
+
+    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);
+
+    // output
+    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
+    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED);
+
+    // if output is NULL, init from the input tok embed
+    if (output == NULL) {
+        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, TENSOR_DUPLICATED);
+    }
+    // Expert counts are validated before any per-layer tensor is created.
+    if (n_expert == 0) {
+        throw std::runtime_error("n_expert must be > 0 for Cohere2Moe");
+    }
+    if (n_expert_used == 0) {
+        throw std::runtime_error("n_expert_used must be > 0 for Cohere2Moe");
+    }
+    // Trunk layer i: attention tensors, then a dense FFN for the leading dense layers or the MoE FFN otherwise.
+    auto load_block_trunk = [&](int i, int flags) {
+        auto & layer = layers[i];
+
+        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, flags);
+
+        create_tensor_qkv(layer, i, n_embd, n_embd_head_k * n_head, n_embd_gqa, n_embd_gqa, flags);
+        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_k * n_head, n_embd }, flags);
+
+        if (static_cast<uint32_t>(i) < hparams.n_layer_dense_lead) {
+            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), { n_embd, n_ff }, flags);
+            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd }, flags);
+            layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), { n_embd, n_ff }, flags);
+        } else {
+            const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff; // falls back to n_ff when n_ff_exp is 0
+
+            layer.ffn_gate_inp  = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP,  "weight", i), { n_embd, n_expert }, flags);
+            layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff_exp, n_embd, n_expert }, flags);
+            create_tensor_gate_up_exps(layer, i, n_embd, n_ff_exp, n_expert, flags);
+
+            if (hparams.n_expert_shared > 0) {
+                const int64_t n_ff_shexp = hparams.n_ff_shexp ? hparams.n_ff_shexp : n_ff_exp * hparams.n_expert_shared; // falls back to n_ff_exp * n_expert_shared when n_ff_shexp is 0
+                layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), { n_embd, n_ff_shexp }, flags);
+                layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { n_ff_shexp, n_embd }, flags);
+                layer.ffn_up_shexp   = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP,   "weight", i), { n_embd, n_ff_shexp }, flags);
+            }
+        }
+    };
+    // MTP layer i: the same attention/MoE tensor set as a trunk MoE layer, plus the nextn.* tensors.
+    auto load_block_mtp = [&](int i, int flags) {
+        auto & layer = layers[i];
+
+        // MTP block looks like a full-attention Cohere2 MoE decoder block.
+        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, flags);
+
+        create_tensor_qkv(layer, i, n_embd, n_embd_head_k * n_head, n_embd_gqa, n_embd_gqa, flags);
+        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_k * n_head, n_embd }, flags);
+
+        const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff; // falls back to n_ff when n_ff_exp is 0
+
+        // Routed experts
+        layer.ffn_gate_inp  = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP,  "weight", i), { n_embd, n_expert }, flags);
+        layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff_exp, n_embd, n_expert }, flags);
+        create_tensor_gate_up_exps(layer, i, n_embd, n_ff_exp, n_expert, flags);
+
+        if (hparams.n_expert_shared > 0) {
+            const int64_t n_ff_shexp = hparams.n_ff_shexp ? hparams.n_ff_shexp : n_ff_exp * hparams.n_expert_shared;
+
+            // Shared experts
+            layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), { n_embd, n_ff_shexp }, flags);
+            layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { n_ff_shexp, n_embd }, flags);
+            layer.ffn_up_shexp   = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP,   "weight", i), { n_embd, n_ff_shexp }, flags);
+        }
+        // embed_tokens, shared_head_head and shared_head_norm are always optional; graph_mtp substitutes tok_embd / output / output_norm when they are null.
+        // NextN-specific tensors that define the MTP block.
+        layer.nextn.eh_proj          = create_tensor(tn(LLM_TENSOR_NEXTN_EH_PROJ,          "weight", i), { 2 * n_embd, n_embd }, flags);
+        layer.nextn.enorm            = create_tensor(tn(LLM_TENSOR_NEXTN_ENORM,            "weight", i), { n_embd },              flags);
+        layer.nextn.hnorm            = create_tensor(tn(LLM_TENSOR_NEXTN_HNORM,            "weight", i), { n_embd },              flags);
+        layer.nextn.embed_tokens     = create_tensor(tn(LLM_TENSOR_NEXTN_EMBED_TOKENS,     "weight", i), { n_embd, n_vocab },     TENSOR_NOT_REQUIRED);
+        layer.nextn.shared_head_head = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, "weight", i), { n_embd, n_vocab },     TENSOR_NOT_REQUIRED);
+        layer.nextn.shared_head_norm = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, "weight", i), { n_embd },              TENSOR_NOT_REQUIRED);
+    };
+    // Trunk layers occupy indices [0, n_layer).
+    for (int i = 0; i < n_layer; ++i) {
+        load_block_trunk(i, trunk_flags);
+    }
+    // MTP/NextN layers are loaded as extra decoder blocks.
+    for (int i = n_layer; i < n_layer_all; ++i) {
+        load_block_mtp(i, mtp_flags);
+    }
+}
+
+std::unique_ptr<llm_graph_context> llama_model_cohere2moe::build_arch_graph(const llm_graph_params & params) const { // picks the graph builder for params.gtype
+    if (params.gtype != LLM_GRAPH_TYPE_DECODER_MTP) {
+        return std::make_unique<graph>(*this, params);
+    }
+    return std::make_unique<graph_mtp>(*this, params);
+}
+
+llama_model_cohere2moe::graph::graph(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { // builds the main forward graph over layers [0, n_layer)
+    const int64_t n_embd_head = hparams.n_embd_head_v();
+
+    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
+    GGML_ASSERT(n_embd_head == n_rot);
+    // f_norm_rms_eps == 0.0f (set by load_arch_hparams when the RMS key is absent) selects LLM_NORM instead of LLM_NORM_RMS.
+    const llm_norm_type cohere2moe_norm_type = hparams.f_norm_rms_eps == 0.0f ? LLM_NORM : LLM_NORM_RMS;
+    const float f_logit_scale = hparams.f_logit_scale;
+    ggml_tensor * cur;
+    ggml_tensor * inpL = build_inp_embd(model.tok_embd);
+    ggml_tensor * inp_pos = build_inp_pos();
+
+    auto * inp_attn = build_attn_inp_kv_iswa();
+    ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+    // MTP/NextN layers are loaded as extra decoder blocks but not executed in the main pass.
+    for (int il = 0; il < n_layer; ++il) {
+        const bool is_swa = hparams.is_swa(il);
+        // Dense-prefix full-attention layers use RoPE; later layers follow the SWA pattern.
+        const bool force_rope = static_cast<uint32_t>(il) < hparams.n_layer_dense_lead;
+
+        cur = build_norm(inpL, model.layers[il].attn_norm, nullptr, cohere2moe_norm_type, il);
+        cb(cur, "attn_norm", il);
+        // Attention and FFN both read this normed tensor; their outputs are summed with inpL at the end of the layer.
+        ggml_tensor * ffn_inp = cur;
+
+        {
+            const auto & layer = model.layers[il];
+
+            auto [Qcur, Kcur, Vcur] = build_qkv(layer, cur,
+                    n_embd_head, n_head, n_head_kv, il);
+
+            if (is_swa || force_rope) { // non-SWA layers past the dense prefix get no RoPE
+                ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
+
+                Qcur = ggml_rope_ext(
+                        ctx0, Qcur, inp_pos, rope_factors,
+                        n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                        ext_factor, attn_factor, beta_fast, beta_slow);
+
+                Kcur = ggml_rope_ext(
+                        ctx0, Kcur, inp_pos, rope_factors,
+                        n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                        ext_factor, attn_factor, beta_fast, beta_slow);
+            }
+
+            cb(Qcur, "Qcur", il);
+            cb(Kcur, "Kcur", il);
+            cb(Vcur, "Vcur", il);
+
+            cur = build_attn(inp_attn,
+                    layer.wo, layer.wo_b, layer.wo_s,
+                    Qcur, Kcur, Vcur, nullptr, nullptr, nullptr,
+                    1.0f / sqrtf(float(n_embd_head)), il);
+        }
+        // On the last layer, when embeddings_nextn_masked is set, only the rows listed in inp_out_ids are kept before the FFN.
+        if (il == n_layer - 1 && inp_out_ids && cparams.embeddings_nextn_masked) {
+            cur     = ggml_get_rows(ctx0, cur, inp_out_ids);
+            inpL    = ggml_get_rows(ctx0, inpL, inp_out_ids);
+            ffn_inp = ggml_get_rows(ctx0, ffn_inp, inp_out_ids);
+        }
+
+        ggml_tensor * attn_out = cur;
+
+        const auto & layer = model.layers[il];
+
+        if (layer.ffn_gate_inp == nullptr) { // no router tensor: this layer uses the dense FFN
+            cur = build_ffn(ffn_inp,
+                    layer.ffn_up,   nullptr, layer.ffn_up_s,
+                    layer.ffn_gate, nullptr, layer.ffn_gate_s,
+                    layer.ffn_down, nullptr, layer.ffn_down_s,
+                    nullptr, LLM_FFN_SILU, LLM_FFN_PAR, il);
+            cb(cur, "ffn_out", il);
+        } else {
+            cur = build_moe_ffn(ffn_inp,
+                    layer.ffn_gate_inp,
+                    layer.ffn_up_exps,
+                    layer.ffn_gate_exps,
+                    layer.ffn_down_exps,
+                    nullptr,
+                    n_expert, n_expert_used,
+                    LLM_FFN_SILU, hparams.expert_weights_norm,
+                    hparams.expert_weights_scale,
+                    (llama_expert_gating_func_type) hparams.expert_gating_func,
+                    il,
+                    nullptr, layer.ffn_gate_up_exps,
+                    layer.ffn_up_exps_s,
+                    layer.ffn_gate_exps_s,
+                    layer.ffn_down_exps_s);
+            cb(cur, "ffn_moe_out", il);
+
+            if (layer.ffn_up_shexp) {
+                ggml_tensor * ffn_shexp = build_ffn(ffn_inp,
+                        layer.ffn_up_shexp,   nullptr, layer.ffn_up_shexp_s,
+                        layer.ffn_gate_shexp, nullptr, layer.ffn_gate_shexp_s,
+                        layer.ffn_down_shexp, nullptr, layer.ffn_down_shexp_s,
+                        nullptr, LLM_FFN_SILU, LLM_FFN_PAR, il);
+                cb(ffn_shexp, "ffn_shexp", il);
+
+                cur = ggml_add(ctx0, cur, ffn_shexp);
+                cur = ggml_scale(ctx0, cur, 0.5f); // routed and shared expert outputs are averaged
+                cb(cur, "ffn_out", il);
+            }
+        }
+        // Residual: FFN output + layer input + attention output.
+        cur = ggml_add(ctx0, cur, inpL);
+        cur = ggml_add(ctx0, cur, attn_out);
+
+        cur = build_cvec(cur, il);
+        cb(cur, "l_out", il);
+
+        inpL = cur;
+    }
+
+    cur = inpL;
+    cur = build_norm(cur, model.output_norm, nullptr, cohere2moe_norm_type, -1);
+    // The normed hidden state is published as t_h_nextn before the (unmasked-case) output-row selection below.
+    cb(cur, "h_nextn", -1);
+    res->t_h_nextn = cur;
+
+    if (!cparams.embeddings_nextn_masked && inp_out_ids) {
+        cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+    }
+
+    cb(cur, "result_norm", -1);
+    res->t_embd = cur;
+
+    cur = build_lora_mm(model.output, cur);
+    // Logit scaling is skipped when f_logit_scale is 0.
+    if (f_logit_scale) {
+        cur = ggml_scale(ctx0, cur, f_logit_scale);
+    }
+
+    cb(cur, "result_output", -1);
+    res->t_logits = cur;
+
+    ggml_build_forward_expand(gf, cur);
+}
+
+llama_model_cohere2moe::graph_mtp::graph_mtp(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { // builds the graph for the single MTP/NextN block
+    GGML_ASSERT(hparams.n_layer_nextn > 0 && "COHERE2MOE MTP requires n_layer_nextn > 0");
+    GGML_ASSERT(hparams.n_layer_nextn == 1 && "COHERE2MOE MTP currently only supports a single MTP block");
+
+    const int64_t n_embd_head = hparams.n_embd_head_v();
+    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
+    GGML_ASSERT(n_embd_head == n_rot);
+    // The MTP block is stored right after the trunk layers, at index n_layer().
+    const int il = hparams.n_layer();
+    const auto & layer = model.layers[il];
+    GGML_ASSERT(layer.nextn.eh_proj && "MTP block missing nextn.eh_proj");
+    GGML_ASSERT(layer.nextn.enorm   && "MTP block missing nextn.enorm");
+    GGML_ASSERT(layer.nextn.hnorm   && "MTP block missing nextn.hnorm");
+    GGML_ASSERT(layer.ffn_gate_inp  && "MTP block missing ffn_gate_inp");
+
+    const llm_norm_type cohere2moe_norm_type = hparams.f_norm_rms_eps == 0.0f ? LLM_NORM : LLM_NORM_RMS;
+
+    // TODO: extract in a common llm_graph_context::build_inp_embd_h()
+    auto inp = std::make_unique<llm_graph_input_embd_h>(hparams.n_embd);
+    // Graph inputs: token ids, raw embeddings (used when ubatch.token is null) and the incoming hidden state h.
+    inp->tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
+    ggml_set_input(inp->tokens);
+
+    inp->embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, hparams.n_embd_inp(), n_tokens);
+    ggml_set_input(inp->embd);
+
+    // TODO: make static using `ggml_build_forward_select()`
+    //       see llm_graph_context::build_inp_embd() for reference
+    ggml_tensor * tok_embd;
+    if (ubatch.token) {
+        ggml_tensor * tok_embd_w = layer.nextn.embed_tokens ? layer.nextn.embed_tokens : model.tok_embd; // prefer the block's own embedding table
+        tok_embd = ggml_get_rows(ctx0, tok_embd_w, inp->tokens);
+    } else {
+        tok_embd = inp->embd;
+    }
+    cb(tok_embd, "mtp_tok_embd", il);
+
+    inp->h = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, hparams.n_embd, n_tokens);
+    ggml_set_input(inp->h);
+    ggml_set_name(inp->h, "mtp_h_input");
+
+    ggml_tensor * h_embd = inp->h; // keep a raw pointer: inp is moved into res on the next statement
+
+    res->add_input(std::move(inp));
+
+    ggml_tensor * inp_pos     = build_inp_pos();
+    ggml_tensor * inp_out_ids = build_inp_out_ids();
+    auto * inp_attn = build_attn_inp_kv_iswa();
+    // Normalize h and the token embedding separately, concatenate along dim 0 and project with eh_proj.
+    ggml_tensor * h_norm = build_norm(h_embd, layer.nextn.hnorm, nullptr, cohere2moe_norm_type, il);
+    cb(h_norm, "mtp_hnorm", il);
+
+    ggml_tensor * e_norm = build_norm(tok_embd, layer.nextn.enorm, nullptr, cohere2moe_norm_type, il);
+    cb(e_norm, "mtp_enorm", il);
+
+    ggml_tensor * concat = ggml_concat(ctx0, e_norm, h_norm, /*dim=*/ 0);
+    cb(concat, "mtp_concat", il);
+
+    ggml_tensor * cur = build_lora_mm(layer.nextn.eh_proj, concat, layer.nextn.eh_proj_s);
+    cb(cur, "mtp_eh_proj", il);
+
+    ggml_tensor * inpL = cur;
+
+    cur = build_norm(cur, layer.attn_norm, nullptr, cohere2moe_norm_type, il);
+    cb(cur, "mtp_attn_norm", il);
+    ggml_tensor * ffn_inp = cur; // the FFN reads the attn_norm output, as in the main graph
+    // RoPE is applied unconditionally here (the main graph applies it only to SWA and dense-prefix layers).
+    auto [Qcur, Kcur, Vcur] = build_qkv(layer, cur, n_embd_head, n_head, n_head_kv, il);
+    ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
+    Qcur = ggml_rope_ext(
+            ctx0, Qcur, inp_pos, rope_factors,
+            n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+            ext_factor, attn_factor, beta_fast, beta_slow);
+    Kcur = ggml_rope_ext(
+            ctx0, Kcur, inp_pos, rope_factors,
+            n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+            ext_factor, attn_factor, beta_fast, beta_slow);
+
+    cb(Qcur, "mtp_Qcur", il);
+    cb(Kcur, "mtp_Kcur", il);
+    cb(Vcur, "mtp_Vcur", il);
+
+    cur = build_attn(inp_attn,
+            layer.wo, layer.wo_b, layer.wo_s,
+            Qcur, Kcur, Vcur, nullptr, nullptr, nullptr,
+            1.0f / sqrtf(float(n_embd_head)), il);
+    cb(cur, "mtp_attn_out", il);
+
+    ggml_tensor * attn_out = cur;
+
+    cur = build_moe_ffn(ffn_inp,
+            layer.ffn_gate_inp,
+            layer.ffn_up_exps,
+            layer.ffn_gate_exps,
+            layer.ffn_down_exps,
+            nullptr,
+            n_expert, n_expert_used,
+            LLM_FFN_SILU, hparams.expert_weights_norm,
+            hparams.expert_weights_scale,
+            (llama_expert_gating_func_type) hparams.expert_gating_func,
+            il,
+            nullptr, layer.ffn_gate_up_exps,
+            layer.ffn_up_exps_s,
+            layer.ffn_gate_exps_s,
+            layer.ffn_down_exps_s);
+    cb(cur, "mtp_ffn_moe_out", il);
+
+    if (layer.ffn_up_shexp) {
+        ggml_tensor * ffn_shexp = build_ffn(ffn_inp,
+                layer.ffn_up_shexp,   nullptr, layer.ffn_up_shexp_s,
+                layer.ffn_gate_shexp, nullptr, layer.ffn_gate_shexp_s,
+                layer.ffn_down_shexp, nullptr, layer.ffn_down_shexp_s,
+                nullptr, LLM_FFN_SILU, LLM_FFN_PAR, il);
+        cb(ffn_shexp, "mtp_ffn_shexp", il);
+
+        cur = ggml_add(ctx0, cur, ffn_shexp);
+        cur = ggml_scale(ctx0, cur, 0.5f); // routed and shared expert outputs are averaged
+        cb(cur, "mtp_ffn_out", il);
+    }
+    // Residual: FFN output + projected input + attention output.
+    cur = ggml_add(ctx0, cur, inpL);
+    cur = ggml_add(ctx0, cur, attn_out);
+    cb(cur, "mtp_post_ffn", il);
+    // Output norm and LM head fall back to the trunk's output_norm / output when the block has none of its own.
+    ggml_tensor * head_norm_w = layer.nextn.shared_head_norm
+            ? layer.nextn.shared_head_norm
+            : model.output_norm;
+    GGML_ASSERT(head_norm_w && "COHERE2MOE MTP: missing both nextn.shared_head_norm and output_norm");
+    cur = build_norm(cur, head_norm_w, nullptr, cohere2moe_norm_type, -1);
+
+    cb(cur, "h_nextn", -1);
+    res->t_h_nextn = cur;
+    if (inp_out_ids) { // guard like the main graph does: ggml_get_rows must not receive a null index tensor
+        cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+    }
+    cb(cur, "mtp_shared_head_norm", -1);
+    ggml_tensor * head_w = layer.nextn.shared_head_head ? layer.nextn.shared_head_head : model.output;
+    GGML_ASSERT(head_w && "COHERE2MOE MTP: missing LM head (nextn.shared_head_head or model.output)");
+    cur = build_lora_mm(head_w, cur, layer.nextn.shared_head_head ? layer.nextn.shared_head_head_s : nullptr);
+
+    if (hparams.f_logit_scale) {
+        cur = ggml_scale(ctx0, cur, hparams.f_logit_scale);
+    }
+
+    cb(cur, "result_output", -1);
+    res->t_logits = cur;
+
+    ggml_build_forward_expand(gf, cur);
+}
--- a/src/models/models.h
+++ b/src/models/models.h
@@ -937,6 +937,23 @@ struct llama_model_cohere2 : public llama_model_base {
 };


+struct llama_model_cohere2moe : public llama_model_base { // Cohere2 MoE model: loading hooks plus main and MTP graph builders
+    llama_model_cohere2moe(const struct llama_model_params & params) : llama_model_base(params) {}
+    void load_arch_hparams(llama_model_loader & ml) override; // reads hyperparameters through the loader
+    void load_arch_tensors(llama_model_loader & ml) override; // creates trunk and MTP/NextN tensors
+    // Main forward graph over the trunk layers.
+    struct graph : public llm_graph_context {
+        graph(const llama_model & model, const llm_graph_params & params);
+    };
+    // Graph for the single MTP/NextN block stored after the trunk layers.
+    struct graph_mtp : public llm_graph_context {
+        graph_mtp(const llama_model & model, const llm_graph_params & params);
+    };
+    // Returns a graph_mtp for LLM_GRAPH_TYPE_DECODER_MTP and a graph otherwise.
+    std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override;
+};
+
+
 struct llama_model_dbrx : public llama_model_base {
    llama_model_dbrx(const struct llama_model_params & params) : llama_model_base(params) {}
    void load_arch_hparams(llama_model_loader & ml) override;
--- a/tests/test-chat.cpp
+++ b/tests/test-chat.cpp
@@ -2644,6 +2644,100 @@ static void test_template_output_peg_parsers(bool detailed_debug) {
            .run();
    }

+    {
+        // Cohere2 MoE (North Code) - dedicated parser.
+        // Marker-wrapped format: <|START_THINKING|>...<|END_THINKING|> then either
+        // <|START_TEXT|>...<|END_TEXT|> (content) or <|START_ACTION|>[json]<|END_ACTION|> (tools).
+        // The generation prompt forces a leading <|START_THINKING|>, so model output begins inside
+        // the thinking block: test inputs start with the reasoning body, not the <|START_THINKING|> tag.
+        auto tst = peg_tester("models/templates/Cohere2MoE.jinja", detailed_debug);
+
+        // Content with reasoning, extracted.
+        tst.test("I'm\nthinking<|END_THINKING|><|START_TEXT|>Hello, world!\nWhat's up?<|END_TEXT|>")
+            .reasoning_format(COMMON_REASONING_FORMAT_DEEPSEEK)
+            .expect(message_assist_thoughts)
+            .run();
+
+        // Content with reasoning, reasoning_format=NONE -> thinking kept inline in content (markers preserved).
+        tst.test("I'm\nthinking<|END_THINKING|><|START_TEXT|>Hello, world!\nWhat's up?<|END_TEXT|>")
+            .expect(message_assist_thoughts_unparsed_r7b)
+            .run();
+
+        // Content with empty thinking block.
+        tst.test("<|END_THINKING|><|START_TEXT|>Hello, world!\nWhat's up?<|END_TEXT|>")
+            .reasoning_format(COMMON_REASONING_FORMAT_DEEPSEEK)
+            .expect(message_assist)
+            .run();
+
+        // Single tool call with reasoning.
+        tst.test(
+               "I'm\nthinking<|END_THINKING|>"
+               "<|START_ACTION|>[\n"
+               "    {\"tool_call_id\": \"0\", \"tool_name\": \"special_function\", \"parameters\": {\"arg1\": 1}}\n"
+               "]<|END_ACTION|>")
+            .reasoning_format(COMMON_REASONING_FORMAT_DEEPSEEK)
+            .tools({ special_function_tool })
+            .expect(message_assist_thoughts_call_idx)
+            .run();
+
+        // Single tool call, empty thinking block (no reasoning content).
+        tst.test(
+               "<|END_THINKING|>"
+               "<|START_ACTION|>[\n"
+               "    {\"tool_call_id\": \"0\", \"tool_name\": \"special_function\", \"parameters\": {\"arg1\": 1}}\n"
+               "]<|END_ACTION|>")
+            .reasoning_format(COMMON_REASONING_FORMAT_DEEPSEEK)
+            .tools({ special_function_tool })
+            .expect(message_assist_call_idx)
+            .run();
+
+        // Tool call with an array argument (todo_list).
+        tst.test(
+               "<|END_THINKING|>"
+               "<|START_ACTION|>[\n"
+               "    {\"tool_call_id\": \"0\", \"tool_name\": \"todo_list\", \"parameters\": {\"todos\": [\"buy milk\", \"walk dog\"]}}\n"
+               "]<|END_ACTION|>")
+            .reasoning_format(COMMON_REASONING_FORMAT_DEEPSEEK)
+            .tools({ todo_list })
+            .expect(simple_assist_msg("", "", "todo_list", "{\"todos\": [\"buy milk\", \"walk dog\"]}", "0"))
+            .run();
+
+        // Parallel tool calls with reasoning.
+        tst.test(
+               "I'm\nthinking<|END_THINKING|>"
+               "<|START_ACTION|>[\n"
+               "    {\"tool_call_id\": \"0\", \"tool_name\": \"special_function\", \"parameters\": {\"arg1\": 1}},\n"
+               "    {\"tool_call_id\": \"1\", \"tool_name\": \"python\", \"parameters\": {\"code\": \"print('hey')\"}}\n"
+               "]<|END_ACTION|>")
+            .reasoning_format(COMMON_REASONING_FORMAT_DEEPSEEK)
+            .parallel_tool_calls(true)
+            .tools({ special_function_tool, python_tool })
+            .expect_reasoning("I'm\nthinking")
+            .expect_tool_calls({
+                { "special_function", R"({"arg1": 1})", "0" },
+                { "python", "{\"code\": \"print('hey')\"}", "1" },
+            })
+            .run();
+
+        // Tools available but the model answers with content instead of calling a tool.
+        tst.test("I'm\nthinking<|END_THINKING|><|START_TEXT|>Hello, world!\nWhat's up?<|END_TEXT|>")
+            .reasoning_format(COMMON_REASONING_FORMAT_DEEPSEEK)
+            .tools({ special_function_tool })
+            .expect(message_assist_thoughts)
+            .run();
+
+        // Partial tool call (streaming): name/id resolved before arguments arrive.
+        tst.test(
+               "I'm\nthinking<|END_THINKING|>"
+               "<|START_ACTION|>[\n"
+               "    {\"tool_call_id\": \"0\", \"tool_name\": \"special_function\", ")
+            .reasoning_format(COMMON_REASONING_FORMAT_DEEPSEEK)
+            .tools({ special_function_tool })
+            .is_partial(true)
+            .expect(message_assist_thoughts_partial_call)
+            .run();
+    }
+
    {
        // Google Gemma 2 2B - does not support tool calling
        auto tst = peg_tester("models/templates/google-gemma-2-2b-it.jinja");
--- a/tests/test-jinja.cpp
+++ b/tests/test-jinja.cpp
@@ -435,6 +435,24 @@ static void test_expressions(testing & t) {
        "('c', 'b', 'a')"
    );

+    test_template(t, "string slice negative step",
+        "{{ 'abcdef'[::-2] }}",
+        json::object(),
+        "fdb"
+    );
+
+    test_template(t, "string slice negative start and step",
+        "{{ 'abcdef'[-1:1:-1] }}",
+        json::object(),
+        "fedc"
+    );
+
+    test_template(t, "string slice negative start, stop and step",
+        "{{ 'abcdef'[-1:-5:-1] }}",
+        json::object(),
+        "fedc"
+    );
+
    test_template(t, "arithmetic",
        "{{ (a + b) * c }}",
        {{"a", 2}, {"b", 3}, {"c", 4}},
@@ -583,8 +601,8 @@ static void test_filters(testing & t) {
        "hello jinja"
    );

-    test_template(t, "length list",
-        "{{ items|length }}",
+    test_template(t, "length (count alias) list",
+        "{{ items|count }}",
        {{"items", json::array({1, 2, 3})}},
        "3"
    );
@@ -693,8 +711,8 @@ static void test_filters(testing & t) {
        "fallback"
    );

-    test_template(t, "default with falsy value",
-        "{{ ''|default('fallback', true) }}",
+    test_template(t, "default (d alias) with falsy value",
+        "{{ ''|d('fallback', true) }}",
        json::object(),
        "fallback"
    );
--- a/tests/test-llama-archs.cpp
+++ b/tests/test-llama-archs.cpp
@@ -185,7 +185,7 @@ static gguf_context_ptr get_gguf_ctx(const llm_arch arch, const bool moe) {
        ms.add_kv(LLM_KV_ROPE_FREQ_BASE_SWA,              10000.0f);
        // SWA pattern: every 5th layer is full attention (matches E2B layer_types)
        ms.add_kv(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, uint32_t(5));
-    } else if (arch == LLM_ARCH_MIMO2 || arch == LLM_ARCH_STEP35) {
+    } else if (arch == LLM_ARCH_COHERE2MOE || arch == LLM_ARCH_MIMO2 || arch == LLM_ARCH_STEP35) {
        std::vector<uint32_t> pattern;
        pattern.reserve(n_layer);
        for (uint32_t il = 0; il < n_layer; il++) {
@@ -322,6 +322,7 @@ static std::vector<float> get_logits(
 static bool moe_mandatory(const llm_arch arch) {
    switch (arch) {
        case LLM_ARCH_LLAMA4:
+        case LLM_ARCH_COHERE2MOE:
        case LLM_ARCH_GROK:
        case LLM_ARCH_QWEN2MOE:
        case LLM_ARCH_QWEN3MOE:
--- a/tools/cli/cli.cpp
+++ b/tools/cli/cli.cpp
@@ -97,11 +97,18 @@ struct cli_context {
                task.params.chat_parser_params.parser.load(chat_params.parser);
            }

+            // Copy the preserved tokens into the sampling params
+            const llama_vocab * vocab = llama_model_get_vocab(
+                llama_get_model(ctx_server.get_llama_context()));
+            for (const auto & token : chat_params.preserved_tokens) {
+                auto ids = common_tokenize(vocab, token, false, true);
+                if (ids.size() == 1) {
+                    task.params.sampling.preserved_tokens.insert(ids[0]);
+                }
+            }
+
            // reasoning budget sampler
            if (!chat_params.thinking_end_tag.empty()) {
-                const llama_vocab * vocab = llama_model_get_vocab(
-                    llama_get_model(ctx_server.get_llama_context()));
-
                task.params.sampling.reasoning_budget_tokens = defaults.sampling.reasoning_budget_tokens;
                task.params.sampling.generation_prompt = chat_params.generation_prompt;

--- a/tools/ui/embed.cpp
+++ b/tools/ui/embed.cpp
@@ -1,7 +1,7 @@
 // llama-ui-embed: generate ui.cpp / ui.h that embed UI assets as C arrays.
 //
 // Usage:
-//   llama-ui-embed <out_cpp> <out_h> <asset_dir>
+//   llama-ui-embed <out_cpp> <out_h> [<asset_dir>]
 //
 // Recursively embeds every regular file under <asset_dir>.
 // Asset names are relative paths from <asset_dir> (e.g. "_app/immutable/bundle.HASH.js").
@@ -147,9 +147,9 @@ int main(int argc, char ** argv) {

    const std::string out_cpp   = argv[1];
    const std::string out_h     = argv[2];
-    const std::string asset_dir = argv[3];
+    const std::string asset_dir = (argc >= 4) ? argv[3] : std::string();

-    const bool        use_gzip = std::filesystem::exists(asset_dir + "/_gzip");
+    const bool        use_gzip = !asset_dir.empty() && std::filesystem::exists(asset_dir + "/_gzip");
    const std::string in_dir   = use_gzip ? (asset_dir + "/_gzip") : asset_dir;

    std::vector<asset_entry> assets;
--- a/tools/ui/src/app.html
+++ b/tools/ui/src/app.html
@@ -9,7 +9,10 @@

 		<link rel="manifest" href="./manifest.webmanifest" />

-		<meta name="viewport" content="width=device-width, initial-scale=1" />
+		<meta
+			name="viewport"
+			content="width=device-width, initial-scale=1, interactive-widget=resizes-content"
+		/>
 		%sveltekit.head%
 	</head>

--- a/tools/ui/src/lib/components/app/chat/ChatMessages/ChatMessageAgenticContent.svelte
+++ b/tools/ui/src/lib/components/app/chat/ChatMessages/ChatMessageAgenticContent.svelte
@@ -56,6 +56,7 @@

 	const showToolCallInProgress = $derived(config().showToolCallInProgress as boolean);
 	const showThoughtInProgress = $derived(config().showThoughtInProgress as boolean);
+	const renderThinkingAsMarkdown = $derived(config().renderThinkingAsMarkdown as boolean);

 	const hasReasoningError = $derived(
 		isLastAssistantMessage ? !!agenticLastError(message.convId) : false
@@ -316,9 +317,13 @@
 			onToggle={() => toggleExpanded(index, section)}
 		>
 			<div class="pt-3">
-				<div class="text-xs leading-relaxed break-words whitespace-pre-wrap">
-					{section.content}
-				</div>
+				{#if renderThinkingAsMarkdown}
+					<MarkdownContent content={section.content} attachments={message?.extra} />
+				{:else}
+					<div class="text-xs leading-relaxed break-words whitespace-pre-wrap">
+						{section.content}
+					</div>
+				{/if}
 			</div>
 		</CollapsibleContentBlock>
 	{:else if section.type === AgenticSectionType.REASONING_PENDING}
@@ -336,9 +341,13 @@
 			onToggle={() => toggleExpanded(index, section)}
 		>
 			<div class="pt-3">
-				<div class="text-xs leading-relaxed break-words whitespace-pre-wrap">
-					{section.content}
-				</div>
+				{#if renderThinkingAsMarkdown}
+					<MarkdownContent content={section.content} attachments={message?.extra} />
+				{:else}
+					<div class="text-xs leading-relaxed break-words whitespace-pre-wrap">
+						{section.content}
+					</div>
+				{/if}
 			</div>
 		</CollapsibleContentBlock>
 	{/if}
--- a/tools/ui/src/lib/components/ui/sidebar/sidebar-provider.svelte
+++ b/tools/ui/src/lib/components/ui/sidebar/sidebar-provider.svelte
@@ -41,7 +41,7 @@
 	data-slot="sidebar-wrapper"
 	style="--sidebar-width: {sidebar.sidebarWidth}; --sidebar-min-width: {SIDEBAR_MIN_WIDTH}; --sidebar-max-width: {SIDEBAR_MAX_WIDTH}; --sidebar-width-icon: {SIDEBAR_WIDTH_ICON}; {style}"
 	class={cn(
-		'group/sidebar-wrapper flex min-h-svh w-full has-data-[variant=inset]:bg-sidebar',
+		'group/sidebar-wrapper flex flex-col h-dvh w-full has-data-[variant=inset]:bg-sidebar',
 		className
 	)}
 	bind:this={ref}
--- a/tools/ui/src/lib/constants/image-size.ts
+++ b/tools/ui/src/lib/constants/image-size.ts
@@ -1 +1,3 @@
 export const MEGAPIXELS_TO_PIXELS = 1_000_000;
+
+export const HEIC_JPEG_QUALITY = 0.85;
--- a/tools/ui/src/lib/constants/settings-keys.ts
+++ b/tools/ui/src/lib/constants/settings-keys.ts
@@ -33,6 +33,7 @@ export const SETTINGS_KEYS = {
 	SHOW_MODEL_TAGS: 'showModelTags',
 	SHOW_BUILD_VERSION: 'showBuildVersion',
 	SHOW_SYSTEM_MESSAGE: 'showSystemMessage',
+	RENDER_THINKING_AS_MARKDOWN: 'renderThinkingAsMarkdown',
 	// Sampling
 	TEMPERATURE: 'temperature',
 	DYNATEMP_RANGE: 'dynatemp_range',
--- a/tools/ui/src/lib/constants/settings-registry.ts
+++ b/tools/ui/src/lib/constants/settings-registry.ts
@@ -282,6 +282,18 @@ const SETTINGS_REGISTRY: Record<string, SettingsSectionEntry> = {
 					paramType: SyncableParameterType.BOOLEAN
 				}
 			},
+			{
+				key: SETTINGS_KEYS.RENDER_THINKING_AS_MARKDOWN,
+				label: 'Render thinking as Markdown',
+				help: 'Render the reasoning/thinking block content as formatted Markdown instead of plain text.',
+				defaultValue: true,
+				type: SettingsFieldType.CHECKBOX,
+				section: SETTINGS_SECTION_SLUGS.DISPLAY,
+				sync: {
+					serverKey: SETTINGS_KEYS.RENDER_THINKING_AS_MARKDOWN,
+					paramType: SyncableParameterType.BOOLEAN
+				}
+			},
 			{
 				key: SETTINGS_KEYS.FULL_HEIGHT_CODE_BLOCKS,
 				label: 'Use full height code blocks',
--- a/tools/ui/src/lib/constants/supported-file-types.ts
+++ b/tools/ui/src/lib/constants/supported-file-types.ts
@@ -63,6 +63,10 @@ export const IMAGE_FILE_TYPES = {
 	[FileTypeImage.SVG]: {
 		extensions: [FileExtensionImage.SVG],
 		mimeTypes: [MimeTypeImage.SVG]
+	},
+	[FileTypeImage.HEIC]: {
+		extensions: [FileExtensionImage.HEIC, FileExtensionImage.HEIF],
+		mimeTypes: [MimeTypeImage.HEIC, MimeTypeImage.HEIF]
 	}
 } as const;

--- a/tools/ui/src/lib/enums/files.enums.ts
+++ b/tools/ui/src/lib/enums/files.enums.ts
@@ -25,7 +25,9 @@ export enum FileTypeImage {
 	PNG = 'png',
 	GIF = 'gif',
 	WEBP = 'webp',
-	SVG = 'svg'
+	SVG = 'svg',
+	HEIC = 'heic',
+	HEIF = 'heif'
 }

 export enum FileTypeAudio {
@@ -90,7 +92,9 @@ export enum FileExtensionImage {
 	PNG = '.png',
 	GIF = '.gif',
 	WEBP = '.webp',
-	SVG = '.svg'
+	SVG = '.svg',
+	HEIC = '.heic',
+	HEIF = '.heif'
 }

 export enum FileExtensionAudio {
@@ -205,7 +209,9 @@ export enum MimeTypeImage {
 	WEBP = 'image/webp',
 	SVG = 'image/svg+xml',
 	ICO = 'image/x-icon',
-	ICO_MICROSOFT = 'image/vnd.microsoft.icon'
+	ICO_MICROSOFT = 'image/vnd.microsoft.icon',
+	HEIC = 'image/heic',
+	HEIF = 'image/heif'
 }

 export enum MimeTypeText {
--- a/tools/ui/src/lib/hooks/use-pwa.svelte.ts
+++ b/tools/ui/src/lib/hooks/use-pwa.svelte.ts
@@ -53,6 +53,8 @@ export function usePwa() {
 	// This comparison detects server upgrades for non-PWA users.
 	$effect(() => {
 		if (!browser) return;
+		// PWA pages update via the service worker path; the storage check is the non-PWA fallback only
+		if (navigator.serviceWorker?.controller) return;

 		const currentVersion = versionStore.value;
 		if (!currentVersion) return;
--- a/tools/ui/src/lib/utils/file-type.ts
+++ b/tools/ui/src/lib/utils/file-type.ts
@@ -30,6 +30,8 @@ export function getFileTypeCategory(mimeType: string): FileTypeCategory | null {
 		case MimeTypeImage.GIF:
 		case MimeTypeImage.WEBP:
 		case MimeTypeImage.SVG:
+		case MimeTypeImage.HEIC:
+		case MimeTypeImage.HEIF:
 			return FileTypeCategory.IMAGE;

 		// Audio
@@ -118,6 +120,8 @@ export function getFileTypeCategoryByExtension(filename: string): FileTypeCatego
 		case FileExtensionImage.GIF:
 		case FileExtensionImage.WEBP:
 		case FileExtensionImage.SVG:
+		case FileExtensionImage.HEIC:
+		case FileExtensionImage.HEIF:
 			return FileTypeCategory.IMAGE;

 		// Audio
--- a/tools/ui/src/lib/utils/heic-to-jpeg.ts
+++ b/tools/ui/src/lib/utils/heic-to-jpeg.ts
@@ -0,0 +1,56 @@
+import { MimeTypeImage } from '$lib/enums';
+import { HEIC_JPEG_QUALITY } from '$lib/constants/image-size';
+
+// HEIC decoding requires a relatively large decoder. To keep the primary bundle small,
+// we lazily load the decoder from a CDN when needed and cache it for future conversions.
+const HEIC_TO_CDN_URL = 'https://cdn.jsdelivr.net/npm/heic-to@1.5.2/dist/heic-to.js';
+
+// Minimal typing for the part of the heic-to module that this file uses
+interface HeicToModule {
+	// Takes a source blob, a target MIME type and an optional quality; resolves to the converted Blob
+	heicTo(args: { blob: Blob; type: string; quality?: number }): Promise<Blob>;
+}
+
+// Cached promise for the decoder module import; null while nothing is cached
+let modulePromise: Promise<HeicToModule> | null = null;
+
+/**
+ * Lazily load the heic-to decoder from the CDN and cache it.
+ * Only a pending or successful load stays cached: if the import rejects, the
+ * cache is cleared so the next call starts a fresh import instead of returning
+ * the same rejected promise for the rest of the session.
+ * @returns Promise resolving to the heic-to module
+ */
+function getHeicTo(): Promise<HeicToModule> {
+	if (!modulePromise) {
+		modulePromise = (
+			import(/* @vite-ignore */ HEIC_TO_CDN_URL) as Promise<HeicToModule>
+		).catch((err: unknown) => {
+			// Forget the failed attempt so a transient CDN/network error is not permanent
+			modulePromise = null;
+			throw err;
+		});
+	}
+
+	return modulePromise;
+}
+
+/**
+ * Decode a HEIC/HEIF image and re-encode it as a JPEG data URL.
+ * The decoder comes from getHeicTo(); the JPEG is encoded at HEIC_JPEG_QUALITY.
+ * @param file - The HEIC/HEIF source, as a File or Blob
+ * @returns Promise resolving to the JPEG as a data URL string; it rejects with
+ *          the FileReader error if reading the converted blob fails
+ */
+export async function heicFileToJpegDataURL(file: File | Blob): Promise<string> {
+	const { heicTo: convert } = await getHeicTo();
+	const jpeg = await convert({ blob: file, type: MimeTypeImage.JPEG, quality: HEIC_JPEG_QUALITY });
+
+	const reader = new FileReader();
+	const dataUrl = new Promise<string>((resolve, reject) => {
+		reader.onload = () => resolve(reader.result as string);
+		reader.onerror = () => reject(reader.error);
+	});
+	reader.readAsDataURL(jpeg);
+
+	return dataUrl;
+}
+
+/**
+ * Tell whether a MIME type denotes a HEIC or HEIF image.
+ * Surrounding whitespace and letter case are ignored in the comparison.
+ * @param mimeType - The MIME type to check
+ * @returns True if the MIME type is image/heic or image/heif
+ */
+export function isHeicMimeType(mimeType: string): boolean {
+	const heicMimeTypes: string[] = [MimeTypeImage.HEIC, MimeTypeImage.HEIF];
+
+	return heicMimeTypes.includes(mimeType.trim().toLowerCase());
+}
--- a/tools/ui/src/lib/utils/process-uploaded-files.ts
+++ b/tools/ui/src/lib/utils/process-uploaded-files.ts
@@ -1,5 +1,6 @@
 import { isSvgMimeType, svgBase64UrlToPngDataURL } from './svg-to-png';
 import { isWebpMimeType, webpBase64UrlToPngDataURL } from './webp-to-png';
+import { heicFileToJpegDataURL, isHeicMimeType } from './heic-to-jpeg';
 import { FileTypeCategory } from '$lib/enums';
 import { SETTINGS_KEYS } from '$lib/constants';
 import { modelsStore } from '$lib/stores/models.svelte';
@@ -68,7 +69,7 @@ export async function processFilesToChatUploaded(
 			if (getFileTypeCategory(file.type) === FileTypeCategory.IMAGE) {
 				let preview = await readFileAsDataURL(file);

-				// Normalize SVG and WebP to PNG in previews
+				// Normalize SVG and WebP to PNG, and HEIC to compressed JPEG, in previews
 				if (isSvgMimeType(file.type)) {
 					try {
 						preview = await svgBase64UrlToPngDataURL(preview);
@@ -81,6 +82,13 @@ export async function processFilesToChatUploaded(
 					} catch (err) {
 						console.error('Failed to convert WebP to PNG:', err);
 					}
+				} else if (isHeicMimeType(file.type)) {
+					try {
+						preview = await heicFileToJpegDataURL(file);
+					} catch (err) {
+						console.error('Failed to convert HEIC to JPEG:', err);
+						// Skip this file rather than pushing the unconverted HEIC preview
+						continue;
+					}
 				}

 				results.push({ ...base, preview });
--- a/tools/ui/src/routes/+layout.svelte
+++ b/tools/ui/src/routes/+layout.svelte
@@ -312,7 +312,7 @@
 	/>

 	<Sidebar.Provider bind:open={sidebarOpen}>
-		<div class="flex h-screen w-full">
+		<div class="flex h-full w-full grow">
 			<Sidebar.Root variant="floating" class="h-full"
 				><SidebarNavigation bind:this={chatSidebar} /></Sidebar.Root
 			>
Author	SHA1	Message	Date
Masashi Yoshimura	6e9007ae61	ggml-webgpu: improve i-quants mul_mat performance and speed up prefill (#24530 ) * Improve prefill speeds for i-quants * Fix #if defined() usage in preprocessor guards.	2026-06-14 18:15:30 -07:00
Sigbjørn Skjæret	dd4623a74f	convert : fix lora base model arch retrieval (#24621 )	2026-06-15 00:55:26 +02:00
franitel	ef8268feee	fix(ui): render thinking/reasoning block content as markdown (#24611 ) * fix(ui): render thinking/reasoning block content as markdown * feat(ui): add toggle setting for thinking block markdown rendering	2026-06-14 22:56:56 +02:00
Nicolas Mowen	5f04dc7ac3	ui: Add HEIC/HEIF image support (#24137 ) * Add boilerplate for file types * Add heic-to and implement conversion * Load heic library from CDN * Use jpg instead of png for conversion * Move const to constants file	2026-06-14 20:42:16 +02:00
Piotr Wilkin (ilintar)	aedb2a5e9c	chat: add dedicated Cohere2MoE (North Code) parser (#24615 ) * chat: add dedicated Cohere2MoE (North Code) parser * Some renames to make @CISC happy :>	2026-06-14 20:17:40 +02:00
Mohammad Athar	8edaca9034	docs : fix typos in CUDA-FEDORA.md and grammars/README.md (#24459 )	2026-06-15 01:33:38 +08:00
Alexander Batischev	20c5266f8a	docker: specify registry to simplify Podman builds (#24607 )	2026-06-15 01:27:20 +08:00
Pascal	fd5869fb62	UI/mobile keyboard and pwa popup fixes (#24610 ) * ui: make mobile layout keyboard-aware via interactive-widget and dvh shell anchor * ui: fix duplicate PWA refresh popup by scoping the storage check to non-PWA pages	2026-06-14 18:35:00 +02:00
Amos Wong	1fd6dfe9f3	ui : fix ui clipping in mobile due to incorrect height setup (#24605 )	2026-06-14 16:15:51 +02:00
Sigbjørn Skjæret	acd79d603c	jinja : add count/d/e filter aliases (#24606 )	2026-06-14 15:07:31 +02:00
Michael Wand	6e14286eda	cli : fix not copying preserved tokens (#24258 )	2026-06-14 11:52:15 +02:00
Bartowski	8ed274ef46	Add cohere2moe to llama-vocab for TINY_AYA (#24601 )	2026-06-14 09:04:46 +02:00
Sigbjørn Skjæret	46722116b9	ci : use CUDA label for cuda backend (#24594 )	2026-06-14 08:27:52 +02:00
Sigbjørn Skjæret	c2ba3e47a2	add sycl to check-release (#24583 )	2026-06-14 09:42:26 +08:00
Aldehir Rojas	53bd47ea5b	ui : fix llama-ui-embed crash when no asset dir is given (#24597 )	2026-06-13 17:53:30 -05:00
Michael Wand	4988f6e866	Add arch support for cohere2-MoE (#24260 ) * Add arch support for cohere2-MoE * Removed redundant gating_func checks * Changed ffn lookup to prefer prefix_dense_intermediate_size * Renamed arch to cohere2moe * Removed redundant lmhead check and chat template changes * Removed lm_head.weight check from modify tensors, load output tensor not required, fallback to token_embd.weight * Changed to (routed+shared)0.5 for shared expert combined avg fixed sliding_window_pattern issue and pattern * Fixed transformers crash 'first_k_dense_replace' error * Remove comment * Removed cohere2-moe as a tokenizer type and kept as tiny_aya. Renamed North-Mini-Code-1.0. * Fixed MTP fail, changed to use iSWA * Fixed remaining todos: cohere2moe renamed, changed swa parsing to use get_key_or_arr, removed extra get_arr use * Force metadata usage Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com> * Remove Cohere2 checkpoint comment Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com> * Remove MTP comment Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com> * Regenerate cohere2moe tokenizer hash * Add cohere2moe to Llama Model Saver supported list * Check for zerobios tensors and add support for Command to use LayerNorm * Map expert_selection_fn to sigmoid in base.py instead of command.py * use bools for foundnorm/foundnormrms Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com> --------- Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>	2026-06-13 19:49:00 +02:00
Sigbjørn Skjæret	f05cf4676a	jinja : fix negative step slice with start/stop values (#24580 )	2026-06-13 18:28:40 +02:00