spec : fix vocab compatibility check (#24256)

arg: Skip mmproj download when user supplied mmproj (#24239)
convert : fix Gemma4 with no audio encoder (#24242)
2026-06-08 03:53:00 +02:00 · 2026-06-07 14:43:52 +03:00 · 2026-06-07 11:18:44 +02:00 · 2026-06-07 08:43:05 +02:00 · 2026-06-07 08:31:58 +02:00 · 2026-06-06 22:39:21 +02:00
18 changed files with 473 additions and 270 deletions
--- a/.github/workflows/docker.yml
+++ b/.github/workflows/docker.yml
@@ -82,8 +82,8 @@ jobs:
            { "tag": "cpu", "dockerfile": ".devops/s390x.Dockerfile", "platforms": "linux/s390x", "full": true, "light": true, "server": true, "free_disk_space": false, "runs_on": "ubuntu-24.04-s390x" },
            { "tag": "cuda cuda12", "dockerfile": ".devops/cuda.Dockerfile", "cuda_version": "12.8.1", "platforms": "linux/amd64", "full": true, "light": true, "server": true, "free_disk_space": true, "runs_on": "ubuntu-24.04" },
            { "tag": "cuda cuda12", "dockerfile": ".devops/cuda.Dockerfile", "cuda_version": "12.8.1", "platforms": "linux/arm64", "full": true, "light": true, "server": true, "free_disk_space": true, "runs_on": "ubuntu-24.04-arm" },
-            { "tag": "cuda13", "dockerfile": ".devops/cuda.Dockerfile", "cuda_version": "13.1.1", "platforms": "linux/amd64", "full": true, "light": true, "server": true, "free_disk_space": true, "runs_on": "ubuntu-24.04" },
-            { "tag": "cuda13", "dockerfile": ".devops/cuda.Dockerfile", "cuda_version": "13.1.1", "platforms": "linux/arm64", "full": true, "light": true, "server": true, "free_disk_space": true, "runs_on": "ubuntu-24.04-arm" },
+            { "tag": "cuda13", "dockerfile": ".devops/cuda.Dockerfile", "cuda_version": "13.3.0", "platforms": "linux/amd64", "full": true, "light": true, "server": true, "free_disk_space": true, "runs_on": "ubuntu-24.04" },
+            { "tag": "cuda13", "dockerfile": ".devops/cuda.Dockerfile", "cuda_version": "13.3.0", "platforms": "linux/arm64", "full": true, "light": true, "server": true, "free_disk_space": true, "runs_on": "ubuntu-24.04-arm" },
            { "tag": "musa", "dockerfile": ".devops/musa.Dockerfile", "platforms": "linux/amd64", "full": true, "light": true, "server": true, "free_disk_space": true, "runs_on": "ubuntu-24.04" },
            { "tag": "intel", "dockerfile": ".devops/intel.Dockerfile", "platforms": "linux/amd64", "full": true, "light": true, "server": true, "free_disk_space": true, "runs_on": "ubuntu-24.04" },
            { "tag": "vulkan", "dockerfile": ".devops/vulkan.Dockerfile", "platforms": "linux/amd64", "full": true, "light": true, "server": true, "free_disk_space": false, "runs_on": "ubuntu-24.04" },
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -444,7 +444,7 @@ bool common_params_handle_models(common_params & params, llama_example curr_ex)
    opts.offline         = params.offline;
    opts.skip_download   = params.skip_download;
    opts.download_mtp    = spec_type_draft_mtp;
-    opts.download_mmproj = !params.no_mmproj;
+    opts.download_mmproj = !params.no_mmproj && params.mmproj.path.empty() && params.mmproj.url.empty();

    // sub-models (draft, mmproj, vocoder) are explicitly specified by the user,
    // so we should not auto-discover mtp/mmproj siblings for them
--- a/common/chat.cpp
+++ b/common/chat.cpp
@@ -1625,8 +1625,17 @@ static common_chat_params common_chat_params_init_lfm2(const common_chat_templat
    const std::string THINK_END       = "</think>";
    const std::string GEN_PROMPT      = "<|im_start|>assistant\n";

-    data.prompt            = common_chat_template_direct_apply_impl(tmpl, inputs);
-    data.generation_prompt = common_chat_template_generation_prompt_impl(tmpl, inputs);
+    // Copy reasoning to the "thinking" field the template expects
+    auto adjusted_messages = json::array();
+    for (auto msg : inputs.messages) {
+        if (msg.contains("reasoning_content") && msg.at("reasoning_content").is_string()) {
+            msg["thinking"] = msg.at("reasoning_content");
+        }
+        adjusted_messages.push_back(msg);
+    }
+
+    data.prompt            = common_chat_template_direct_apply_impl(tmpl, inputs, adjusted_messages);
+    data.generation_prompt = common_chat_template_generation_prompt_impl(tmpl, inputs, adjusted_messages);
    data.format            = COMMON_CHAT_FORMAT_PEG_NATIVE;
    data.supports_thinking = true;
    data.preserved_tokens  = { TOOL_CALL_START, TOOL_CALL_END, THINK_START, THINK_END };
@@ -1639,7 +1648,9 @@ static common_chat_params common_chat_params_init_lfm2(const common_chat_templat
    data.thinking_end_tag   = THINK_END;

    auto has_tools         = inputs.tools.is_array() && !inputs.tools.empty();
-    auto extract_reasoning = inputs.reasoning_format != COMMON_REASONING_FORMAT_NONE;
+    // Gate by reasoning format and whether the template supports <think>
+    auto extract_reasoning = inputs.reasoning_format != COMMON_REASONING_FORMAT_NONE &&
+                             tmpl.source().find(THINK_START) != std::string::npos;
    auto include_grammar   = has_tools && inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_NONE;

    if (inputs.has_continuation()) {
@@ -1658,7 +1669,7 @@ static common_chat_params common_chat_params_init_lfm2(const common_chat_templat
        auto end = p.end();

        auto reasoning = p.eps();
-        if (extract_reasoning && inputs.enable_thinking) {
+        if (extract_reasoning) {
            reasoning = p.optional(THINK_START + p.reasoning(p.until(THINK_END)) + THINK_END);
        }

--- a/common/speculative.cpp
+++ b/common/speculative.cpp
@@ -58,10 +58,10 @@ static bool common_speculative_are_compatible(
    const llama_vocab * vocab_tgt = llama_model_get_vocab(model_tgt);
    const llama_vocab * vocab_dft = llama_model_get_vocab(model_dft);

-    const bool vocab_type_tgt = llama_vocab_type(vocab_tgt);
+    const auto vocab_type_tgt = llama_vocab_type(vocab_tgt);
    LOG_DBG("%s: vocab_type tgt: %d\n", __func__, vocab_type_tgt);

-    const bool vocab_type_dft = llama_vocab_type(vocab_dft);
+    const auto vocab_type_dft = llama_vocab_type(vocab_dft);
    LOG_DBG("%s: vocab_type dft: %d\n", __func__, vocab_type_dft);

    if (vocab_type_tgt != vocab_type_dft) {
--- a/conversion/gemma.py
+++ b/conversion/gemma.py
@@ -812,10 +812,11 @@ class Gemma4VisionAudioModel(MmprojModel):
        self.gguf_writer.add_vision_attention_layernorm_eps(self.hparams_vision.get("layer_norm_eps", 1e-6))

        # audio params
-        assert self.hparams_audio is not None
-        self.gguf_writer.add_clip_audio_projector_type(gguf.VisionProjectorType.GEMMA4A)
-        self.gguf_writer.add_audio_num_mel_bins(self.hparams_audio["feat_in"])
-        self.gguf_writer.add_audio_attention_layernorm_eps(self.hparams_audio.get("layer_norm_eps", 1e-6))
+        if self.has_audio_encoder:
+            assert self.hparams_audio is not None
+            self.gguf_writer.add_clip_audio_projector_type(gguf.VisionProjectorType.GEMMA4A)
+            self.gguf_writer.add_audio_num_mel_bins(self.hparams_audio["feat_in"])
+            self.gguf_writer.add_audio_attention_layernorm_eps(self.hparams_audio.get("layer_norm_eps", 1e-6))

    def is_audio_tensor(self, name: str) -> bool:
        return "audio_tower" in name or "embed_audio" in name
--- a/models/templates/LFM2.5-8B-A1B.jinja
+++ b/models/templates/LFM2.5-8B-A1B.jinja
@@ -0,0 +1,115 @@
+{{- bos_token -}}
+{%- set preserve_thinking = preserve_thinking | default(false) -%}
+
+{%- macro format_arg_value(arg_value) -%}
+    {%- if arg_value is string -%}
+        {{- "'" + arg_value + "'" -}}
+    {%- elif arg_value is mapping -%}
+        {{- arg_value | tojson -}}
+    {%- else -%}
+        {{- arg_value | string -}}
+    {%- endif -%}
+{%- endmacro -%}
+
+{%- macro parse_content(content) -%}
+    {%- if content is string -%}
+        {{- content -}}
+    {%- else -%}
+        {%- set _ns = namespace(result="") -%}
+        {%- for item in content -%}
+            {%- if item["type"] == "image" -%}
+                {%- set _ns.result = _ns.result + "<image>" -%}
+            {%- elif item["type"] == "text" -%}
+                {%- set _ns.result = _ns.result + item["text"] -%}
+            {%- else -%}
+                {%- set _ns.result = _ns.result + item | tojson -%}
+            {%- endif -%}
+        {%- endfor -%}
+        {{- _ns.result -}}
+    {%- endif -%}
+{%- endmacro -%}
+
+{%- macro render_tool_calls(tool_calls) -%}
+    {%- set tool_calls_ns = namespace(tool_calls=[]) -%}
+    {%- for tool_call in tool_calls -%}
+        {%- set func_name = tool_call["function"]["name"] -%}
+        {%- set func_args = tool_call["function"]["arguments"] -%}
+        {%- set args_ns = namespace(arg_strings=[]) -%}
+        {%- for arg_name, arg_value in func_args.items() -%}
+            {%- set args_ns.arg_strings = args_ns.arg_strings + [arg_name + "=" + format_arg_value(arg_value)] -%}
+        {%- endfor -%}
+        {%- set tool_calls_ns.tool_calls = tool_calls_ns.tool_calls + [func_name + "(" + (args_ns.arg_strings | join(", ")) + ")"] -%}
+    {%- endfor -%}
+    {{- "<|tool_call_start|>[" + (tool_calls_ns.tool_calls | join(", ")) + "]<|tool_call_end|>" -}}
+{%- endmacro -%}
+
+{%- set ns = namespace(system_prompt="", last_user_index=-1) -%}
+{%- if messages[0]["role"] == "system" -%}
+    {%- if messages[0].get("content") -%}
+        {%- set ns.system_prompt = parse_content(messages[0]["content"]) -%}
+    {%- endif -%}
+    {%- set messages = messages[1:] -%}
+{%- endif -%}
+{%- if tools -%}
+    {%- set ns.system_prompt = ns.system_prompt + ("\n" if ns.system_prompt else "") + "List of tools: [" -%}
+    {%- for tool in tools -%}
+        {%- if tool is not string -%}
+            {%- set tool = tool | tojson -%}
+        {%- endif -%}
+        {%- set ns.system_prompt = ns.system_prompt + tool -%}
+        {%- if not loop.last -%}
+            {%- set ns.system_prompt = ns.system_prompt + ", " -%}
+        {%- endif -%}
+    {%- endfor -%}
+    {%- set ns.system_prompt = ns.system_prompt + "]" -%}
+{%- endif -%}
+{%- if ns.system_prompt -%}
+    {{- "<|im_start|>system\n" + ns.system_prompt + "<|im_end|>\n" -}}
+{%- endif -%}
+{%- for message in messages -%}
+    {%- if message["role"] == "user" -%}
+        {%- set ns.last_user_index = loop.index0 -%}
+    {%- endif -%}
+{%- endfor -%}
+{%- for message in messages -%}
+    {{- "<|im_start|>" + message.role + "\n" -}}
+    {%- if message.role == "assistant" -%}
+        {%- generation -%}
+        {%- if message.thinking is defined and (preserve_thinking or loop.index0 > ns.last_user_index) -%}
+            {{- "<think>" + message.thinking + "</think>" -}}
+        {%- endif -%}
+        {%- set _cfm_tag = "CONTINUE_FINAL_MESSAGE_TAG " -%}
+        {%- set _has_cfm = false -%}
+        {%- if message.content is defined -%}
+            {%- set content = parse_content(message.content) -%}
+            {%- if not (preserve_thinking or loop.index0 > ns.last_user_index) -%}
+                {%- if "</think>" in content -%}
+                    {%- set content = content.split("</think>")[-1] | trim -%}
+                {%- endif -%}
+            {%- endif -%}
+            {%- if message.tool_calls is defined and content.endswith(_cfm_tag) -%}
+                {%- set _has_cfm = true -%}
+                {%- set _trunc_len = (content | length) - (_cfm_tag | length) -%}
+                {{- content[:_trunc_len] -}}
+            {%- else -%}
+                {{- content -}}
+            {%- endif -%}
+        {%- endif -%}
+        {%- if message.tool_calls is defined -%}
+            {{- render_tool_calls(message.tool_calls) -}}
+        {%- endif -%}
+        {%- if _has_cfm -%}
+            {{- _cfm_tag -}}
+        {%- endif -%}
+        {{- "<|im_end|>\n" -}}
+        {%- endgeneration -%}
+    {%- else %}
+        {%- if message.get("content") -%}
+            {{- parse_content(message["content"]) -}}
+        {%- endif -%}
+        {{- "<|im_end|>\n" -}}
+    {%- endif %}
+{%- endfor -%}
+{%- if add_generation_prompt -%}
+    {{- "<|im_start|>assistant\n" -}}
+{%- endif -%}
--- a/tests/test-chat.cpp
+++ b/tests/test-chat.cpp
@@ -1825,6 +1825,104 @@ static void test_convert_responses_to_chatcmpl() {
    }
 }

+// Shared PEG parser cases run for each LFM2 template - all variants use one output format and parser
+static void test_lfm2_parser(const std::string & template_path, bool detailed_debug) {
+    auto tst = peg_tester(template_path, detailed_debug);
+
+    // Plain content only: no tools offered, no reasoning in the output
+    tst.test("Hello, world!\nWhat's up?").expect(message_assist).run();
+
+    // Single tool call without reasoning
+    tst.test("<|tool_call_start|>[special_function(arg1=1)]<|tool_call_end|>")
+        .tools({ special_function_tool })
+        .expect(message_assist_call)
+        .run();
+
+    // Tool call with a double-quoted string argument
+    tst.test("<|tool_call_start|>[get_time(city=\"XYZCITY\")]<|tool_call_end|>")
+        .tools({ get_time_tool })
+        .expect(message_with_tool_calls("get_time", "{\"city\":\"XYZCITY\"}"))
+        .run();
+
+    // Python literals (True / None) become JSON (true / null)
+    tst.test("<|tool_call_start|>[toggle(enabled=True)]<|tool_call_end|>")
+        .tools({ toggle_tool })
+        .expect(message_with_tool_calls("toggle", R"({"enabled": true})"))
+        .run();
+
+    tst.test("<|tool_call_start|>[set_nullable(value=None)]<|tool_call_end|>")
+        .tools({ nullable_tool })
+        .expect(message_with_tool_calls("set_nullable", R"({"value": null})"))
+        .run();
+
+    // Python literal nested inside an object argument
+    tst.test("<|tool_call_start|>[set_config(config={\"enabled\": True, \"count\": 3})]<|tool_call_end|>")
+        .tools({ config_tool })
+        .expect(message_with_tool_calls("set_config", R"({"config": {"enabled": true, "count": 3}})"))
+        .run();
+
+    // JSON literals (true / null) are accepted too
+    tst.test("<|tool_call_start|>[set_config(config={\"enabled\": true, \"note\": null})]<|tool_call_end|>")
+        .tools({ config_tool })
+        .expect(message_with_tool_calls("set_config", R"({"config": {"enabled": true, "note": null}})"))
+        .run();
+
+    // Dotted function name with string, list and object arguments
+    tst.test("<|tool_call_start|>[Calendar.create_event(title=\"demo\", participants=[\"Alice\", \"Bob\"], "
+             "metadata={\"priority\": \"high\", \"reminder\": true})]<|tool_call_end|>")
+        .tools({ calendar_create_event_tool })
+        .expect(message_with_tool_calls(
+            "Calendar.create_event",
+            R"({"title": "demo", "participants": ["Alice", "Bob"], "metadata": {"priority": "high", "reminder": true}})"))
+        .run();
+
+    // Markdown links stay plain content even when a tool is available
+    tst.test("Use this format: [link text](url). Example: [Wikipedia](https://www.wikipedia.org).")
+        .tools({ get_time_tool })
+        .expect(simple_assist_msg("Use this format: [link text](url). Example: [Wikipedia](https://www.wikipedia.org)."))
+        .run();
+
+    // Python tool whose string argument holds multiline code (escaped newline)
+    tst.test("<|tool_call_start|>[python(code=\"def hello():\\n    print('hey')\")]<|tool_call_end|>")
+        .tools({ python_tool })
+        .expect_tool_calls({
+            { "python", R"#({"code": "def hello():\\n    print('hey')"})#", "" }
+        })
+        .run();
+
+    // Content before a tool call is kept as content; reasoning stays empty
+    tst.test("Let me check the time.<|tool_call_start|>[get_time(city=\"Paris\")]<|tool_call_end|>")
+        .tools({ get_time_tool })
+        .expect(message_with_reasoning_content_and_multiple_tool_calls(
+            "", "Let me check the time.", { { "get_time", "{\"city\":\"Paris\"}" } }
+        ))
+        .run();
+
+    // Multiple tool calls in one bracketed list (parallel tool calls enabled)
+    tst.test("<|tool_call_start|>[special_function(arg1=1), special_function_with_opt(arg1=1, arg2=2)]<|tool_call_end|>")
+        .parallel_tool_calls(true)
+        .tools({ special_function_tool, special_function_tool_with_optional_param })
+        .expect_tool_calls({
+            { "special_function", R"({"arg1": 1})", {} },
+            { "special_function_with_opt", R"({"arg1": 1, "arg2": 2})", {} },
+        })
+        .run();
+
+    // Partial tool call (streaming): input is cut off right after "arg1="
+    tst.test("<|tool_call_start|>[special_function(arg1=")
+        .tools({ special_function_tool })
+        .is_partial(true)
+        .expect(simple_assist_msg("", "", "special_function", "{\"arg1\": "))
+        .run();
+
+    // Tool call with empty arguments yields an empty JSON object
+    tst.test("<|tool_call_start|>[empty_args()]<|tool_call_end|>")
+        .tools({ empty_args_tool })
+        .expect(simple_assist_msg("", "", "empty_args", "{}"))
+        .run();
+
+}
+
 static void test_template_output_peg_parsers(bool detailed_debug) {
    LOG_DBG("%s\n", __func__);

@@ -4038,49 +4136,30 @@ static void test_template_output_peg_parsers(bool detailed_debug) {
            .run();
    }

-    // LFM2-8B-A1B tests - uses <|tool_list_start|>/<|tool_list_end|> and <|tool_call_start|>[name(args)]<|tool_call_end|>
+    for (const char * tmpl : {
+             "models/templates/LFM2-8B-A1B.jinja",
+             "models/templates/LFM2.5-Instruct.jinja",
+             "models/templates/LFM2.5-8B-A1B.jinja",
+         }) {
+        test_lfm2_parser(tmpl, detailed_debug);
+    }
+
+    // Thinking cases only apply to LFM2.5-8B-A1B, the one LFM2 template that emits <think>
    {
-        auto tst = peg_tester("models/templates/LFM2-8B-A1B.jinja", detailed_debug);
+        auto tst = peg_tester("models/templates/LFM2.5-8B-A1B.jinja", detailed_debug);

-        // Basic content only
-        tst.test("Hello, world!\nWhat's up?").expect(message_assist).run();
+        // Reasoning is parsed independent of enable_thinking

-        // Single tool call without reasoning
-        tst.test("<|tool_call_start|>[special_function(arg1=1)]<|tool_call_end|>")
-            .tools({ special_function_tool })
-            .expect(message_assist_call)
-            .run();
-
-        // Tool call with string argument
-        tst.test("<|tool_call_start|>[get_time(city=\"XYZCITY\")]<|tool_call_end|>")
-            .tools({ get_time_tool })
-            .expect(message_with_tool_calls("get_time", "{\"city\":\"XYZCITY\"}"))
-            .run();
-
-        // Tool call with reasoning (enable_thinking=true)
+        // Tool call with reasoning
        tst.test("<think>I'm\nthinking</think><|tool_call_start|>[special_function(arg1=1)]<|tool_call_end|>")
-            .enable_thinking(true)
            .reasoning_format(COMMON_REASONING_FORMAT_AUTO)
            .tools({ special_function_tool })
            .expect(message_assist_call_thoughts)
            .run();

-        // Multiple tool calls (parallel)
-        tst.test("<|tool_call_start|>[special_function(arg1=1), special_function_with_opt(arg1=1, arg2=2)]<|tool_call_end|>")
-            .parallel_tool_calls(true)
-            .tools({
-                special_function_tool, special_function_tool_with_optional_param
-            })
-            .expect_tool_calls({
-                { "special_function", R"({"arg1": 1})", {} },
-                { "special_function_with_opt", R"({"arg1": 1, "arg2": 2})", {} },
-            })
-            .run();
-
        // Tool call with reasoning and content
        tst.test("<think>I need to call a function</think>"
                 "Let me check the time.<|tool_call_start|>[get_time(city=\"Paris\")]<|tool_call_end|>")
-            .enable_thinking(true)
            .reasoning_format(COMMON_REASONING_FORMAT_AUTO)
            .tools({ get_time_tool })
            .expect(message_with_reasoning_content_and_multiple_tool_calls(
@@ -4088,32 +4167,9 @@ static void test_template_output_peg_parsers(bool detailed_debug) {
            ))
            .run();

-        // Python tool with multiline code in string
-        tst.test("<|tool_call_start|>[python(code=\"def hello():\\n    print('hey')\")]<|tool_call_end|>")
-            .tools({ python_tool })
-            .expect_tool_calls({
-                { "python", R"#({"code": "def hello():\\n    print('hey')"})#", "" }
-            })
-            .run();
-
-        // Partial tool call (streaming)
-        tst.test("<|tool_call_start|>[special_function(arg1=")
-            .tools({ special_function_tool })
-            .is_partial(true)
-            .expect(simple_assist_msg("", "", "special_function", "{\"arg1\": "))
-            .run();
-
-        // Tool call with empty arguments
-        tst.test("<|tool_call_start|>[empty_args()]<|tool_call_end|>")
-            .tools({ empty_args_tool })
-            .expect(simple_assist_msg("", "", "empty_args", "{}"))
-            .run();
-
-        // fake tool call marker in reasoning
-        tst.test(
-               "<think>Let me think about <|tool_call_start|>[special_function(arg1=1)]<|tool_call_end|> hmm</think>"
-               "<|tool_call_start|>[special_function(arg1=1)]<|tool_call_end|>")
-            .enable_thinking(true)
+        // Fake tool call marker inside reasoning is not parsed as a call
+        tst.test("<think>Let me think about <|tool_call_start|>[special_function(arg1=1)]<|tool_call_end|> hmm</think>"
+                 "<|tool_call_start|>[special_function(arg1=1)]<|tool_call_end|>")
            .reasoning_format(COMMON_REASONING_FORMAT_AUTO)
            .tools({ special_function_tool })
            .expect_reasoning("Let me think about <|tool_call_start|>[special_function(arg1=1)]<|tool_call_end|> hmm")
@@ -4122,127 +4178,21 @@ static void test_template_output_peg_parsers(bool detailed_debug) {
            })
            .run();

-        // Continuation tests
-        tst.test("world!\nWhat's up?")
+        // enable_thinking=false still captures emitted reasoning
+        tst.test("<think>I'm\nthinking</think>Hello, world!\nWhat's up?")
+            .enable_thinking(false)
            .reasoning_format(COMMON_REASONING_FORMAT_AUTO)
-            .enable_thinking(true)
-            .messages({ message_user, message_assist_prefill_content })
-            .add_generation_prompt(false)
-            .continue_final_message(COMMON_CHAT_CONTINUATION_CONTENT)
-            .expect_reasoning("I'm thinking")
-            .expect_content("Hello, world!\nWhat's up?")
+            .expect(message_assist_thoughts)
            .run();

-        tst.test(" thinking</think>Hello, world!\nWhat's up?")
-            .reasoning_format(COMMON_REASONING_FORMAT_AUTO)
-            .enable_thinking(true)
-            .messages({ message_user, message_assist_prefill_reasoning })
-            .add_generation_prompt(false)
-            .continue_final_message(COMMON_CHAT_CONTINUATION_REASONING)
-            .expect_reasoning("I'm thinking")
-            .expect_content("Hello, world!\nWhat's up?")
-            .run();
-    }
-
-    // LFM2.5 tests - format <|tool_call_start|>[name(args)]<|tool_call_end|>
-    {
-        auto tst = peg_tester("models/templates/LFM2.5-Instruct.jinja", detailed_debug);
-
-        // Basic content only
-        tst.test("Hello, world!\nWhat's up?").expect(message_assist).run();
-
-        // Single tool call without reasoning
-        tst.test("<|tool_call_start|>[special_function(arg1=1)]<|tool_call_end|>")
-            .tools({ special_function_tool })
-            .expect(message_assist_call)
-            .run();
-
-        // Tool call with string argument
-        tst.test("<|tool_call_start|>[get_time(city=\"XYZCITY\")]<|tool_call_end|>")
-            .tools({ get_time_tool })
-            .expect(message_with_tool_calls("get_time", "{\"city\":\"XYZCITY\"}"))
-            .run();
-
-        // Python literals become JSON.
-        tst.test("<|tool_call_start|>[toggle(enabled=True)]<|tool_call_end|>")
-            .tools({ toggle_tool })
-            .expect(message_with_tool_calls("toggle", R"({"enabled": true})"))
-            .run();
-
-        tst.test("<|tool_call_start|>[set_nullable(value=None)]<|tool_call_end|>")
-            .tools({ nullable_tool })
-            .expect(message_with_tool_calls("set_nullable", R"({"value": null})"))
-            .run();
-
-        // Nested Python literal.
-        tst.test("<|tool_call_start|>[set_config(config={\"enabled\": True, \"count\": 3})]<|tool_call_end|>")
-            .tools({ config_tool })
-            .expect(message_with_tool_calls("set_config", R"({"config": {"enabled": true, "count": 3}})"))
-            .run();
-
-        // JSON literals are accepted too.
-        tst.test("<|tool_call_start|>[set_config(config={\"enabled\": true, \"note\": null})]<|tool_call_end|>")
-            .tools({ config_tool })
-            .expect(message_with_tool_calls("set_config", R"({"config": {"enabled": true, "note": null}})"))
-            .run();
-
-        // Dotted function name with structured args.
-        tst.test("<|tool_call_start|>[Calendar.create_event(title=\"demo\", participants=[\"Alice\", \"Bob\"], "
-                 "metadata={\"priority\": \"high\", \"reminder\": true})]<|tool_call_end|>")
-            .tools({ calendar_create_event_tool })
-            .expect(message_with_tool_calls(
-                "Calendar.create_event",
-                R"({"title": "demo", "participants": ["Alice", "Bob"], "metadata": {"priority": "high", "reminder": true}})"))
-            .run();
-
-        // Markdown links stay content.
-        tst.test("Use this format: [link text](url). Example: [Wikipedia](https://www.wikipedia.org).")
-            .tools({ get_time_tool })
-            .expect(simple_assist_msg("Use this format: [link text](url). Example: [Wikipedia](https://www.wikipedia.org)."))
-            .run();
-
-        // Tool call with reasoning (enable_thinking=true)
        tst.test("<think>I'm\nthinking</think><|tool_call_start|>[special_function(arg1=1)]<|tool_call_end|>")
-            .enable_thinking(true)
+            .enable_thinking(false)
            .reasoning_format(COMMON_REASONING_FORMAT_AUTO)
            .tools({ special_function_tool })
            .expect(message_assist_call_thoughts)
            .run();

-        // Multiple tool calls (parallel)
-        tst.test("<|tool_call_start|>[special_function(arg1=1), special_function_with_opt(arg1=1, arg2=2)]<|tool_call_end|>")
-            .parallel_tool_calls(true)
-            .tools({
-                special_function_tool, special_function_tool_with_optional_param
-            })
-            .expect_tool_calls({
-                { "special_function", R"({"arg1": 1})", {} },
-                { "special_function_with_opt", R"({"arg1": 1, "arg2": 2})", {} },
-            })
-            .run();
-
-        // Tool call with content before tool call
-        tst.test("Let me check the time.<|tool_call_start|>[get_time(city=\"Paris\")]<|tool_call_end|>")
-            .tools({ get_time_tool })
-            .expect(message_with_reasoning_content_and_multiple_tool_calls(
-                "", "Let me check the time.", { { "get_time", "{\"city\":\"Paris\"}" } }
-            ))
-            .run();
-
-        // Partial tool call (streaming)
-        tst.test("<|tool_call_start|>[special_function(arg1=")
-            .tools({ special_function_tool })
-            .is_partial(true)
-            .expect(simple_assist_msg("", "", "special_function", "{\"arg1\": "))
-            .run();
-
-        // Tool call with empty arguments
-        tst.test("<|tool_call_start|>[empty_args()]<|tool_call_end|>")
-            .tools({ empty_args_tool })
-            .expect(simple_assist_msg("", "", "empty_args", "{}"))
-            .run();
-
-        // Continuation tests
+        // Continuation: prefill content
        tst.test("world!\nWhat's up?")
            .reasoning_format(COMMON_REASONING_FORMAT_AUTO)
            .enable_thinking(true)
@@ -4253,6 +4203,7 @@ static void test_template_output_peg_parsers(bool detailed_debug) {
            .expect_content("Hello, world!\nWhat's up?")
            .run();

+        // Continuation: prefill reasoning
        tst.test(" thinking</think>Hello, world!\nWhat's up?")
            .reasoning_format(COMMON_REASONING_FORMAT_AUTO)
            .enable_thinking(true)
@@ -5478,18 +5429,25 @@ static void test_template_generation_prompt() {
        check(tmpls, continuation_reasoning(), "<|im_assistant|>assistant<|im_middle|><think>I'm");
    }

-    {
-        auto tmpls = read_templates("models/templates/LFM2-8B-A1B.jinja");
+    for (const char * tmpl : {
+             "models/templates/LFM2-8B-A1B.jinja",
+             "models/templates/LFM2.5-Instruct.jinja",
+             "models/templates/LFM2.5-8B-A1B.jinja",
+         }) {
+        auto tmpls = read_templates(tmpl);
        check(tmpls, basic(),                  "<|im_start|>assistant\n");
        check(tmpls, continuation_content(),   "<|im_start|>assistant\n<think>I'm thinking</think>Hello, ");
        check(tmpls, continuation_reasoning(), "<|im_start|>assistant\n<think>I'm");
    }

    {
-        auto tmpls = read_templates("models/templates/LFM2.5-Instruct.jinja");
-        check(tmpls, basic(),                  "<|im_start|>assistant\n");
-        check(tmpls, continuation_content(),   "<|im_start|>assistant\n<think>I'm thinking</think>Hello, ");
-        check(tmpls, continuation_reasoning(), "<|im_start|>assistant\n<think>I'm");
+        // 8B-A1B renders prior-turn reasoning via the "thinking" field
+        auto tmpls = read_templates("models/templates/LFM2.5-8B-A1B.jinja");
+        common_chat_templates_inputs inputs;
+        inputs.messages              = { message_user, message_assist_call_thoughts, tool_msg };
+        inputs.add_generation_prompt = true;
+        auto params = common_chat_templates_apply(tmpls.get(), inputs);
+        assert_contains(params.prompt, "<think>I'm\nthinking</think>");
    }

    {
--- a/tools/completion/completion.cpp
+++ b/tools/completion/completion.cpp
@@ -33,12 +33,8 @@
 #endif

 static llama_context           ** g_ctx;
-static llama_model             ** g_model;
 static common_sampler          ** g_smpl;
 static common_params            * g_params;
-static std::vector<llama_token> * g_input_tokens;
-static std::ostringstream       * g_output_ss;
-static std::vector<llama_token> * g_output_tokens;
 static bool is_interacting  = false;
 static bool need_insert_eot = false;

@@ -136,7 +132,6 @@ int llama_completion(int argc, char ** argv) {
    llama_context * ctx = nullptr;
    common_sampler * smpl = nullptr;

-    g_model = &model;
    g_ctx = &ctx;
    g_smpl = &smpl;

@@ -549,9 +544,9 @@ int llama_completion(int argc, char ** argv) {
    int n_consumed         = 0;
    int n_session_consumed = 0;

-    std::vector<int>   input_tokens;  g_input_tokens  = &input_tokens;
-    std::vector<int>   output_tokens; g_output_tokens = &output_tokens;
-    std::ostringstream output_ss;     g_output_ss     = &output_ss;
+    std::vector<int>   input_tokens;
+    std::vector<int>   output_tokens;
+    std::ostringstream output_ss;
    std::ostringstream assistant_ss; // for storing current assistant message, used in conversation mode

    // the first thing we will do is to output the prompt, so set color accordingly
--- a/tools/mtmd/clip-graph.h
+++ b/tools/mtmd/clip-graph.h
@@ -37,6 +37,9 @@ struct clip_graph {
    float kq_scale; // TODO: maybe move this to hparams
    const clip_flash_attn_type flash_attn_type;

+    // TODO [QWEN_VIDEO]: improve this in the future
+    int n_batch = 1;
+
    ggml_context_ptr ctx0_ptr;
    ggml_context * ctx0;
    ggml_cgraph * gf;
--- a/tools/mtmd/clip-impl.h
+++ b/tools/mtmd/clip-impl.h
@@ -480,10 +480,6 @@ struct clip_image_u8 {
        buf[idx + 2] = rgb[2];
    }

-    size_t n_pixels() const {
-        return (size_t) nx * (size_t) ny;
-    }
-
    size_t n_elements() const {
        return n_pixels() * 3;
    }
@@ -492,10 +488,16 @@ struct clip_image_u8 {
    std::vector<uint8_t> buf;
    int nx = 0;
    int ny = 0;
+
+    size_t n_pixels() const {
+        return (size_t) nx * (size_t) ny;
+    }
 };

 // For images, buf.size() == nx*ny*3
 //     Memory layout: RGBRGBRGB...
+// For seq, buf.size() == nx*ny*3*nt
+//     Memory layout: RGBRGB...RGBRGB... (nt times)
 // For audio, only one channel is used, buf.size() == nx*ny
 //     nx will be n_frames and ny will be n_mel
 struct clip_image_f32 {
@@ -544,10 +546,6 @@ struct clip_image_f32 {
        }
    }

-    size_t n_pixels() const {
-        return (size_t) nx_ * (size_t) ny_;
-    }
-
    size_t n_elements() const {
        return n_pixels() * 3;
    }
@@ -580,6 +578,10 @@ struct clip_image_f32 {
    std::vector<float> buf;
    int nx_ = 0;
    int ny_ = 0;
+
+    size_t n_pixels() const {
+        return (size_t) nx_ * (size_t) ny_;
+    }
 };

 //
@@ -627,6 +629,7 @@ static void clip_log_internal(enum ggml_log_level level, const char * format, ..
    va_end(args);
 }

+#define LOG_TRC(...) clip_log_internal(GGML_LOG_LEVEL_DEBUG, __VA_ARGS__)
 #define LOG_DBG(...) clip_log_internal(GGML_LOG_LEVEL_DEBUG, __VA_ARGS__)
 #define LOG_INF(...) clip_log_internal(GGML_LOG_LEVEL_INFO,  __VA_ARGS__)
 #define LOG_WRN(...) clip_log_internal(GGML_LOG_LEVEL_WARN,  __VA_ARGS__)
--- a/tools/mtmd/clip.cpp
+++ b/tools/mtmd/clip.cpp
@@ -527,7 +527,7 @@ ggml_tensor * clip_graph::build_inp() {
 }

 ggml_tensor * clip_graph::build_inp_raw(int channels) {
-    ggml_tensor * inp_raw = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, img.nx(), img.ny(), channels);
+    ggml_tensor * inp_raw = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, img.nx(), img.ny(), channels, n_batch);
    ggml_set_name(inp_raw, "inp_raw");
    ggml_set_input(inp_raw);
    return inp_raw;
@@ -848,8 +848,6 @@ ggml_tensor * clip_graph::build_patch_merge_permute(ggml_tensor * cur, int scale
 }

 static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32_batch & imgs) {
-    GGML_ASSERT(imgs.entries.size() == 1 && "n_batch > 1 is not supported");
-
    const clip_image_f32 & img = *imgs.entries[0];
    std::unique_ptr<clip_graph> builder;

@@ -1009,6 +1007,9 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
            GGML_ABORT("missing cgraph builder");
    }

+    // TODO [QWEN_VIDEO]: improve this in the future
+    builder->n_batch = imgs.entries.size();
+
    return builder->build();
 }

@@ -3479,12 +3480,15 @@ bool clip_image_encode(struct clip_ctx * ctx, const int n_threads, clip_image_f3

 bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_image_f32_batch * imgs_c_ptr, float * vec) {
    const clip_image_f32_batch & imgs = *imgs_c_ptr;
-    int batch_size = imgs.entries.size();
+    int n_batch_cur = imgs.entries.size();
+
+    // maximum supported batch size, usually == 2 for qwen-vl-based models
+    int n_batch_max = clip_model_n_batch_max(ctx);

    // TODO @ngxson : implement batch size > 1 as a loop
    //                we don't need true batching support because the cgraph will gonna be big anyway
-    if (batch_size != 1) {
-        return false; // only support batch size of 1
+    // reject empty batches as well: entries[0] is dereferenced further down in this function
+    if (n_batch_cur < 1 || n_batch_cur > n_batch_max) {
+        return false;
    }

    // if buffers are not allocated, we need to do a warmup run to allocate them
@@ -3555,18 +3559,20 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
        // └─────┘ │
        //   ──────┘ x B

-        for (size_t i = 0; i < imgs.entries.size(); i++) {
-            const int nx = imgs.entries[i]->nx();
-            const int ny = imgs.entries[i]->ny();
-            const int n = nx * ny;
+        // IMPORTANT: [QWEN_VIDEO] the batch dim is currently used for temporal dim in Qwen-VL models
+        // All entries must have the same spatial size (enforced by can_batch_with() during merging)
+        {
+            const int nx = imgs.entries[0]->nx();
+            const int ny = imgs.entries[0]->ny();
+            const int n  = nx * ny;

-            for (int b = 0; b < batch_size; b++) {
+            for (int b = 0; b < n_batch_cur; b++) {
                const auto & buf = imgs.entries[b]->get_ro_buf();
                float * batch_entry = inp_raw.data() + b * (3*n);
                for (int y = 0; y < ny; y++) {
                    for (int x = 0; x < nx; x++) {
-                        size_t base_src = 3*(y * nx + x); // idx of the first channel
-                        size_t base_dst =    y * nx + x;  // idx of the first channel
+                        size_t base_src = 3*(y * nx + x);
+                        size_t base_dst =    y * nx + x;
                        batch_entry[      base_dst] = buf[base_src    ];
                        batch_entry[1*n + base_dst] = buf[base_src + 1];
                        batch_entry[2*n + base_dst] = buf[base_src + 2];
@@ -4549,6 +4555,17 @@ bool clip_has_audio_encoder(const struct clip_ctx * ctx) {
    return ctx->model.modality == CLIP_MODALITY_AUDIO;
 }

+// Maximum number of images that may be encoded in one batch for this model.
+// Qwen-VL projectors (QWEN2VL / QWEN25VL / QWEN3VL) report 2, every other projector type reports 1.
+int clip_model_n_batch_max(const struct clip_ctx * ctx) {
+    const auto proj = ctx->proj_type();
+    const bool is_qwen_vl =
+        proj == PROJECTOR_TYPE_QWEN2VL  ||
+        proj == PROJECTOR_TYPE_QWEN25VL ||
+        proj == PROJECTOR_TYPE_QWEN3VL;
+    return is_qwen_vl ? 2 : 1;
+}
+
 //
 // API used internally with mtmd
 //
--- a/tools/mtmd/clip.h
+++ b/tools/mtmd/clip.h
@@ -20,6 +20,12 @@ struct clip_image_size {
    bool operator==(const clip_image_size & other) const {
        return width == other.width && height == other.height;
    }
+    bool operator!=(const clip_image_size & other) const { // exact negation of operator==
+        return !(*this == other);
+    }
+    int area() const { // width * height, returned as int
+        return width * height;
+    }
 };

 struct clip_image_f32;
@@ -101,6 +107,8 @@ bool clip_is_llava(const struct clip_ctx * ctx);
 bool clip_has_vision_encoder(const struct clip_ctx * ctx);
 bool clip_has_audio_encoder(const struct clip_ctx * ctx);

+int clip_model_n_batch_max(const struct clip_ctx * ctx);
+
 std::map<ggml_backend_dev_t, size_t> clip_get_mem_usage(const struct clip_ctx * ctx);

 struct clip_cap {
--- a/tools/mtmd/models/models.h
+++ b/tools/mtmd/models/models.h
@@ -31,10 +31,11 @@ struct clip_graph_pixtral : clip_graph {
 struct clip_graph_qwen2vl : clip_graph {
    clip_graph_qwen2vl(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
    ggml_cgraph * build() override;
+    ggml_tensor * build_inp_with_temporal_merge();
 };

-struct clip_graph_qwen3vl : clip_graph {
-    clip_graph_qwen3vl(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
+struct clip_graph_qwen3vl : clip_graph_qwen2vl {
+    clip_graph_qwen3vl(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph_qwen2vl(ctx, img) {}
    ggml_cgraph * build() override;
 };

--- a/tools/mtmd/models/qwen2vl.cpp
+++ b/tools/mtmd/models/qwen2vl.cpp
@@ -1,5 +1,34 @@
 #include "models.h"

+// Build the patch-embedding input, merging the temporal (batch) dimension.
+//
+// The two patch-embedding kernels are each applied to one input and the results are summed:
+//   - n_batch == 1 (still image): both kernels read the same image
+//   - n_batch == 2 (frame pair) : patch_embeddings_0 reads frame 0, patch_embeddings_1 reads frame 1
+// Any other n_batch aborts.
+ggml_tensor * clip_graph_qwen2vl::build_inp_with_temporal_merge() {
+    ggml_tensor * inp_raw = build_inp_raw();
+
+    GGML_ASSERT(img.nx() % (patch_size * 2) == 0);
+    GGML_ASSERT(img.ny() % (patch_size * 2) == 0);
+
+    // still image input: both convolutions read the same tensor
+    ggml_tensor * inp_0 = inp_raw;
+    ggml_tensor * inp_1 = inp_raw;
+
+    if (n_batch == 2) {
+        // 2 frames input (video input): take one 3-channel view per frame
+        const size_t nb1 = ggml_row_size(inp_raw->type, img.nx());
+        const size_t nb2 = ggml_row_size(inp_raw->type, img.nx() * img.ny());
+
+        inp_0 = ggml_view_3d(ctx0, inp_raw,
+                    img.nx(), img.ny(), 3, nb1, nb2, 0);
+        inp_1 = ggml_view_3d(ctx0, inp_raw,
+                    img.nx(), img.ny(), 3, nb1, nb2,
+                    nb2 * 3); // move to the second frame
+    } else if (n_batch != 1) {
+        GGML_ABORT("unsupported n_batch: only 1 (still image) or 2 (frame pair) is supported");
+    }
+
+    // single exit point, so every non-aborting path returns a value
+    return ggml_add(ctx0,
+        ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_0, patch_size, patch_size, 0, 0, 1, 1),
+        ggml_conv_2d(ctx0, model.patch_embeddings_1, inp_1, patch_size, patch_size, 0, 0, 1, 1));
+}
+
 ggml_cgraph * clip_graph_qwen2vl::build() {
    GGML_ASSERT(model.patch_bias == nullptr);
    GGML_ASSERT(model.class_embedding == nullptr);
@@ -16,17 +45,10 @@ ggml_cgraph * clip_graph_qwen2vl::build() {

    int mrope_sections[4] = {d_head/4, d_head/4, d_head/4, d_head/4};

-    ggml_tensor * inp_raw = build_inp_raw();
-    ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
-
-    GGML_ASSERT(img.nx() % (patch_size * 2) == 0);
-    GGML_ASSERT(img.ny() % (patch_size * 2) == 0);
+    ggml_tensor * inp = build_inp_with_temporal_merge();

    // second conv dimension
    {
-        auto inp_1 = ggml_conv_2d(ctx0, model.patch_embeddings_1, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
-        inp = ggml_add(ctx0, inp, inp_1);
-
        inp = ggml_permute(ctx0, inp, 1, 2, 0, 3);  // [w, h, c, b] -> [c, w, h, b]
        inp = ggml_cont_4d(
            ctx0, inp,
--- a/tools/mtmd/models/qwen3vl.cpp
+++ b/tools/mtmd/models/qwen3vl.cpp
@@ -13,17 +13,10 @@ ggml_cgraph * clip_graph_qwen3vl::build() {

    int mrope_sections[4] = {d_head/4, d_head/4, d_head/4, d_head/4};

-    ggml_tensor * inp_raw = build_inp_raw();
-    ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
+    ggml_tensor * inp = build_inp_with_temporal_merge();

-    GGML_ASSERT(img.nx() % (patch_size * 2) == 0);
-    GGML_ASSERT(img.ny() % (patch_size * 2) == 0);
-
-    // second conv dimension
+    // spatial merge
    {
-        auto inp_1 = ggml_conv_2d(ctx0, model.patch_embeddings_1, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
-        inp = ggml_add(ctx0, inp, inp_1);
-
        inp = ggml_permute(ctx0, inp, 1, 2, 0, 3);  // [w, h, c, b] -> [c, w, h, b]
        inp = ggml_cont_4d(
            ctx0, inp,
--- a/tools/mtmd/mtmd-image.cpp
+++ b/tools/mtmd/mtmd-image.cpp
@@ -1116,7 +1116,7 @@ bool mtmd_image_preprocessor_deepseekocr::preprocess(const clip_image_u8 & img,
    static constexpr int native_resolutions[] = { 1024 /* base */, 1280 /* large */ };
    // TODO: support 512 (tiny) and 640 (small) once we have eval data for them

-    const int64_t orig_area = static_cast<int64_t>(img.n_pixels());
+    const int64_t orig_area = static_cast<int64_t>(img.get_size().area());

    size_t  mode_i   = 0;
    int64_t min_diff = std::numeric_limits<int64_t>::max();
--- a/tools/mtmd/mtmd.cpp
+++ b/tools/mtmd/mtmd.cpp
@@ -24,10 +24,11 @@
 #include <climits>
 #include <vector>

-// represents raw image data, layout is RGBRGBRGB...
-// length of data must be nx * ny * 3
+// for still image data, layout is RGBRGBRGB...
+// length of data must be nx * ny * 3 bytes
+//
 // for audio bitmap: nx = sample count, ny = 1, layout is F32 F32 F32 ...
-// length of data must be nx * sizeof(float)
+// length of data must be nx * sizeof(float) bytes
 struct mtmd_bitmap {
    uint32_t nx = 0;
    uint32_t ny = 0;
@@ -35,7 +36,7 @@ struct mtmd_bitmap {
    bool is_audio = false; // true if the bitmap is audio

    mtmd_bitmap(const unsigned char * data, uint32_t nx, uint32_t ny)
-        : nx(nx), ny(ny) {
+        : nx(nx), ny(ny), is_audio(false) {
        if (data) {
            size_t data_size = (size_t)nx * ny * 3;
            this->data.resize(data_size);
@@ -64,6 +65,11 @@ struct mtmd_bitmap {
        return data.size();
    }

+    // [QWEN_VIDEO] two bitmaps can share a batch only when neither is audio
+    // and both have identical nx and ny
+    bool can_batch_with(const mtmd_bitmap & other) const {
+        if (is_audio || other.is_audio) {
+            return false;
+        }
+        return nx == other.nx && ny == other.ny;
+    }
+
  private:
    std::vector<unsigned char> data;
 };
@@ -750,16 +756,55 @@ struct mtmd_tokenizer {
        cur.entries.clear();
        std::vector<std::string> parts = split_text(input_text, ctx->media_marker);
        size_t i_bm = 0; // index of the current bitmap
+
+        // [QWEN_VIDEO] handle frame merging for models that support it (i.e. qwen-vl)
+        int n_merge_frames = 1;
+        if (ctx->ctx_v) {
+            n_merge_frames = clip_model_n_batch_max(ctx->ctx_v);
+            GGML_ASSERT(n_merge_frames <= 2 && "we only support merging maximum 2 images for now; open an issue if this model supports merging more");
+        }
+
+        // Group the bitmaps per media marker: every entry of merged_bitmaps is consumed by exactly one marker.
+        // When frame merging is enabled, two directly adjacent markers whose bitmaps can_batch_with()
+        // each other are collapsed into a single marker that carries both bitmaps.
+        std::vector<std::vector<const mtmd_bitmap *>> merged_bitmaps;
+        if (n_merge_frames > 1) {
+            size_t i_bm_scan = 0;
+            for (size_t i = 0; i < parts.size(); ++i) {
+                if (parts[i] != ctx->media_marker) {
+                    continue;
+                }
+                if (i_bm_scan >= bitmaps.size()) {
+                    // more markers than bitmaps: stop instead of reading past the end of bitmaps,
+                    // the marker/bitmap count checks later in this function report the mismatch
+                    break;
+                }
+                if (i + 1 < parts.size()
+                        && parts[i + 1] == ctx->media_marker
+                        && i_bm_scan + 1 < bitmaps.size()) {
+                    const mtmd_bitmap * bm_a = bitmaps[i_bm_scan];
+                    const mtmd_bitmap * bm_b = bitmaps[i_bm_scan + 1];
+                    if (bm_a->can_batch_with(*bm_b)) {
+                        LOG_DBG("%s: merging 2 frames at bitmap index %zu and %zu\n", __func__, i_bm_scan, i_bm_scan + 1);
+                        merged_bitmaps.push_back({bm_a, bm_b});
+                        parts.erase(parts.begin() + i + 1); // remove the second marker
+                        i_bm_scan += 2;
+                        continue;
+                    }
+                }
+                LOG_DBG("%s: no merging for bitmap index %zu\n", __func__, i_bm_scan);
+                merged_bitmaps.push_back({bitmaps[i_bm_scan]});
+                ++i_bm_scan;
+            }
+            // bitmaps that have no marker are kept as single entries, so that a surplus of bitmaps
+            // is still detected by the count check after the marker loop instead of being dropped
+            for (; i_bm_scan < bitmaps.size(); ++i_bm_scan) {
+                merged_bitmaps.push_back({bitmaps[i_bm_scan]});
+            }
+        } else {
+            for (size_t i = 0; i < bitmaps.size(); ++i) {
+                merged_bitmaps.push_back({bitmaps[i]});
+            }
+        }
+
+        i_bm = 0;
        for (auto & part : parts) {
            if (part == ctx->media_marker) {
                // this is a marker, we should add the next bitmap
-                if (i_bm >= bitmaps.size()) {
+                if (i_bm >= merged_bitmaps.size()) {
                    LOG_ERR("%s: error: number of bitmaps (%zu) does not match number of markers (%zu)\n",
-                            __func__, bitmaps.size(), parts.size() - 1);
+                            __func__, merged_bitmaps.size(), parts.size() - 1);
                    return 1;
                }
-                const mtmd_bitmap * bitmap = bitmaps[i_bm++];
-                int32_t res = add_media(bitmap);
+                auto & bmps = merged_bitmaps[i_bm++];
+                int32_t res = add_media(bmps);
                if (res != 0) {
                    return res;
                }
@@ -794,9 +839,9 @@ struct mtmd_tokenizer {
            }
        }

-        if (i_bm != bitmaps.size()) {
+        if (i_bm != merged_bitmaps.size()) {
            LOG_ERR("%s: error: number of bitmaps (%zu) does not match number of markers (%zu)\n",
-                    __func__, bitmaps.size(), parts.size() - 1);
+                    __func__, merged_bitmaps.size(), parts.size() - 1);
            return 1;
        }

@@ -835,8 +880,10 @@ struct mtmd_tokenizer {
        }
    }

-    int32_t add_media(const mtmd_bitmap * bitmap) {
-        if (!bitmap->is_audio) {
+    int32_t add_media(std::vector<const mtmd_bitmap *> & bitmaps) {
+        GGML_ASSERT(!bitmaps.empty());
+
+        if (!bitmaps[0]->is_audio) {
            // handle image

            if (!ctx->ctx_v) {
@@ -848,27 +895,44 @@ struct mtmd_tokenizer {
                add_text(ctx->img_beg, true); // add image begin token
            }

-            // sanity check
-            if (bitmap->nx <= 0 || bitmap->ny <= 0) {
-                LOG_ERR("%s: error: invalid bitmap dimensions: nx = %d, ny = %d\n",
-                        __func__, bitmap->nx, bitmap->ny);
-                return 2;
-            }
-            GGML_ASSERT(ctx->image_preproc != nullptr);
+            // TODO @ngxson : this is quite hacky because preprocessor only support batch with one single element, that need to be fixed in the future (e.g. by changing the preprocessor interface always take single input)

-            // convert mtmd_bitmap to clip_image_u8
-            clip_image_u8_ptr img_u8(clip_image_u8_init());
-            img_u8->set_size(
-                {(int)bitmap->nx, (int)bitmap->ny},
-                bitmap->is_placeholder());
-            img_u8->cpy_buf(bitmap->get_ro_buf());
-
-            // preprocess image
            clip_image_f32_batch batch_f32;
-            bool ok = ctx->image_preproc->preprocess(*img_u8, batch_f32);
-            if (!ok) {
-                LOG_ERR("Unable to preprocess image\n");
-                return 2;
+
+            for (const auto * bmp : bitmaps) {
+                // sanity check
+                GGML_ASSERT(!bmp->is_audio);
+                GGML_ASSERT(ctx->image_preproc != nullptr);
+                if (bmp->nx <= 0 || bmp->ny <= 0) {
+                    LOG_ERR("%s: error: invalid bitmap dimensions: nx = %d, ny = %d\n",
+                            __func__, bmp->nx, bmp->ny);
+                    return 2;
+                }
+
+                // convert mtmd_bitmap to clip_image_u8
+                clip_image_u8_ptr img_u8(clip_image_u8_init());
+                img_u8->set_size(
+                    {(int)bmp->nx, (int)bmp->ny},
+                    bmp->is_placeholder());
+                img_u8->cpy_buf(bmp->get_ro_buf());
+
+                // preprocess image
+                clip_image_f32_batch tmp_batch;
+                bool ok = ctx->image_preproc->preprocess(*img_u8, tmp_batch);
+                if (!ok) {
+                    LOG_ERR("Unable to preprocess image\n");
+                    return 2;
+                }
+
+                // move entries and grid dimensions to the "global" batch_f32
+                for (auto & entry : tmp_batch.entries) {
+                    batch_f32.entries.emplace_back(std::move(entry));
+                }
+
+                // for llava-uhd style, we need to handle grid too
+                // we don't care about overwriting these values for now because llava-uhd doesn't support batching anyway
+                batch_f32.grid_x = tmp_batch.grid_x;
+                batch_f32.grid_y = tmp_batch.grid_y;
            }

            // Annotate llava-next style tiles so clip_n_output_tokens accounts
@@ -896,11 +960,14 @@ struct mtmd_tokenizer {
                || ctx->slice_tmpl == MTMD_SLICE_TMPL_STEP3VL
                || (ctx->slice_tmpl == MTMD_SLICE_TMPL_LFM2 && has_tiling_grid)
            ) {
+                // [QWEN_VIDEO] we do not support "frame merging" for llava-uhd style, so no batching for now
+                GGML_ASSERT(bitmaps.size() == 1);
+
                const int n_col = batch_f32.grid_x;
                const int n_row = batch_f32.grid_y;
                // split batch into chunks of single images
                // NOTE: batch_f32 will be invalidated after this call
-                auto chunks = split_batch_to_chunk(std::move(batch_f32), bitmap->id);
+                auto chunks = split_batch_to_chunk(std::move(batch_f32), bitmaps[0]->id);
                GGML_ASSERT(chunks.size() > 0);

                auto ov_chunk = std::move(chunks.front());
@@ -954,6 +1021,10 @@ struct mtmd_tokenizer {
                size_t n_tokens = 0;
                for (const auto & e : batch_f32.entries) {
                    n_tokens += clip_n_output_tokens(ctx->ctx_v, e.get());
+                    if (clip_model_n_batch_max(ctx->ctx_v) == 2) {
+                        // [QWEN_VIDEO] pair input is merged to the same embd, so only count as one image
+                        break;
+                    }
                }

                mtmd_image_tokens_ptr image_tokens(new mtmd_image_tokens);
@@ -976,7 +1047,7 @@ struct mtmd_tokenizer {
                    GGML_ASSERT(n_tokens == (size_t)image_tokens->n_tokens());
                }
                image_tokens->batch_f32 = std::move(batch_f32);
-                image_tokens->id = bitmap->id; // optional
+                image_tokens->id = bitmaps[0]->id; // optional

                LOG_DBG("image_tokens->nx = %d\n", image_tokens->nx);
                LOG_DBG("image_tokens->ny = %d\n", image_tokens->ny);
@@ -1001,6 +1072,9 @@ struct mtmd_tokenizer {
        } else {
            // handle audio

+            GGML_ASSERT(bitmaps.size() == 1); // no batching support for now
+            auto & bitmap = bitmaps[0];
+
            if (!ctx->ctx_a) {
                LOG_ERR("%s: error: model does not support audio input\n", __func__);
                return 2;
--- a/tools/mtmd/mtmd.h
+++ b/tools/mtmd/mtmd.h
@@ -133,6 +133,8 @@ MTMD_API int mtmd_get_audio_sample_rate(const mtmd_context * ctx);
 // if bitmap is image:
 //     length of data must be nx * ny * 3
 //     the data is in RGBRGBRGB... format
+//     note: some video-capable models (i.e. qwen-vl) can merge consecutive bitmaps
+//           into one chunk, mtmd_tokenize() will automatically handle this
 // if bitmap is audio:
 //     length of data must be n_samples * sizeof(float)
 //     the data is in float format (PCM F32)
Author	SHA1	Message	Date
Sigbjørn Skjæret	8a091c47ab	spec : fix vocab compatibility check (#24256 )	2026-06-07 14:43:52 +03:00
konradmb	465b1f0e75	arg: Skip mmproj download when user supplied mmproj (#24239 )	2026-06-07 11:18:44 +02:00
Sigbjørn Skjæret	f71af352a5	convert : fix Gemma4 with no audio encoder (#24242 )	2026-06-07 08:43:05 +02:00
Sigbjørn Skjæret	3f7c79d7b5	docker : bump cuda13 to 13.3.0 (#24228 )	2026-06-07 08:31:58 +02:00
Tarek Dakhran	98d5e8ba8a	common/chat : fix LFM2/LFM2.5 reasoning round-trip and <think> leak (#24234 ) * common/chat : fix LFM2 reasoning round-trip and stray <think> leak * Gate by reasoning format and whether the template supports <think>	2026-06-06 22:39:21 +02:00
Xuan-Son Nguyen	31e82494c0	mtmd: support "frame merge" for qwen-vl-based models (#21858 ) * feat: add video support for Qwen3.5 * various clean up * revise the design * fix llava-uhd case * nits * nits 2 --------- Co-authored-by: andrewmd5 <1297077+andrewmd5@users.noreply.github.com>	2026-06-06 21:17:25 +02:00
Adrien Gallouët	6b80c74f28	completion : remove useless statics (#24226 ) Signed-off-by: Adrien Gallouët <angt@huggingface.co>	2026-06-06 12:16:16 +02:00