Compare commits

...

7 Commits
b9541 ... b9548

Author SHA1 Message Date
Sigbjørn Skjæret
8a091c47ab spec : fix vocab compatibility check (#24256) 2026-06-07 14:43:52 +03:00
konradmb
465b1f0e75 arg: Skip mmproj download when user supplied mmproj (#24239) 2026-06-07 11:18:44 +02:00
Sigbjørn Skjæret
f71af352a5 convert : fix Gemma4 with no audio encoder (#24242) 2026-06-07 08:43:05 +02:00
Sigbjørn Skjæret
3f7c79d7b5 docker : bump cuda13 to 13.3.0 (#24228) 2026-06-07 08:31:58 +02:00
Tarek Dakhran
98d5e8ba8a common/chat : fix LFM2/LFM2.5 reasoning round-trip and <think> leak (#24234)
* common/chat : fix LFM2 reasoning round-trip and stray <think> leak
* Gate by reasoning format and whether the template supports <think>
2026-06-06 22:39:21 +02:00
Xuan-Son Nguyen
31e82494c0 mtmd: support "frame merge" for qwen-vl-based models (#21858)
* feat: add video support for Qwen3.5

* various clean up

* revise the design

* fix llava-uhd case

* nits

* nits 2

---------

Co-authored-by: andrewmd5 <1297077+andrewmd5@users.noreply.github.com>
2026-06-06 21:17:25 +02:00
Adrien Gallouët
6b80c74f28 completion : remove useless statics (#24226)
Signed-off-by: Adrien Gallouët <angt@huggingface.co>
2026-06-06 12:16:16 +02:00
18 changed files with 473 additions and 270 deletions

View File

@@ -82,8 +82,8 @@ jobs:
{ "tag": "cpu", "dockerfile": ".devops/s390x.Dockerfile", "platforms": "linux/s390x", "full": true, "light": true, "server": true, "free_disk_space": false, "runs_on": "ubuntu-24.04-s390x" },
{ "tag": "cuda cuda12", "dockerfile": ".devops/cuda.Dockerfile", "cuda_version": "12.8.1", "platforms": "linux/amd64", "full": true, "light": true, "server": true, "free_disk_space": true, "runs_on": "ubuntu-24.04" },
{ "tag": "cuda cuda12", "dockerfile": ".devops/cuda.Dockerfile", "cuda_version": "12.8.1", "platforms": "linux/arm64", "full": true, "light": true, "server": true, "free_disk_space": true, "runs_on": "ubuntu-24.04-arm" },
{ "tag": "cuda13", "dockerfile": ".devops/cuda.Dockerfile", "cuda_version": "13.1.1", "platforms": "linux/amd64", "full": true, "light": true, "server": true, "free_disk_space": true, "runs_on": "ubuntu-24.04" },
{ "tag": "cuda13", "dockerfile": ".devops/cuda.Dockerfile", "cuda_version": "13.1.1", "platforms": "linux/arm64", "full": true, "light": true, "server": true, "free_disk_space": true, "runs_on": "ubuntu-24.04-arm" },
{ "tag": "cuda13", "dockerfile": ".devops/cuda.Dockerfile", "cuda_version": "13.3.0", "platforms": "linux/amd64", "full": true, "light": true, "server": true, "free_disk_space": true, "runs_on": "ubuntu-24.04" },
{ "tag": "cuda13", "dockerfile": ".devops/cuda.Dockerfile", "cuda_version": "13.3.0", "platforms": "linux/arm64", "full": true, "light": true, "server": true, "free_disk_space": true, "runs_on": "ubuntu-24.04-arm" },
{ "tag": "musa", "dockerfile": ".devops/musa.Dockerfile", "platforms": "linux/amd64", "full": true, "light": true, "server": true, "free_disk_space": true, "runs_on": "ubuntu-24.04" },
{ "tag": "intel", "dockerfile": ".devops/intel.Dockerfile", "platforms": "linux/amd64", "full": true, "light": true, "server": true, "free_disk_space": true, "runs_on": "ubuntu-24.04" },
{ "tag": "vulkan", "dockerfile": ".devops/vulkan.Dockerfile", "platforms": "linux/amd64", "full": true, "light": true, "server": true, "free_disk_space": false, "runs_on": "ubuntu-24.04" },

View File

@@ -444,7 +444,7 @@ bool common_params_handle_models(common_params & params, llama_example curr_ex)
opts.offline = params.offline;
opts.skip_download = params.skip_download;
opts.download_mtp = spec_type_draft_mtp;
opts.download_mmproj = !params.no_mmproj;
opts.download_mmproj = !params.no_mmproj && params.mmproj.path.empty() && params.mmproj.url.empty();
// sub-models (draft, mmproj, vocoder) are explicitly specified by the user,
// so we should not auto-discover mtp/mmproj siblings for them

View File

@@ -1625,8 +1625,17 @@ static common_chat_params common_chat_params_init_lfm2(const common_chat_templat
const std::string THINK_END = "</think>";
const std::string GEN_PROMPT = "<|im_start|>assistant\n";
data.prompt = common_chat_template_direct_apply_impl(tmpl, inputs);
data.generation_prompt = common_chat_template_generation_prompt_impl(tmpl, inputs);
// Copy reasoning to the "thinking" field the template expects
auto adjusted_messages = json::array();
for (auto msg : inputs.messages) {
if (msg.contains("reasoning_content") && msg.at("reasoning_content").is_string()) {
msg["thinking"] = msg.at("reasoning_content");
}
adjusted_messages.push_back(msg);
}
data.prompt = common_chat_template_direct_apply_impl(tmpl, inputs, adjusted_messages);
data.generation_prompt = common_chat_template_generation_prompt_impl(tmpl, inputs, adjusted_messages);
data.format = COMMON_CHAT_FORMAT_PEG_NATIVE;
data.supports_thinking = true;
data.preserved_tokens = { TOOL_CALL_START, TOOL_CALL_END, THINK_START, THINK_END };
@@ -1639,7 +1648,9 @@ static common_chat_params common_chat_params_init_lfm2(const common_chat_templat
data.thinking_end_tag = THINK_END;
auto has_tools = inputs.tools.is_array() && !inputs.tools.empty();
auto extract_reasoning = inputs.reasoning_format != COMMON_REASONING_FORMAT_NONE;
// Gate by reasoning format and whether the template supports <think>
auto extract_reasoning = inputs.reasoning_format != COMMON_REASONING_FORMAT_NONE &&
tmpl.source().find(THINK_START) != std::string::npos;
auto include_grammar = has_tools && inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_NONE;
if (inputs.has_continuation()) {
@@ -1658,7 +1669,7 @@ static common_chat_params common_chat_params_init_lfm2(const common_chat_templat
auto end = p.end();
auto reasoning = p.eps();
if (extract_reasoning && inputs.enable_thinking) {
if (extract_reasoning) {
reasoning = p.optional(THINK_START + p.reasoning(p.until(THINK_END)) + THINK_END);
}

View File

@@ -58,10 +58,10 @@ static bool common_speculative_are_compatible(
const llama_vocab * vocab_tgt = llama_model_get_vocab(model_tgt);
const llama_vocab * vocab_dft = llama_model_get_vocab(model_dft);
const bool vocab_type_tgt = llama_vocab_type(vocab_tgt);
const auto vocab_type_tgt = llama_vocab_type(vocab_tgt);
LOG_DBG("%s: vocab_type tgt: %d\n", __func__, vocab_type_tgt);
const bool vocab_type_dft = llama_vocab_type(vocab_dft);
const auto vocab_type_dft = llama_vocab_type(vocab_dft);
LOG_DBG("%s: vocab_type dft: %d\n", __func__, vocab_type_dft);
if (vocab_type_tgt != vocab_type_dft) {

View File

@@ -812,10 +812,11 @@ class Gemma4VisionAudioModel(MmprojModel):
self.gguf_writer.add_vision_attention_layernorm_eps(self.hparams_vision.get("layer_norm_eps", 1e-6))
# audio params
assert self.hparams_audio is not None
self.gguf_writer.add_clip_audio_projector_type(gguf.VisionProjectorType.GEMMA4A)
self.gguf_writer.add_audio_num_mel_bins(self.hparams_audio["feat_in"])
self.gguf_writer.add_audio_attention_layernorm_eps(self.hparams_audio.get("layer_norm_eps", 1e-6))
if self.has_audio_encoder:
assert self.hparams_audio is not None
self.gguf_writer.add_clip_audio_projector_type(gguf.VisionProjectorType.GEMMA4A)
self.gguf_writer.add_audio_num_mel_bins(self.hparams_audio["feat_in"])
self.gguf_writer.add_audio_attention_layernorm_eps(self.hparams_audio.get("layer_norm_eps", 1e-6))
def is_audio_tensor(self, name: str) -> bool:
return "audio_tower" in name or "embed_audio" in name

View File

@@ -0,0 +1,115 @@
{{- bos_token -}}
{%- set preserve_thinking = preserve_thinking | default(false) -%}
{#- Format a single tool-call argument value: strings are wrapped in single quotes, mappings are serialized with the tojson filter, and any other value is converted with the string filter. -#}
{%- macro format_arg_value(arg_value) -%}
{%- if arg_value is string -%}
{{- "'" + arg_value + "'" -}}
{%- elif arg_value is mapping -%}
{{- arg_value | tojson -}}
{%- else -%}
{{- arg_value | string -}}
{%- endif -%}
{%- endmacro -%}
{#- Flatten message content to text: a string is emitted unchanged; otherwise each item is appended in order, with "image" items becoming the image placeholder tag, "text" items contributing their text, and any other item type being serialized with tojson. -#}
{%- macro parse_content(content) -%}
{%- if content is string -%}
{{- content -}}
{%- else -%}
{%- set _ns = namespace(result="") -%}
{%- for item in content -%}
{%- if item["type"] == "image" -%}
{%- set _ns.result = _ns.result + "<image>" -%}
{%- elif item["type"] == "text" -%}
{%- set _ns.result = _ns.result + item["text"] -%}
{%- else -%}
{%- set _ns.result = _ns.result + item | tojson -%}
{%- endif -%}
{%- endfor -%}
{{- _ns.result -}}
{%- endif -%}
{%- endmacro -%}
{#- Render a list of tool calls as name(arg=value, ...) entries joined by ", " inside square brackets and wrapped in the tool_call_start / tool_call_end markers. Argument values are formatted with format_arg_value. NOTE(review): relies on function.arguments supporting .items(), i.e. being a mapping rather than a JSON string - confirm against callers. -#}
{%- macro render_tool_calls(tool_calls) -%}
{%- set tool_calls_ns = namespace(tool_calls=[]) -%}
{%- for tool_call in tool_calls -%}
{%- set func_name = tool_call["function"]["name"] -%}
{%- set func_args = tool_call["function"]["arguments"] -%}
{%- set args_ns = namespace(arg_strings=[]) -%}
{%- for arg_name, arg_value in func_args.items() -%}
{%- set args_ns.arg_strings = args_ns.arg_strings + [arg_name + "=" + format_arg_value(arg_value)] -%}
{%- endfor -%}
{%- set tool_calls_ns.tool_calls = tool_calls_ns.tool_calls + [func_name + "(" + (args_ns.arg_strings | join(", ")) + ")"] -%}
{%- endfor -%}
{{- "<|tool_call_start|>[" + (tool_calls_ns.tool_calls | join(", ")) + "]<|tool_call_end|>" -}}
{%- endmacro -%}
{%- set ns = namespace(system_prompt="", last_user_index=-1) -%}
{%- if messages[0]["role"] == "system" -%}
{%- if messages[0].get("content") -%}
{%- set ns.system_prompt = parse_content(messages[0]["content"]) -%}
{%- endif -%}
{%- set messages = messages[1:] -%}
{%- endif -%}
{%- if tools -%}
{%- set ns.system_prompt = ns.system_prompt + ("\n" if ns.system_prompt else "") + "List of tools: [" -%}
{%- for tool in tools -%}
{%- if tool is not string -%}
{%- set tool = tool | tojson -%}
{%- endif -%}
{%- set ns.system_prompt = ns.system_prompt + tool -%}
{%- if not loop.last -%}
{%- set ns.system_prompt = ns.system_prompt + ", " -%}
{%- endif -%}
{%- endfor -%}
{%- set ns.system_prompt = ns.system_prompt + "]" -%}
{%- endif -%}
{%- if ns.system_prompt -%}
{{- "<|im_start|>system\n" + ns.system_prompt + "<|im_end|>\n" -}}
{%- endif -%}
{%- for message in messages -%}
{%- if message["role"] == "user" -%}
{%- set ns.last_user_index = loop.index0 -%}
{%- endif -%}
{%- endfor -%}
{%- for message in messages -%}
{{- "<|im_start|>" + message.role + "\n" -}}
{%- if message.role == "assistant" -%}
{%- generation -%}
{%- if message.thinking is defined and (preserve_thinking or loop.index0 > ns.last_user_index) -%}
{{- "<think>" + message.thinking + "</think>" -}}
{%- endif -%}
{%- set _cfm_tag = "CONTINUE_FINAL_MESSAGE_TAG " -%}
{%- set _has_cfm = false -%}
{%- if message.content is defined -%}
{%- set content = parse_content(message.content) -%}
{%- if not (preserve_thinking or loop.index0 > ns.last_user_index) -%}
{%- if "</think>" in content -%}
{%- set content = content.split("</think>")[-1] | trim -%}
{%- endif -%}
{%- endif -%}
{%- if message.tool_calls is defined and content.endswith(_cfm_tag) -%}
{%- set _has_cfm = true -%}
{%- set _trunc_len = (content | length) - (_cfm_tag | length) -%}
{{- content[:_trunc_len] -}}
{%- else -%}
{{- content -}}
{%- endif -%}
{%- endif -%}
{%- if message.tool_calls is defined -%}
{{- render_tool_calls(message.tool_calls) -}}
{%- endif -%}
{%- if _has_cfm -%}
{{- _cfm_tag -}}
{%- endif -%}
{{- "<|im_end|>\n" -}}
{%- endgeneration -%}
{%- else %}
{%- if message.get("content") -%}
{{- parse_content(message["content"]) -}}
{%- endif -%}
{{- "<|im_end|>\n" -}}
{%- endif %}
{%- endfor -%}
{%- if add_generation_prompt -%}
{{- "<|im_start|>assistant\n" -}}
{%- endif -%}

View File

@@ -1825,6 +1825,104 @@ static void test_convert_responses_to_chatcmpl() {
}
}
// Shared LFM2 parser cases - all variants use one output format and parser.
// Runs the reasoning-free cases (plain content, complete tool calls in the
// <|tool_call_start|>[name(args)]<|tool_call_end|> form, and a partial/streamed call)
// against a single chat template.
//   template_path  - path of the Jinja chat template handed to peg_tester
//   detailed_debug - forwarded unchanged to peg_tester
static void test_lfm2_parser(const std::string & template_path, bool detailed_debug) {
auto tst = peg_tester(template_path, detailed_debug);
// Basic content only
tst.test("Hello, world!\nWhat's up?").expect(message_assist).run();
// Single tool call without reasoning
tst.test("<|tool_call_start|>[special_function(arg1=1)]<|tool_call_end|>")
.tools({ special_function_tool })
.expect(message_assist_call)
.run();
// Tool call with string argument
tst.test("<|tool_call_start|>[get_time(city=\"XYZCITY\")]<|tool_call_end|>")
.tools({ get_time_tool })
.expect(message_with_tool_calls("get_time", "{\"city\":\"XYZCITY\"}"))
.run();
// Python literals become JSON (True -> true, None -> null)
tst.test("<|tool_call_start|>[toggle(enabled=True)]<|tool_call_end|>")
.tools({ toggle_tool })
.expect(message_with_tool_calls("toggle", R"({"enabled": true})"))
.run();
tst.test("<|tool_call_start|>[set_nullable(value=None)]<|tool_call_end|>")
.tools({ nullable_tool })
.expect(message_with_tool_calls("set_nullable", R"({"value": null})"))
.run();
// Nested Python literal
tst.test("<|tool_call_start|>[set_config(config={\"enabled\": True, \"count\": 3})]<|tool_call_end|>")
.tools({ config_tool })
.expect(message_with_tool_calls("set_config", R"({"config": {"enabled": true, "count": 3}})"))
.run();
// JSON literals are accepted too
tst.test("<|tool_call_start|>[set_config(config={\"enabled\": true, \"note\": null})]<|tool_call_end|>")
.tools({ config_tool })
.expect(message_with_tool_calls("set_config", R"({"config": {"enabled": true, "note": null}})"))
.run();
// Dotted function name with structured args
tst.test("<|tool_call_start|>[Calendar.create_event(title=\"demo\", participants=[\"Alice\", \"Bob\"], "
"metadata={\"priority\": \"high\", \"reminder\": true})]<|tool_call_end|>")
.tools({ calendar_create_event_tool })
.expect(message_with_tool_calls(
"Calendar.create_event",
R"({"title": "demo", "participants": ["Alice", "Bob"], "metadata": {"priority": "high", "reminder": true}})"))
.run();
// Markdown links stay content (square brackets outside the tool-call markers are not a call)
tst.test("Use this format: [link text](url). Example: [Wikipedia](https://www.wikipedia.org).")
.tools({ get_time_tool })
.expect(simple_assist_msg("Use this format: [link text](url). Example: [Wikipedia](https://www.wikipedia.org)."))
.run();
// Python tool with multiline code in string
tst.test("<|tool_call_start|>[python(code=\"def hello():\\n print('hey')\")]<|tool_call_end|>")
.tools({ python_tool })
.expect_tool_calls({
{ "python", R"#({"code": "def hello():\\n print('hey')"})#", "" }
})
.run();
// Content before tool call (no reasoning)
tst.test("Let me check the time.<|tool_call_start|>[get_time(city=\"Paris\")]<|tool_call_end|>")
.tools({ get_time_tool })
.expect(message_with_reasoning_content_and_multiple_tool_calls(
"", "Let me check the time.", { { "get_time", "{\"city\":\"Paris\"}" } }
))
.run();
// Multiple tool calls (parallel)
tst.test("<|tool_call_start|>[special_function(arg1=1), special_function_with_opt(arg1=1, arg2=2)]<|tool_call_end|>")
.parallel_tool_calls(true)
.tools({ special_function_tool, special_function_tool_with_optional_param })
.expect_tool_calls({
{ "special_function", R"({"arg1": 1})", {} },
{ "special_function_with_opt", R"({"arg1": 1, "arg2": 2})", {} },
})
.run();
// Partial tool call (streaming): input stops mid-argument, so the expected arguments are truncated JSON
tst.test("<|tool_call_start|>[special_function(arg1=")
.tools({ special_function_tool })
.is_partial(true)
.expect(simple_assist_msg("", "", "special_function", "{\"arg1\": "))
.run();
// Tool call with empty arguments
tst.test("<|tool_call_start|>[empty_args()]<|tool_call_end|>")
.tools({ empty_args_tool })
.expect(simple_assist_msg("", "", "empty_args", "{}"))
.run();
}
static void test_template_output_peg_parsers(bool detailed_debug) {
LOG_DBG("%s\n", __func__);
@@ -4038,49 +4136,30 @@ static void test_template_output_peg_parsers(bool detailed_debug) {
.run();
}
// LFM2-8B-A1B tests - uses <|tool_list_start|>/<|tool_list_end|> and <|tool_call_start|>[name(args)]<|tool_call_end|>
for (const char * tmpl : {
"models/templates/LFM2-8B-A1B.jinja",
"models/templates/LFM2.5-Instruct.jinja",
"models/templates/LFM2.5-8B-A1B.jinja",
}) {
test_lfm2_parser(tmpl, detailed_debug);
}
// Thinking cases only apply to LFM2.5-8B-A1B, the one LFM2 template that emits <think>
{
auto tst = peg_tester("models/templates/LFM2-8B-A1B.jinja", detailed_debug);
auto tst = peg_tester("models/templates/LFM2.5-8B-A1B.jinja", detailed_debug);
// Basic content only
tst.test("Hello, world!\nWhat's up?").expect(message_assist).run();
// Reasoning is parsed independent of enable_thinking
// Single tool call without reasoning
tst.test("<|tool_call_start|>[special_function(arg1=1)]<|tool_call_end|>")
.tools({ special_function_tool })
.expect(message_assist_call)
.run();
// Tool call with string argument
tst.test("<|tool_call_start|>[get_time(city=\"XYZCITY\")]<|tool_call_end|>")
.tools({ get_time_tool })
.expect(message_with_tool_calls("get_time", "{\"city\":\"XYZCITY\"}"))
.run();
// Tool call with reasoning (enable_thinking=true)
// Tool call with reasoning
tst.test("<think>I'm\nthinking</think><|tool_call_start|>[special_function(arg1=1)]<|tool_call_end|>")
.enable_thinking(true)
.reasoning_format(COMMON_REASONING_FORMAT_AUTO)
.tools({ special_function_tool })
.expect(message_assist_call_thoughts)
.run();
// Multiple tool calls (parallel)
tst.test("<|tool_call_start|>[special_function(arg1=1), special_function_with_opt(arg1=1, arg2=2)]<|tool_call_end|>")
.parallel_tool_calls(true)
.tools({
special_function_tool, special_function_tool_with_optional_param
})
.expect_tool_calls({
{ "special_function", R"({"arg1": 1})", {} },
{ "special_function_with_opt", R"({"arg1": 1, "arg2": 2})", {} },
})
.run();
// Tool call with reasoning and content
tst.test("<think>I need to call a function</think>"
"Let me check the time.<|tool_call_start|>[get_time(city=\"Paris\")]<|tool_call_end|>")
.enable_thinking(true)
.reasoning_format(COMMON_REASONING_FORMAT_AUTO)
.tools({ get_time_tool })
.expect(message_with_reasoning_content_and_multiple_tool_calls(
@@ -4088,32 +4167,9 @@ static void test_template_output_peg_parsers(bool detailed_debug) {
))
.run();
// Python tool with multiline code in string
tst.test("<|tool_call_start|>[python(code=\"def hello():\\n print('hey')\")]<|tool_call_end|>")
.tools({ python_tool })
.expect_tool_calls({
{ "python", R"#({"code": "def hello():\\n print('hey')"})#", "" }
})
.run();
// Partial tool call (streaming)
tst.test("<|tool_call_start|>[special_function(arg1=")
.tools({ special_function_tool })
.is_partial(true)
.expect(simple_assist_msg("", "", "special_function", "{\"arg1\": "))
.run();
// Tool call with empty arguments
tst.test("<|tool_call_start|>[empty_args()]<|tool_call_end|>")
.tools({ empty_args_tool })
.expect(simple_assist_msg("", "", "empty_args", "{}"))
.run();
// fake tool call marker in reasoning
tst.test(
"<think>Let me think about <|tool_call_start|>[special_function(arg1=1)]<|tool_call_end|> hmm</think>"
"<|tool_call_start|>[special_function(arg1=1)]<|tool_call_end|>")
.enable_thinking(true)
// Fake tool call marker inside reasoning is not parsed as a call
tst.test("<think>Let me think about <|tool_call_start|>[special_function(arg1=1)]<|tool_call_end|> hmm</think>"
"<|tool_call_start|>[special_function(arg1=1)]<|tool_call_end|>")
.reasoning_format(COMMON_REASONING_FORMAT_AUTO)
.tools({ special_function_tool })
.expect_reasoning("Let me think about <|tool_call_start|>[special_function(arg1=1)]<|tool_call_end|> hmm")
@@ -4122,127 +4178,21 @@ static void test_template_output_peg_parsers(bool detailed_debug) {
})
.run();
// Continuation tests
tst.test("world!\nWhat's up?")
// enable_thinking=false still captures emitted reasoning
tst.test("<think>I'm\nthinking</think>Hello, world!\nWhat's up?")
.enable_thinking(false)
.reasoning_format(COMMON_REASONING_FORMAT_AUTO)
.enable_thinking(true)
.messages({ message_user, message_assist_prefill_content })
.add_generation_prompt(false)
.continue_final_message(COMMON_CHAT_CONTINUATION_CONTENT)
.expect_reasoning("I'm thinking")
.expect_content("Hello, world!\nWhat's up?")
.expect(message_assist_thoughts)
.run();
tst.test(" thinking</think>Hello, world!\nWhat's up?")
.reasoning_format(COMMON_REASONING_FORMAT_AUTO)
.enable_thinking(true)
.messages({ message_user, message_assist_prefill_reasoning })
.add_generation_prompt(false)
.continue_final_message(COMMON_CHAT_CONTINUATION_REASONING)
.expect_reasoning("I'm thinking")
.expect_content("Hello, world!\nWhat's up?")
.run();
}
// LFM2.5 tests - format <|tool_call_start|>[name(args)]<|tool_call_end|>
{
auto tst = peg_tester("models/templates/LFM2.5-Instruct.jinja", detailed_debug);
// Basic content only
tst.test("Hello, world!\nWhat's up?").expect(message_assist).run();
// Single tool call without reasoning
tst.test("<|tool_call_start|>[special_function(arg1=1)]<|tool_call_end|>")
.tools({ special_function_tool })
.expect(message_assist_call)
.run();
// Tool call with string argument
tst.test("<|tool_call_start|>[get_time(city=\"XYZCITY\")]<|tool_call_end|>")
.tools({ get_time_tool })
.expect(message_with_tool_calls("get_time", "{\"city\":\"XYZCITY\"}"))
.run();
// Python literals become JSON.
tst.test("<|tool_call_start|>[toggle(enabled=True)]<|tool_call_end|>")
.tools({ toggle_tool })
.expect(message_with_tool_calls("toggle", R"({"enabled": true})"))
.run();
tst.test("<|tool_call_start|>[set_nullable(value=None)]<|tool_call_end|>")
.tools({ nullable_tool })
.expect(message_with_tool_calls("set_nullable", R"({"value": null})"))
.run();
// Nested Python literal.
tst.test("<|tool_call_start|>[set_config(config={\"enabled\": True, \"count\": 3})]<|tool_call_end|>")
.tools({ config_tool })
.expect(message_with_tool_calls("set_config", R"({"config": {"enabled": true, "count": 3}})"))
.run();
// JSON literals are accepted too.
tst.test("<|tool_call_start|>[set_config(config={\"enabled\": true, \"note\": null})]<|tool_call_end|>")
.tools({ config_tool })
.expect(message_with_tool_calls("set_config", R"({"config": {"enabled": true, "note": null}})"))
.run();
// Dotted function name with structured args.
tst.test("<|tool_call_start|>[Calendar.create_event(title=\"demo\", participants=[\"Alice\", \"Bob\"], "
"metadata={\"priority\": \"high\", \"reminder\": true})]<|tool_call_end|>")
.tools({ calendar_create_event_tool })
.expect(message_with_tool_calls(
"Calendar.create_event",
R"({"title": "demo", "participants": ["Alice", "Bob"], "metadata": {"priority": "high", "reminder": true}})"))
.run();
// Markdown links stay content.
tst.test("Use this format: [link text](url). Example: [Wikipedia](https://www.wikipedia.org).")
.tools({ get_time_tool })
.expect(simple_assist_msg("Use this format: [link text](url). Example: [Wikipedia](https://www.wikipedia.org)."))
.run();
// Tool call with reasoning (enable_thinking=true)
tst.test("<think>I'm\nthinking</think><|tool_call_start|>[special_function(arg1=1)]<|tool_call_end|>")
.enable_thinking(true)
.enable_thinking(false)
.reasoning_format(COMMON_REASONING_FORMAT_AUTO)
.tools({ special_function_tool })
.expect(message_assist_call_thoughts)
.run();
// Multiple tool calls (parallel)
tst.test("<|tool_call_start|>[special_function(arg1=1), special_function_with_opt(arg1=1, arg2=2)]<|tool_call_end|>")
.parallel_tool_calls(true)
.tools({
special_function_tool, special_function_tool_with_optional_param
})
.expect_tool_calls({
{ "special_function", R"({"arg1": 1})", {} },
{ "special_function_with_opt", R"({"arg1": 1, "arg2": 2})", {} },
})
.run();
// Tool call with content before tool call
tst.test("Let me check the time.<|tool_call_start|>[get_time(city=\"Paris\")]<|tool_call_end|>")
.tools({ get_time_tool })
.expect(message_with_reasoning_content_and_multiple_tool_calls(
"", "Let me check the time.", { { "get_time", "{\"city\":\"Paris\"}" } }
))
.run();
// Partial tool call (streaming)
tst.test("<|tool_call_start|>[special_function(arg1=")
.tools({ special_function_tool })
.is_partial(true)
.expect(simple_assist_msg("", "", "special_function", "{\"arg1\": "))
.run();
// Tool call with empty arguments
tst.test("<|tool_call_start|>[empty_args()]<|tool_call_end|>")
.tools({ empty_args_tool })
.expect(simple_assist_msg("", "", "empty_args", "{}"))
.run();
// Continuation tests
// Continuation: prefill content
tst.test("world!\nWhat's up?")
.reasoning_format(COMMON_REASONING_FORMAT_AUTO)
.enable_thinking(true)
@@ -4253,6 +4203,7 @@ static void test_template_output_peg_parsers(bool detailed_debug) {
.expect_content("Hello, world!\nWhat's up?")
.run();
// Continuation: prefill reasoning
tst.test(" thinking</think>Hello, world!\nWhat's up?")
.reasoning_format(COMMON_REASONING_FORMAT_AUTO)
.enable_thinking(true)
@@ -5478,18 +5429,25 @@ static void test_template_generation_prompt() {
check(tmpls, continuation_reasoning(), "<|im_assistant|>assistant<|im_middle|><think>I'm");
}
{
auto tmpls = read_templates("models/templates/LFM2-8B-A1B.jinja");
for (const char * tmpl : {
"models/templates/LFM2-8B-A1B.jinja",
"models/templates/LFM2.5-Instruct.jinja",
"models/templates/LFM2.5-8B-A1B.jinja",
}) {
auto tmpls = read_templates(tmpl);
check(tmpls, basic(), "<|im_start|>assistant\n");
check(tmpls, continuation_content(), "<|im_start|>assistant\n<think>I'm thinking</think>Hello, ");
check(tmpls, continuation_reasoning(), "<|im_start|>assistant\n<think>I'm");
}
{
auto tmpls = read_templates("models/templates/LFM2.5-Instruct.jinja");
check(tmpls, basic(), "<|im_start|>assistant\n");
check(tmpls, continuation_content(), "<|im_start|>assistant\n<think>I'm thinking</think>Hello, ");
check(tmpls, continuation_reasoning(), "<|im_start|>assistant\n<think>I'm");
// 8B-A1B renders prior-turn reasoning via the "thinking" field
auto tmpls = read_templates("models/templates/LFM2.5-8B-A1B.jinja");
common_chat_templates_inputs inputs;
inputs.messages = { message_user, message_assist_call_thoughts, tool_msg };
inputs.add_generation_prompt = true;
auto params = common_chat_templates_apply(tmpls.get(), inputs);
assert_contains(params.prompt, "<think>I'm\nthinking</think>");
}
{

View File

@@ -33,12 +33,8 @@
#endif
static llama_context ** g_ctx;
static llama_model ** g_model;
static common_sampler ** g_smpl;
static common_params * g_params;
static std::vector<llama_token> * g_input_tokens;
static std::ostringstream * g_output_ss;
static std::vector<llama_token> * g_output_tokens;
static bool is_interacting = false;
static bool need_insert_eot = false;
@@ -136,7 +132,6 @@ int llama_completion(int argc, char ** argv) {
llama_context * ctx = nullptr;
common_sampler * smpl = nullptr;
g_model = &model;
g_ctx = &ctx;
g_smpl = &smpl;
@@ -549,9 +544,9 @@ int llama_completion(int argc, char ** argv) {
int n_consumed = 0;
int n_session_consumed = 0;
std::vector<int> input_tokens; g_input_tokens = &input_tokens;
std::vector<int> output_tokens; g_output_tokens = &output_tokens;
std::ostringstream output_ss; g_output_ss = &output_ss;
std::vector<int> input_tokens;
std::vector<int> output_tokens;
std::ostringstream output_ss;
std::ostringstream assistant_ss; // for storing current assistant message, used in conversation mode
// the first thing we will do is to output the prompt, so set color accordingly

View File

@@ -37,6 +37,9 @@ struct clip_graph {
float kq_scale; // TODO: maybe move this to hparams
const clip_flash_attn_type flash_attn_type;
// TODO [QWEN_VIDEO]: improve this in the future
int n_batch = 1;
ggml_context_ptr ctx0_ptr;
ggml_context * ctx0;
ggml_cgraph * gf;

View File

@@ -480,10 +480,6 @@ struct clip_image_u8 {
buf[idx + 2] = rgb[2];
}
size_t n_pixels() const {
return (size_t) nx * (size_t) ny;
}
size_t n_elements() const {
return n_pixels() * 3;
}
@@ -492,10 +488,16 @@ struct clip_image_u8 {
std::vector<uint8_t> buf;
int nx = 0;
int ny = 0;
size_t n_pixels() const {
return (size_t) nx * (size_t) ny;
}
};
// For images, buf.size() == nx*ny*3
// Memory layout: RGBRGBRGB...
// For seq, buf.size() == nx*ny*3*nt
// Memory layout: RGBRGB...RGBRGB... (nt times)
// For audio, only one channel is used, buf.size() == nx*ny
// nx will be n_frames and ny will be n_mel
struct clip_image_f32 {
@@ -544,10 +546,6 @@ struct clip_image_f32 {
}
}
size_t n_pixels() const {
return (size_t) nx_ * (size_t) ny_;
}
size_t n_elements() const {
return n_pixels() * 3;
}
@@ -580,6 +578,10 @@ struct clip_image_f32 {
std::vector<float> buf;
int nx_ = 0;
int ny_ = 0;
size_t n_pixels() const {
return (size_t) nx_ * (size_t) ny_;
}
};
//
@@ -627,6 +629,7 @@ static void clip_log_internal(enum ggml_log_level level, const char * format, ..
va_end(args);
}
#define LOG_TRC(...) clip_log_internal(GGML_LOG_LEVEL_DEBUG, __VA_ARGS__)
#define LOG_DBG(...) clip_log_internal(GGML_LOG_LEVEL_DEBUG, __VA_ARGS__)
#define LOG_INF(...) clip_log_internal(GGML_LOG_LEVEL_INFO, __VA_ARGS__)
#define LOG_WRN(...) clip_log_internal(GGML_LOG_LEVEL_WARN, __VA_ARGS__)

View File

@@ -527,7 +527,7 @@ ggml_tensor * clip_graph::build_inp() {
}
ggml_tensor * clip_graph::build_inp_raw(int channels) {
ggml_tensor * inp_raw = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, img.nx(), img.ny(), channels);
ggml_tensor * inp_raw = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, img.nx(), img.ny(), channels, n_batch);
ggml_set_name(inp_raw, "inp_raw");
ggml_set_input(inp_raw);
return inp_raw;
@@ -848,8 +848,6 @@ ggml_tensor * clip_graph::build_patch_merge_permute(ggml_tensor * cur, int scale
}
static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32_batch & imgs) {
GGML_ASSERT(imgs.entries.size() == 1 && "n_batch > 1 is not supported");
const clip_image_f32 & img = *imgs.entries[0];
std::unique_ptr<clip_graph> builder;
@@ -1009,6 +1007,9 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
GGML_ABORT("missing cgraph builder");
}
// TODO [QWEN_VIDEO]: improve this in the future
builder->n_batch = imgs.entries.size();
return builder->build();
}
@@ -3479,12 +3480,15 @@ bool clip_image_encode(struct clip_ctx * ctx, const int n_threads, clip_image_f3
bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_image_f32_batch * imgs_c_ptr, float * vec) {
const clip_image_f32_batch & imgs = *imgs_c_ptr;
int batch_size = imgs.entries.size();
int n_batch_cur = imgs.entries.size();
// maximum supported batch size, usually == 2 for qwen-vl-based models
int n_batch_max = clip_model_n_batch_max(ctx);
// TODO @ngxson : implement batch size > 1 as a loop
// we don't need true batching support because the cgraph will gonna be big anyway
if (batch_size != 1) {
return false; // only support batch size of 1
if (n_batch_cur > n_batch_max) {
return false;
}
// if buffers are not allocated, we need to do a warmup run to allocate them
@@ -3555,18 +3559,20 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
// └─────┘ │
// ──────┘ x B
for (size_t i = 0; i < imgs.entries.size(); i++) {
const int nx = imgs.entries[i]->nx();
const int ny = imgs.entries[i]->ny();
const int n = nx * ny;
// IMPORTANT: [QWEN_VIDEO] the batch dim is currently used for temporal dim in Qwen-VL models
// All entries must have the same spatial size (enforced by can_batch_with() during merging)
{
const int nx = imgs.entries[0]->nx();
const int ny = imgs.entries[0]->ny();
const int n = nx * ny;
for (int b = 0; b < batch_size; b++) {
for (int b = 0; b < n_batch_cur; b++) {
const auto & buf = imgs.entries[b]->get_ro_buf();
float * batch_entry = inp_raw.data() + b * (3*n);
for (int y = 0; y < ny; y++) {
for (int x = 0; x < nx; x++) {
size_t base_src = 3*(y * nx + x); // idx of the first channel
size_t base_dst = y * nx + x; // idx of the first channel
size_t base_src = 3*(y * nx + x);
size_t base_dst = y * nx + x;
batch_entry[ base_dst] = buf[base_src ];
batch_entry[1*n + base_dst] = buf[base_src + 1];
batch_entry[2*n + base_dst] = buf[base_src + 2];
@@ -4549,6 +4555,17 @@ bool clip_has_audio_encoder(const struct clip_ctx * ctx) {
return ctx->model.modality == CLIP_MODALITY_AUDIO;
}
// Largest number of batch entries supported for this model's projector type:
// 2 for the Qwen2-VL / Qwen2.5-VL / Qwen3-VL projectors, 1 for every other projector.
int clip_model_n_batch_max(const struct clip_ctx * ctx) {
    const auto proj = ctx->proj_type();

    const bool is_qwen_vl =
        proj == PROJECTOR_TYPE_QWEN2VL  ||
        proj == PROJECTOR_TYPE_QWEN25VL ||
        proj == PROJECTOR_TYPE_QWEN3VL;

    return is_qwen_vl ? 2 : 1;
}
//
// API used internally with mtmd
//

View File

@@ -20,6 +20,12 @@ struct clip_image_size {
bool operator==(const clip_image_size & other) const {
return width == other.width && height == other.height;
}
// Two sizes differ exactly when operator== reports them as not equal.
bool operator!=(const clip_image_size & other) const {
    const bool same = (*this == other);
    return !same;
}
// Product of width and height, computed and returned as int.
int area() const {
return width * height;
}
};
struct clip_image_f32;
@@ -101,6 +107,8 @@ bool clip_is_llava(const struct clip_ctx * ctx);
bool clip_has_vision_encoder(const struct clip_ctx * ctx);
bool clip_has_audio_encoder(const struct clip_ctx * ctx);
int clip_model_n_batch_max(const struct clip_ctx * ctx);
std::map<ggml_backend_dev_t, size_t> clip_get_mem_usage(const struct clip_ctx * ctx);
struct clip_cap {

View File

@@ -31,10 +31,11 @@ struct clip_graph_pixtral : clip_graph {
// graph builder for the qwen2vl vision encoder
struct clip_graph_qwen2vl : clip_graph {
clip_graph_qwen2vl(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
ggml_cgraph * build() override;
// builds the patch-embedded input tensor; accepts a batch of 1 (still image)
// or 2 (a pair of frames, each fed to its own patch-embedding kernel)
ggml_tensor * build_inp_with_temporal_merge();
};
struct clip_graph_qwen3vl : clip_graph {
clip_graph_qwen3vl(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
struct clip_graph_qwen3vl : clip_graph_qwen2vl {
clip_graph_qwen3vl(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph_qwen2vl(ctx, img) {}
ggml_cgraph * build() override;
};

View File

@@ -1,5 +1,34 @@
#include "models.h"
// Builds the patch-embedded input as conv(patch_embeddings_0, a) + conv(patch_embeddings_1, b).
//   n_batch == 1 : a and b are both the raw input (still image)
//   n_batch == 2 : a is the first frame, b the second frame of the raw input (video pair)
// Any other n_batch aborts. Image width/height must be multiples of 2 * patch_size.
ggml_tensor * clip_graph_qwen2vl::build_inp_with_temporal_merge() {
    ggml_tensor * inp_raw = build_inp_raw();

    GGML_ASSERT(img.nx() % (patch_size * 2) == 0);
    GGML_ASSERT(img.ny() % (patch_size * 2) == 0);

    // byte strides of one row and of one channel plane of the raw input
    const size_t row_bytes   = ggml_row_size(inp_raw->type, img.nx());
    const size_t plane_bytes = ggml_row_size(inp_raw->type, img.nx() * img.ny());

    const bool is_still = (n_batch == 1);
    const bool is_pair  = (n_batch == 2);
    if (!is_still && !is_pair) {
        GGML_ASSERT(false && "n_batch > 2 is not supported");
    }

    // still image: both kernels read the same tensor
    ggml_tensor * frame_0 = inp_raw;
    ggml_tensor * frame_1 = inp_raw;
    if (is_pair) {
        // 2 frames input (video input): one 3-channel view per frame
        frame_0 = ggml_view_3d(ctx0, inp_raw,
            img.nx(), img.ny(), 3, row_bytes, plane_bytes, 0);
        frame_1 = ggml_view_3d(ctx0, inp_raw,
            img.nx(), img.ny(), 3, row_bytes, plane_bytes,
            plane_bytes * 3); // skip the 3 planes of the first frame
    }

    ggml_tensor * emb_0 = ggml_conv_2d(ctx0, model.patch_embeddings_0, frame_0, patch_size, patch_size, 0, 0, 1, 1);
    ggml_tensor * emb_1 = ggml_conv_2d(ctx0, model.patch_embeddings_1, frame_1, patch_size, patch_size, 0, 0, 1, 1);
    return ggml_add(ctx0, emb_0, emb_1);
}
ggml_cgraph * clip_graph_qwen2vl::build() {
GGML_ASSERT(model.patch_bias == nullptr);
GGML_ASSERT(model.class_embedding == nullptr);
@@ -16,17 +45,10 @@ ggml_cgraph * clip_graph_qwen2vl::build() {
int mrope_sections[4] = {d_head/4, d_head/4, d_head/4, d_head/4};
ggml_tensor * inp_raw = build_inp_raw();
ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
GGML_ASSERT(img.nx() % (patch_size * 2) == 0);
GGML_ASSERT(img.ny() % (patch_size * 2) == 0);
ggml_tensor * inp = build_inp_with_temporal_merge();
// second conv dimension
{
auto inp_1 = ggml_conv_2d(ctx0, model.patch_embeddings_1, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
inp = ggml_add(ctx0, inp, inp_1);
inp = ggml_permute(ctx0, inp, 1, 2, 0, 3); // [w, h, c, b] -> [c, w, h, b]
inp = ggml_cont_4d(
ctx0, inp,

View File

@@ -13,17 +13,10 @@ ggml_cgraph * clip_graph_qwen3vl::build() {
int mrope_sections[4] = {d_head/4, d_head/4, d_head/4, d_head/4};
ggml_tensor * inp_raw = build_inp_raw();
ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
ggml_tensor * inp = build_inp_with_temporal_merge();
GGML_ASSERT(img.nx() % (patch_size * 2) == 0);
GGML_ASSERT(img.ny() % (patch_size * 2) == 0);
// second conv dimension
// spatial merge
{
auto inp_1 = ggml_conv_2d(ctx0, model.patch_embeddings_1, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
inp = ggml_add(ctx0, inp, inp_1);
inp = ggml_permute(ctx0, inp, 1, 2, 0, 3); // [w, h, c, b] -> [c, w, h, b]
inp = ggml_cont_4d(
ctx0, inp,

View File

@@ -1116,7 +1116,7 @@ bool mtmd_image_preprocessor_deepseekocr::preprocess(const clip_image_u8 & img,
static constexpr int native_resolutions[] = { 1024 /* base */, 1280 /* large */ };
// TODO: support 512 (tiny) and 640 (small) once we have eval data for them
const int64_t orig_area = static_cast<int64_t>(img.n_pixels());
const int64_t orig_area = static_cast<int64_t>(img.get_size().area());
size_t mode_i = 0;
int64_t min_diff = std::numeric_limits<int64_t>::max();

View File

@@ -24,10 +24,11 @@
#include <climits>
#include <vector>
// represents raw image data, layout is RGBRGBRGB...
// length of data must be nx * ny * 3
// for still image data, layout is RGBRGBRGB...
// length of data must be nx * ny * 3 bytes
//
// for audio bitmap: nx = sample count, ny = 1, layout is F32 F32 F32 ...
// length of data must be nx * sizeof(float)
// length of data must be nx * sizeof(float) bytes
struct mtmd_bitmap {
uint32_t nx = 0;
uint32_t ny = 0;
@@ -35,7 +36,7 @@ struct mtmd_bitmap {
bool is_audio = false; // true if the bitmap is audio
mtmd_bitmap(const unsigned char * data, uint32_t nx, uint32_t ny)
: nx(nx), ny(ny) {
: nx(nx), ny(ny), is_audio(false) {
if (data) {
size_t data_size = (size_t)nx * ny * 3;
this->data.resize(data_size);
@@ -64,6 +65,11 @@ struct mtmd_bitmap {
return data.size();
}
bool can_batch_with(const mtmd_bitmap & other) const {
    // [QWEN_VIDEO] audio never batches; two images batch only when their sizes match
    if (is_audio || other.is_audio) {
        return false;
    }
    return nx == other.nx && ny == other.ny;
}
private:
std::vector<unsigned char> data;
};
@@ -750,16 +756,55 @@ struct mtmd_tokenizer {
cur.entries.clear();
std::vector<std::string> parts = split_text(input_text, ctx->media_marker);
size_t i_bm = 0; // index of the current bitmap
// [QWEN_VIDEO] handle frame merging for models that support it (e.g. qwen-vl)
int n_merge_frames = 1;
if (ctx->ctx_v) {
n_merge_frames = clip_model_n_batch_max(ctx->ctx_v);
GGML_ASSERT(n_merge_frames <= 2 && "we only support merging maximum 2 images for now; open an issue if this model supports merging more");
}
std::vector<std::vector<const mtmd_bitmap *>> merged_bitmaps;
// Group the input bitmaps per media marker: one group per marker, holding either a
// single bitmap or a pair of batchable bitmaps placed under two adjacent markers.
// Marker/bitmap count mismatches must stay detectable by the checks that follow,
// which compare the number of markers against merged_bitmaps.size().
if (n_merge_frames > 1) {
    size_t i_bm_scan = 0;
    for (size_t i = 0; i < parts.size(); ++i) {
        if (parts[i] != ctx->media_marker) {
            continue;
        }
        if (i_bm_scan >= bitmaps.size()) {
            // more markers than bitmaps: stop instead of reading past the end of
            // `bitmaps`; the marker loop below reports the mismatch
            break;
        }
        if (i + 1 < parts.size()
                && parts[i + 1] == ctx->media_marker
                && i_bm_scan + 1 < bitmaps.size()) {
            const mtmd_bitmap * bm_a = bitmaps[i_bm_scan];
            const mtmd_bitmap * bm_b = bitmaps[i_bm_scan + 1];
            if (bm_a->can_batch_with(*bm_b)) {
                LOG_DBG("%s: merging 2 frames at bitmap index %zu and %zu\n", __func__, i_bm_scan, i_bm_scan + 1);
                merged_bitmaps.push_back({bm_a, bm_b});
                parts.erase(parts.begin() + i + 1); // remove the second marker
                i_bm_scan += 2;
                continue;
            }
        }
        LOG_DBG("%s: no merging for bitmap index %zu\n", __func__, i_bm_scan);
        merged_bitmaps.push_back({bitmaps[i_bm_scan]});
        ++i_bm_scan;
    }
    // more bitmaps than markers: keep the leftovers as single-frame groups so the
    // count check after the marker loop fails instead of silently dropping them
    for (; i_bm_scan < bitmaps.size(); ++i_bm_scan) {
        merged_bitmaps.push_back({bitmaps[i_bm_scan]});
    }
} else {
    merged_bitmaps.reserve(bitmaps.size());
    for (size_t i = 0; i < bitmaps.size(); ++i) {
        merged_bitmaps.push_back({bitmaps[i]});
    }
}
i_bm = 0;
for (auto & part : parts) {
if (part == ctx->media_marker) {
// this is a marker, we should add the next bitmap
if (i_bm >= bitmaps.size()) {
if (i_bm >= merged_bitmaps.size()) {
LOG_ERR("%s: error: number of bitmaps (%zu) does not match number of markers (%zu)\n",
__func__, bitmaps.size(), parts.size() - 1);
__func__, merged_bitmaps.size(), parts.size() - 1);
return 1;
}
const mtmd_bitmap * bitmap = bitmaps[i_bm++];
int32_t res = add_media(bitmap);
auto & bmps = merged_bitmaps[i_bm++];
int32_t res = add_media(bmps);
if (res != 0) {
return res;
}
@@ -794,9 +839,9 @@ struct mtmd_tokenizer {
}
}
if (i_bm != bitmaps.size()) {
if (i_bm != merged_bitmaps.size()) {
LOG_ERR("%s: error: number of bitmaps (%zu) does not match number of markers (%zu)\n",
__func__, bitmaps.size(), parts.size() - 1);
__func__, merged_bitmaps.size(), parts.size() - 1);
return 1;
}
@@ -835,8 +880,10 @@ struct mtmd_tokenizer {
}
}
int32_t add_media(const mtmd_bitmap * bitmap) {
if (!bitmap->is_audio) {
int32_t add_media(std::vector<const mtmd_bitmap *> & bitmaps) {
GGML_ASSERT(!bitmaps.empty());
if (!bitmaps[0]->is_audio) {
// handle image
if (!ctx->ctx_v) {
@@ -848,27 +895,44 @@ struct mtmd_tokenizer {
add_text(ctx->img_beg, true); // add image begin token
}
// sanity check
if (bitmap->nx <= 0 || bitmap->ny <= 0) {
LOG_ERR("%s: error: invalid bitmap dimensions: nx = %d, ny = %d\n",
__func__, bitmap->nx, bitmap->ny);
return 2;
}
GGML_ASSERT(ctx->image_preproc != nullptr);
// TODO @ngxson : this is quite hacky because preprocessor only support batch with one single element, that need to be fixed in the future (e.g. by changing the preprocessor interface always take single input)
// convert mtmd_bitmap to clip_image_u8
clip_image_u8_ptr img_u8(clip_image_u8_init());
img_u8->set_size(
{(int)bitmap->nx, (int)bitmap->ny},
bitmap->is_placeholder());
img_u8->cpy_buf(bitmap->get_ro_buf());
// preprocess image
clip_image_f32_batch batch_f32;
bool ok = ctx->image_preproc->preprocess(*img_u8, batch_f32);
if (!ok) {
LOG_ERR("Unable to preprocess image\n");
return 2;
for (const auto * bmp : bitmaps) {
// sanity check
GGML_ASSERT(!bmp->is_audio);
GGML_ASSERT(ctx->image_preproc != nullptr);
if (bmp->nx <= 0 || bmp->ny <= 0) {
LOG_ERR("%s: error: invalid bitmap dimensions: nx = %d, ny = %d\n",
__func__, bmp->nx, bmp->ny);
return 2;
}
// convert mtmd_bitmap to clip_image_u8
clip_image_u8_ptr img_u8(clip_image_u8_init());
img_u8->set_size(
{(int)bmp->nx, (int)bmp->ny},
bmp->is_placeholder());
img_u8->cpy_buf(bmp->get_ro_buf());
// preprocess image
clip_image_f32_batch tmp_batch;
bool ok = ctx->image_preproc->preprocess(*img_u8, tmp_batch);
if (!ok) {
LOG_ERR("Unable to preprocess image\n");
return 2;
}
// move entries and grid dimensions to the "global" batch_f32
for (auto & entry : tmp_batch.entries) {
batch_f32.entries.emplace_back(std::move(entry));
}
// for llava-uhd style, we need to handle grid too
// we don't care about overwriting these values for now because llava-uhd doesn't support batching anyway
batch_f32.grid_x = tmp_batch.grid_x;
batch_f32.grid_y = tmp_batch.grid_y;
}
// Annotate llava-next style tiles so clip_n_output_tokens accounts
@@ -896,11 +960,14 @@ struct mtmd_tokenizer {
|| ctx->slice_tmpl == MTMD_SLICE_TMPL_STEP3VL
|| (ctx->slice_tmpl == MTMD_SLICE_TMPL_LFM2 && has_tiling_grid)
) {
// [QWEN_VIDEO] we do not support "frame merging" for llava-uhd style, so no batching for now
GGML_ASSERT(bitmaps.size() == 1);
const int n_col = batch_f32.grid_x;
const int n_row = batch_f32.grid_y;
// split batch into chunks of single images
// NOTE: batch_f32 will be invalidated after this call
auto chunks = split_batch_to_chunk(std::move(batch_f32), bitmap->id);
auto chunks = split_batch_to_chunk(std::move(batch_f32), bitmaps[0]->id);
GGML_ASSERT(chunks.size() > 0);
auto ov_chunk = std::move(chunks.front());
@@ -954,6 +1021,10 @@ struct mtmd_tokenizer {
size_t n_tokens = 0;
for (const auto & e : batch_f32.entries) {
n_tokens += clip_n_output_tokens(ctx->ctx_v, e.get());
if (clip_model_n_batch_max(ctx->ctx_v) == 2) {
// [QWEN_VIDEO] pair input is merged to the same embd, so only count as one image
break;
}
}
mtmd_image_tokens_ptr image_tokens(new mtmd_image_tokens);
@@ -976,7 +1047,7 @@ struct mtmd_tokenizer {
GGML_ASSERT(n_tokens == (size_t)image_tokens->n_tokens());
}
image_tokens->batch_f32 = std::move(batch_f32);
image_tokens->id = bitmap->id; // optional
image_tokens->id = bitmaps[0]->id; // optional
LOG_DBG("image_tokens->nx = %d\n", image_tokens->nx);
LOG_DBG("image_tokens->ny = %d\n", image_tokens->ny);
@@ -1001,6 +1072,9 @@ struct mtmd_tokenizer {
} else {
// handle audio
GGML_ASSERT(bitmaps.size() == 1); // no batching support for now
auto & bitmap = bitmaps[0];
if (!ctx->ctx_a) {
LOG_ERR("%s: error: model does not support audio input\n", __func__);
return 2;

View File

@@ -133,6 +133,8 @@ MTMD_API int mtmd_get_audio_sample_rate(const mtmd_context * ctx);
// if bitmap is image:
// length of data must be nx * ny * 3
// the data is in RGBRGBRGB... format
// note: some video-capable models (e.g. qwen-vl) can merge consecutive bitmaps
// into one chunk; mtmd_tokenize() handles this automatically
// if bitmap is audio:
// length of data must be n_samples * sizeof(float)
// the data is in float format (PCM F32)