Compare commits

...

5 Commits

Author SHA1 Message Date
Xuan-Son Nguyen b3fed31b99 jinja, chat: add --reasoning-preserve flag (#25105)
* jinja, chat: add --reasoning-preserve flag

* correct help message
2026-06-28 23:33:51 +02:00
Aleksander Grygier dbdaece23d Revert "ui: fix accessibility for hover-gated interactive elements assisted by claude(in debugging and tests) (#24727)" (#25098) 2026-06-28 21:30:03 +02:00
Pascal 7cb8576e7c ui: fix stop and reasoning skip in single-model mode (#25084) 2026-06-28 21:06:43 +02:00
Ruixiang Wang fa72bc6826 dflash: refactor draft model conversion (#25110)
* dflash: refactor draft model conversion

* apply fix for eagle3 convert
2026-06-28 20:31:48 +02:00
Aldehir Rojas c818263f2a chat : implement minicpm5 parser (#24889)
* Add minicpm5 tool call parser

* Refactor MiniCPM5 PEG parser per review feedback

* Fix jinja min/max API to match Jinja2

* modify by review

* MiniCPM5: use autoparser for XML tool calls and fix grammar preserved-token triggers

* MiniCPM5: fix streaming tool-arg placeholder and remove alt XML markers

* skip min/max attribute tests in -py mode

* test-jinja: use real expected output for min/max attribute tests

* MiniCPM5: revert shared mapper and history fallbacks per review

Drop streaming tool-arg placeholder workarounds from the generic PEG
mapper and restore strict tool-call argument JSON parsing so MiniCPM5
support stays limited to autoparser/diff-analyzer changes.

* chat : refactor minicpm5 back to dedicated parser

* cont : simplify grammar

* cont : refactor

* cont : fixes

* cont : rename template to openbmb-MiniCPM5-1B.jinja

* cont : add message delimiters

* cont : fix tests

---------

Co-authored-by: zhangtao <zhangtao2@modelbest.cn>
Co-authored-by: 张涛 <>
2026-06-28 16:53:32 +02:00
20 changed files with 717 additions and 108 deletions
+14
View File
@@ -3296,6 +3296,20 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.sampling.reasoning_budget_message = value;
}
).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_THINK_BUDGET_MESSAGE"));
add_opt(common_arg(
{"--reasoning-preserve"},
{"--no-reasoning-preserve"},
"preserve reasoning trace in the full history, not just the last assistant message (default: template default)\n"
"compatible with certain templates having 'supports_preserve_reasoning' capability\n"
"example: https://docs.z.ai/guides/capabilities/thinking-mode#preserved-thinking",
[](common_params & params, bool value) {
if (value) {
params.default_template_kwargs["preserve_reasoning"] = "true";
} else {
params.default_template_kwargs["preserve_reasoning"] = "false";
}
}
).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_REASONING_PRESERVE"));
add_opt(common_arg(
{"--chat-template"}, "JINJA_TEMPLATE",
string_format(
+155
View File
@@ -912,6 +912,10 @@ static std::string common_chat_template_direct_apply_impl(
if (inputs.add_generation_prompt) {
inp["add_generation_prompt"] = true;
}
if (inp.contains("preserve_reasoning") && inp["preserve_reasoning"].is_boolean()) {
bool enabled = inp["preserve_reasoning"].get<bool>();
jinja::caps_apply_preserve_reasoning(ctx, enabled);
}
jinja::global_from_json(ctx, inp, inputs.mark_input);
@@ -2376,6 +2380,149 @@ static void func_args_not_string(json & messages) {
}
// MiniCPM5 format:
// - Reasoning: <think>{reasoning}</think> (optional)
// - Tool calls: <function name="foo"><param name="bar">value</param></function>
static common_chat_params common_chat_params_init_minicpm5(const common_chat_template & tmpl,
const autoparser::generation_params & inputs) {
common_chat_params data;
data.prompt = common_chat_template_direct_apply_impl(tmpl, inputs);
data.generation_prompt = common_chat_template_generation_prompt_impl(tmpl, inputs);
data.format = COMMON_CHAT_FORMAT_PEG_NATIVE;
data.supports_thinking = true;
data.preserved_tokens = {
"<function",
"<param",
"</function>",
"</param>",
"<think>",
"</think>",
};
data.thinking_start_tag = "<think>";
data.thinking_end_tag = "</think>";
data.message_delimiters = {
{ COMMON_CHAT_ROLE_ASSISTANT, "<|im_start|>assistant" },
{ COMMON_CHAT_ROLE_TOOL, "<|im_start|>user\n<tool_response>" },
{ COMMON_CHAT_ROLE_USER, "<|im_start|>user" },
{ COMMON_CHAT_ROLE_SYSTEM, "<|im_start|>system" },
};
auto has_tools = inputs.tools.is_array() && !inputs.tools.empty();
auto has_response_format = inputs.json_schema.is_object() && !inputs.json_schema.empty();
auto extract_reasoning = inputs.reasoning_format != COMMON_REASONING_FORMAT_NONE;
auto include_grammar = has_response_format || (has_tools && inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_NONE);
if (inputs.has_continuation()) {
const auto & msg = inputs.continue_msg;
data.generation_prompt = "<|im_start|>assistant\n<think>\n" + msg.reasoning_content;
if (inputs.continue_final_message == COMMON_CHAT_CONTINUATION_CONTENT) {
data.generation_prompt += "\n</think>\n\n" + msg.render_content();
}
data.prompt += data.generation_prompt;
}
auto parser = build_chat_peg_parser([&](common_chat_peg_builder & p) {
auto generation_prompt = p.literal("<|im_start|>assistant\n");
auto reasoning = p.eps();
if (extract_reasoning) {
reasoning = ("<think>" << p.reasoning(p.until("</think>")) << "</think>") + p.space();
}
// Response format parser
if (has_response_format) {
return generation_prompt + reasoning + p.content(p.schema(p.json(), "response-format", inputs.json_schema));
}
if (has_tools && inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_NONE) {
// CDATA lets a value carry characters that would otherwise close the tag (e.g.
// </param>); capture the inner text only, excluding the CDATA markers.
auto string_value = p.choice({
p.literal("<![CDATA[") + p.ac(p.tool_arg_string_value(p.until("]]>")) + p.literal("]]>"), "]]>") + p.tool_arg_close(p.literal("</param>")),
p.negate(p.literal("<![CDATA[")) + p.ac(p.tool_arg_string_value(p.until("</param>")) + p.tool_arg_close(p.literal("</param>")), "</param>")
});
auto tool_choice = p.choice();
foreach_function(inputs.tools, [&](const json & tool) {
const auto & function = tool.at("function");
const std::string name = function.at("name");
auto params = function.contains("parameters") ? function.at("parameters") : json::object();
auto args = p.eps();
if (params.contains("properties") && params.at("properties").is_object() && !params.at("properties").empty()) {
auto schema_info = common_schema_info();
schema_info.resolve_refs(params);
auto arg_choice = p.choice();
for (const auto & [prop_name, prop_schema] : params.at("properties").items()) {
auto value_parser = p.eps();
if (schema_info.resolves_to_string(prop_schema)) {
value_parser = string_value;
} else {
value_parser = p.tool_arg_json_value(
p.schema(p.json(), "tool-" + name + "-arg-" + prop_name + "-schema", prop_schema, false)
) + p.tool_arg_close(p.literal("</param>"));
}
auto arg_rule = p.tool_arg(
p.tool_arg_open(p.literal("<param name=\"") + p.tool_arg_name(p.literal(prop_name)) + p.literal("\">")) +
value_parser
);
arg_choice |= arg_rule;
}
args = p.zero_or_more(arg_choice + p.space());
}
auto tool_parser = p.tool(
p.tool_open(p.literal("<function name=\"") + p.tool_name(p.literal(name)) + p.literal("\">"))
<< p.tool_args(args)
<< p.tool_close(p.literal("</function>")));
tool_choice |= p.rule("tool-" + name, tool_parser);
});
auto max_calls = inputs.parallel_tool_calls ? -1 : 1;
auto tool_calls = p.trigger_rule("tool-call", p.repeat(tool_choice + p.space(), 1, max_calls));
auto content = p.content(p.until("<function"));
return generation_prompt + reasoning + content + tool_calls + p.end();
}
return generation_prompt + reasoning + p.content(p.rest()) + p.end();
});
data.parser = parser.save();
if (include_grammar) {
data.grammar_lazy = !(has_response_format || (has_tools && inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_REQUIRED));
data.grammar = build_grammar([&](const common_grammar_builder & builder) {
foreach_function(inputs.tools, [&](const json & tool) {
const auto & function = tool.at("function");
auto schema = function.contains("parameters") ? function.at("parameters") : json::object();
builder.resolve_refs(schema);
});
if (has_response_format) {
auto schema = inputs.json_schema;
builder.resolve_refs(schema);
}
parser.build_grammar(builder, data.grammar_lazy);
});
data.grammar_triggers = {
{ COMMON_GRAMMAR_TRIGGER_TYPE_WORD, "<function" },
};
}
return data;
}
static json common_chat_extra_context() {
json ctx = json::object();
std::chrono::system_clock::time_point now = std::chrono::system_clock::now();
@@ -2468,6 +2615,14 @@ std::optional<common_chat_params> common_chat_try_specialized_template(
return common_chat_params_init_gemma4(tmpl, params);
}
// MiniCPM5 - XML tool calls with <function name="..."><param name="...">...</param></function>
if (src.find("Tool usage guidelines:") != std::string::npos &&
src.find("<function name=\"") != std::string::npos &&
src.find("<param name=\"") != std::string::npos) {
LOG_DBG("Using specialized template: MiniCPM5\n");
return common_chat_params_init_minicpm5(tmpl, params);
}
return std::nullopt;
}
+44 -23
View File
@@ -16,22 +16,34 @@ using json = nlohmann::ordered_json;
namespace jinja {
using caps_json_fn = std::function<json()>;
using caps_analyze_fn = std::function<void(bool, value &, value &)>;
using caps_ctx_fn = std::function<void(context &)>;
using caps_analyze_fn = std::function<void(bool, value &, value &, const std::string &)>;
void caps_apply_preserve_reasoning(jinja::context & ctx, bool enabled) {
ctx.set_val("preserve_thinking", mk_val<value_bool>(enabled));
ctx.set_val("clear_thinking", mk_val<value_bool>(!enabled));
ctx.set_val("truncate_history_thinking", mk_val<value_bool>(!enabled));
}
static void caps_try_execute(jinja::program & prog,
const caps_json_fn & messages_fn,
const caps_ctx_fn & ctx_fn,
const caps_json_fn & tools_fn,
const caps_analyze_fn & analyze_fn) {
context ctx;
ctx.is_get_stats = true;
jinja::global_from_json(ctx, json{
{"messages", messages_fn()},
{"tools", tools_fn()},
{"tools", tools_fn ? tools_fn() : json::array()},
{"bos_token", ""},
{"eos_token", ""},
{"add_generation_prompt", true}
}, true);
if (ctx_fn) {
ctx_fn(ctx);
}
auto messages = ctx.get_val("messages");
auto tools = ctx.get_val("tools");
@@ -49,7 +61,7 @@ static void caps_try_execute(jinja::program & prog,
// ignore exceptions during capability analysis
}
analyze_fn(success, messages, tools);
analyze_fn(success, messages, tools, result);
}
// for debugging only
@@ -109,11 +121,9 @@ caps caps_get(jinja::program & prog) {
}
});
},
[&]() {
// tools
return json{nullptr};
},
[&](bool success, value & messages, value &) {
nullptr, // ctx_fn
nullptr, // tools_fn
[&](bool success, value & messages, value &, const std::string &) {
auto & content = messages->at(0)->at("content");
caps_print_stats(content, "messages[0].content");
if (has_op(content, "selectattr") || has_op(content, "array_access")) {
@@ -145,11 +155,9 @@ caps caps_get(jinja::program & prog) {
},
});
},
[&]() {
// tools
return json::array();
},
[&](bool, value & messages, value &) {
nullptr, // ctx_fn
nullptr, // tools_fn
[&](bool, value & messages, value &, const std::string &) {
auto & content = messages->at(0)->at("content");
caps_print_stats(content, "messages[0].content");
if (!content->stats.used) {
@@ -201,6 +209,7 @@ caps caps_get(jinja::program & prog) {
},
});
},
nullptr, // ctx_fn
[&]() {
// tools
return json::array({
@@ -224,7 +233,7 @@ caps caps_get(jinja::program & prog) {
},
});
},
[&](bool success, value & messages, value & tools) {
[&](bool success, value & messages, value & tools, const std::string &) {
if (!success) {
return; // Nothing can be inferred
}
@@ -293,6 +302,7 @@ caps caps_get(jinja::program & prog) {
},
});
},
nullptr, // ctx_fn
[&]() {
// tools
return json::array({
@@ -316,7 +326,7 @@ caps caps_get(jinja::program & prog) {
},
});
},
[&](bool success, value & messages, value & tools) {
[&](bool success, value & messages, value & tools, const std::string &) {
if (!success) {
result.supports_tool_calls = false;
result.supports_tools = false;
@@ -394,6 +404,7 @@ caps caps_get(jinja::program & prog) {
},
});
},
nullptr, // ctx_fn
[&]() {
// tools
return json::array({
@@ -417,7 +428,7 @@ caps caps_get(jinja::program & prog) {
},
});
},
[&](bool success, value & messages, value & /*tools*/) {
[&](bool success, value & messages, value &, const std::string &) {
if (!success) {
result.supports_parallel_tool_calls = false;
return;
@@ -438,11 +449,22 @@ caps caps_get(jinja::program & prog) {
JJ_DEBUG("%s\n", ">>> Running capability check: preserve reasoning");
// case: preserve reasoning content in chat history
const std::string reasoning_placeholder = "<REASONING_CONTENT_PLACEHOLDER>";
caps_try_execute(
prog,
[&]() {
// messages
return json::array({
{
{"role", "user"},
{"content", "User message"}
},
{
{"role", "assistant"},
{"content", "Assistant message"},
// check of reasoning_content deeper in the history, not just the last assistant message
{"reasoning_content", reasoning_placeholder}
},
{
{"role", "user"},
{"content", "User message"}
@@ -458,14 +480,13 @@ caps caps_get(jinja::program & prog) {
},
});
},
[&]() {
// tools
return json::array();
[&](context & ctx) {
caps_apply_preserve_reasoning(ctx, true);
},
[&](bool, value & messages, value &) {
auto & content = messages->at(1)->at("reasoning_content");
caps_print_stats(content, "messages[1].reasoning_content");
if (content->stats.used) {
nullptr, // tools_fn
[&](bool, value &, value &, const std::string & output) {
// note: we cannot use stats here because the reasoning_content may be used for "if" condition test, but not actually outputted in the final result
if (output.find(reasoning_placeholder) != std::string::npos) {
result.supports_preserve_reasoning = true;
}
}
+5 -1
View File
@@ -12,7 +12,9 @@ struct caps {
bool supports_tool_calls = true;
bool supports_system_role = true;
bool supports_parallel_tool_calls = true;
bool supports_preserve_reasoning = false; // support assistant message with reasoning_content
// supports preserve reasoning trace in the full history, not just the last assistant message
bool supports_preserve_reasoning = false;
// one of the 2 content capabilities must be true
bool supports_string_content = true;
@@ -29,4 +31,6 @@ struct caps {
caps caps_get(jinja::program & prog);
void caps_apply_preserve_reasoning(jinja::context & ctx, bool enabled);
} // namespace jinja
+44
View File
@@ -1108,6 +1108,50 @@ const func_builtins & value_array_t::get_builtins() const {
std::reverse(arr.begin(), arr.end());
return is_val<value_tuple>(val) ? mk_val<value_tuple>(std::move(arr)) : mk_val<value_array>(std::move(arr));
}},
{"min", [](const func_args & args) -> value {
args.ensure_count(1, 4);
args.ensure_vals<value_array>();
value val_case = args.get_kwarg_or_pos("case_sensitive", 1);
value attribute = args.get_kwarg_or_pos("attribute", 2);
if (!attribute->is_undefined()) {
throw not_implemented_exception("min: attribute not implemented");
}
// FIXME: min is currently always case sensitive
(void) val_case;
const auto & arr = args.get_pos(0)->as_array();
if (arr.empty()) {
return mk_val<value_undefined>();
}
value result = arr[0];
for (size_t i = 1; i < arr.size(); ++i) {
if (value_compare(arr[i], result, value_compare_op::lt)) {
result = arr[i];
}
}
return result;
}},
{"max", [](const func_args & args) -> value {
args.ensure_count(1, 4);
args.ensure_vals<value_array>();
value val_case = args.get_kwarg_or_pos("case_sensitive", 1);
value attribute = args.get_kwarg_or_pos("attribute", 2);
if (!attribute->is_undefined()) {
throw not_implemented_exception("max: attribute not implemented");
}
// FIXME: max is currently always case sensitive
(void) val_case;
const auto & arr = args.get_pos(0)->as_array();
if (arr.empty()) {
return mk_val<value_undefined>();
}
value result = arr[0];
for (size_t i = 1; i < arr.size(); ++i) {
if (value_compare(arr[i], result, value_compare_op::gt)) {
result = arr[i];
}
}
return result;
}},
{"unique", array_unique_not_implemented},
};
return builtins;
+3 -3
View File
@@ -73,7 +73,7 @@ class LlamaModel(TextModel):
target_num_layers = target_config["num_hidden_layers"]
target_layers = [2, target_num_layers // 2, target_num_layers - 3]
logger.info(f"EAGLE-3: target_layers = {target_layers} (target model has {target_num_layers} layers)")
self.gguf_writer.add_array(f"{self.gguf_writer.arch}.target_layers", target_layers)
self.gguf_writer.add_target_layers(target_layers)
# target_hidden_size: prefer eagle3 config, fallback to target config
if eagle3_raw_config.get("target_hidden_size") is not None:
@@ -83,12 +83,12 @@ class LlamaModel(TextModel):
target_hidden_size = target_config["hidden_size"]
src = "target model config"
logger.info(f"EAGLE-3: target_hidden_size = {target_hidden_size} (from {src})")
self.gguf_writer.add_uint32(f"{self.gguf_writer.arch}.target_hidden_size", target_hidden_size)
self.gguf_writer.add_target_hidden_size(target_hidden_size)
# norm_before_residual (RedHat-style eagle3 specific)
norm_before_residual = eagle3_raw_config.get("norm_before_residual", False)
logger.info(f"EAGLE-3: norm_before_residual = {norm_before_residual}")
self.gguf_writer.add_bool(f"{self.gguf_writer.arch}.norm_before_residual", norm_before_residual)
self.gguf_writer.add_norm_before_residual(norm_before_residual)
def set_vocab(self):
# eagle3: use tokenizer from target model if provided
+10 -14
View File
@@ -643,21 +643,21 @@ class DFlashModel(Qwen3Model):
super().set_vocab()
self.dir_model = original_dir
mask_token_id = self.hparams.get("dflash_config", {}).get("mask_token_id")
if mask_token_id is not None:
self.gguf_writer.add_mask_token_id(mask_token_id)
def set_gguf_parameters(self):
super().set_gguf_parameters()
block_size = self.hparams.get("block_size", 16)
self.gguf_writer.add_uint32(f"{self.gguf_writer.arch}.block_size", block_size)
self.gguf_writer.add_block_size(block_size)
dflash_config = self.hparams.get("dflash_config", {})
target_layer_ids = dflash_config.get("target_layer_ids", [])
if target_layer_ids:
extract_layer_ids = [i + 1 for i in target_layer_ids]
self.gguf_writer.add_array(f"{self.gguf_writer.arch}.target_layers", extract_layer_ids)
mask_token_id = dflash_config.get("mask_token_id", None)
if mask_token_id is not None:
self.gguf_writer.add_mask_token_id(mask_token_id)
self.gguf_writer.add_target_layers(extract_layer_ids)
use_sliding_window = self.hparams.get("use_sliding_window", False)
sliding_window = self.hparams.get("sliding_window")
@@ -667,13 +667,9 @@ class DFlashModel(Qwen3Model):
self.gguf_writer.add_sliding_window(sliding_window)
self.gguf_writer.add_sliding_window_pattern(is_swa)
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
if name == "fc.weight":
yield (name, data_torch)
return
if name == "hidden_norm.weight":
yield (self.format_tensor_name(gguf.MODEL_TENSOR.ENC_OUTPUT_NORM), data_torch)
return
@classmethod
def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None:
name, gen = item
if not name.startswith("model."):
name = "model." + name
yield from super().modify_tensors(data_torch, name, bid)
return super().filter_tensors((name, gen))
+1
View File
@@ -156,6 +156,7 @@ class Keys:
DENSE_FEAT_OUT_SIZE = "{arch}.{dense}_feat_out"
TARGET_LAYERS = "{arch}.target_layers"
TARGET_HIDDEN_SIZE = "{arch}.target_hidden_size"
BLOCK_SIZE = "{arch}.block_size"
NORM_BEFORE_RESIDUAL = "{arch}.norm_before_residual"
class Attention:
+12
View File
@@ -940,6 +940,18 @@ class GGUFWriter:
def add_sliding_window(self, value: int) -> None:
self.add_uint32(Keys.Attention.SLIDING_WINDOW.format(arch=self.arch), value)
def add_block_size(self, value: int) -> None:
self.add_uint32(Keys.LLM.BLOCK_SIZE.format(arch=self.arch), value)
def add_target_layers(self, value: Sequence[int]) -> None:
self.add_array(Keys.LLM.TARGET_LAYERS.format(arch=self.arch), value)
def add_target_hidden_size(self, value: int) -> None:
self.add_uint32(Keys.LLM.TARGET_HIDDEN_SIZE.format(arch=self.arch), value)
def add_norm_before_residual(self, value: bool) -> None:
self.add_bool(Keys.LLM.NORM_BEFORE_RESIDUAL.format(arch=self.arch), value)
def add_attention_scale(self, value: float) -> None:
self.add_float32(Keys.Attention.SCALE.format(arch=self.arch), value)
+5
View File
@@ -1283,6 +1283,11 @@ class TensorNameMap:
MODEL_TENSOR.ENC_OUTPUT_NORM: (
"encoder.final_layer_norm", # t5
"layer_norm", # neobert
"model.hidden_norm", # dflash
),
MODEL_TENSOR.FC: (
"model.fc", # dflash
),
MODEL_TENSOR.CLS: (
+179
View File
@@ -0,0 +1,179 @@
{{- bos_token }}{%- if tools %}
{%- set tool_definitions %}
{{- "# Tools\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
{%- for tool in tools %}
{{- "\n" }}
{{- tool | tojson(ensure_ascii=False) }}
{%- endfor %}
{{- '\n</tools>\n\nTool usage guidelines:\n- You may call zero or more functions. If no function calls are needed, just answer normally and do not include any <function ... </function>.\n- When calling a function, return an XML object within <function ... </function> using:\n<function name="function-name"><param name="param-name">param-value</param></function>\n- param-value may be multi-line. If it contains <, & or newline characters, wrap it in a CDATA block: <param name="param-name"><![CDATA[...multi-line value...]]></param>' }}
{%- endset %}
{{- '<|im_start|>system\n' }}
{%- if messages[0].role == 'system' %}
{%- if '<tool_def_sep>' in messages[0].content %}
{{- messages[0].content.replace('<tool_def_sep>', tool_definitions) }}
{%- else %}
{{- messages[0].content + '\n\n' + tool_definitions }}
{%- endif %}
{%- else %}
{{- tool_definitions.lstrip() }}
{%- endif %}
{{- '<|im_end|>\n' }}
{%- else %}
{%- if messages[0].role == 'system' %}
{{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
{%- endif %}
{%- endif %}
{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
{%- for message in messages[::-1] %}
{%- set index = (messages|length - 1) - loop.index0 %}
{%- if ns.multi_step_tool and message.role == "user" and message.content is string and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
{%- set ns.multi_step_tool = false %}
{%- set ns.last_query_index = index %}
{%- endif %}
{%- endfor %}
{%- for message in messages %}
{%- if message.content is string %}
{%- set content = message.content %}
{%- else %}
{%- set content = '' %}
{%- endif %}
{%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
{{- '<|im_start|>' + message.role + '\n' + content + '<|im_end|>' + '\n' }}
{%- elif message.role == "assistant" %}
{%- set reasoning_content = '' %}
{%- if message.reasoning_content is string %}
{%- set reasoning_content = message.reasoning_content %}
{%- else %}
{%- if '</think>' in content %}
{%- set reasoning_content = content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
{%- set content = content.split('</think>')[-1].lstrip('\n') %}
{%- endif %}
{%- endif %}
{%- if message.tool_calls %}
{%- set content_parts = content.split('<tool_sep>') %}
{%- set processed_content = content_parts[0] %}
{%- set tool_calls_count = message.tool_calls|length %}
{%- set tool_sep_count = content_parts|length - 1 %}
{%- set min_count = [tool_calls_count, tool_sep_count]|min %}
{%- for i in range(1, content_parts|length) %}
{%- set tool_index = i - 1 %}
{%- if tool_index < tool_calls_count %}
{%- set tool_call = message.tool_calls[tool_index] %}
{%- if tool_call.function %}
{%- set tool_call = tool_call.function %}
{%- endif %}
{%- set single_tool_xml %}
{{- '<function name="' ~ tool_call.name ~ '">' }}
{%- if tool_call.arguments %}
{%- set args_dict = tool_call.arguments %}
{%- for param_name, param_value in args_dict.items() %}
{{- '<param name="' ~ param_name ~ '">' }}
{%- if param_value is string and ('<' in param_value or '&' in param_value or '\n' in param_value) %}
{{- '<![CDATA[' + param_value + ']]>' }}
{%- else %}
{{- param_value }}
{%- endif %}
{{- '</param>' }}
{%- endfor %}
{%- endif %}
{{- '</function>' }}
{%- endset %}
{%- set processed_content = processed_content + single_tool_xml + content_parts[i] %}
{%- else %}
{%- set processed_content = processed_content + content_parts[i] %}
{%- endif %}
{%- endfor %}
{%- if tool_calls_count > tool_sep_count %}
{%- for remaining_index in range(tool_sep_count, tool_calls_count) %}
{%- set tool_call = message.tool_calls[remaining_index] %}
{%- if tool_call.function %}
{%- set tool_call = tool_call.function %}
{%- endif %}
{%- set remaining_tool_xml %}
{{- '<function name="' ~ tool_call.name ~ '">' }}
{%- if tool_call.arguments %}
{%- set args_dict = tool_call.arguments %}
{%- for param_name, param_value in args_dict.items() %}
{{- '<param name="' ~ param_name ~ '">' }}
{%- if param_value is string and ('<' in param_value or '&' in param_value or '\n' in param_value) %}
{{- '<![CDATA[' + param_value + ']]>' }}
{%- else %}
{{- param_value }}
{%- endif %}
{{- '</param>' }}
{%- endfor %}
{%- endif %}
{{- '</function>' }}
{%- endset %}
{%- set processed_content = processed_content + remaining_tool_xml %}
{%- endfor %}
{%- endif %}
{%- set content = processed_content %}
{%- endif %}
{%- if loop.index0 > ns.last_query_index %}
{%- if reasoning_content %}
{{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
{%- else %}
{{- '<|im_start|>' + message.role + '\n' + content }}
{%- endif %}
{%- else %}
{{- '<|im_start|>' + message.role + '\n' + content }}
{%- endif %}
{%- if message.tool_calls and not has_tool_sep %}
{%- for tool_call in message.tool_calls %}
{%- if (loop.first and content) or (not loop.first) %}
{{- '\n' }}
{%- endif %}
{%- if tool_call.function %}
{%- set tool_call = tool_call.function %}
{%- endif %}
{{- '<function name="' ~ tool_call.name ~ '">' }}
{%- if tool_call.arguments %}
{%- set args_dict = tool_call.arguments %}
{%- for param_name, param_value in args_dict.items() %}
{{- '<param name="' ~ param_name ~ '">' }}
{%- if param_value is string and ('<' in param_value or '&' in param_value or '\n' in param_value) %}
{{- '<![CDATA[' + param_value + ']]>' }}
{%- else %}
{{- param_value }}
{%- endif %}
{{- '</param>' }}
{%- endfor %}
{%- endif %}
{{- '</function>' }}
{%- endfor %}
{%- endif %}
{{- '<|im_end|>\n' }}
{%- elif message.role == "tool" %}
{%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
{{- '<|im_start|>user' }}
{%- endif %}
{{- '\n<tool_response>\n' }}
{%- if message.content is string %}
{{- content }}
{%- else %}
{{- message.content | tojson(ensure_ascii=False) }}
{%- endif %}
{{- '\n</tool_response>' }}
{%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
{{- '<|im_end|>\n' }}
{%- endif %}
{%- endif %}
{%- endfor %}
{%- if add_generation_prompt %}
{{- '<|im_start|>assistant\n' }}
{%- if enable_thinking is defined %}
{%- if enable_thinking is false %}
{{- '<think>\n\n</think>\n\n' }}
{%- elif enable_thinking is true %}
{{- '<think>\n' }}
{%- endif %}
{%- endif %}
{%- endif %}
+78
View File
@@ -5593,6 +5593,77 @@ static void test_template_output_peg_parsers(bool detailed_debug) {
.expect_content("Hello, world!\nWhat's up?")
.run();
}
// MiniCPM5 - XML tool calls with <function name="..."><param name="...">...</param></function>
{
auto tst = peg_tester("models/templates/openbmb-MiniCPM5-1B.jinja", detailed_debug);
tst.test("Hello, world!\nWhat's up?")
.enable_thinking(false)
.reasoning_format(COMMON_REASONING_FORMAT_AUTO)
.expect(message_assist)
.run();
tst.test(R"(<function name="python"><param name="code">print('Hello, World!')</param></function>)")
.enable_thinking(false)
.reasoning_format(COMMON_REASONING_FORMAT_AUTO)
.tools({ python_tool })
.expect_tool_calls({ { "python", R"#({"code": "print('Hello, World!')"})#", {} } })
.run();
tst.test(R"(<function name="empty_args"></function>)")
.enable_thinking(false)
.reasoning_format(COMMON_REASONING_FORMAT_AUTO)
.tools({ empty_args_tool })
.expect(simple_assist_msg("", "", "empty_args", "{}"))
.run();
tst.test(R"(<function name="python"><param name="code">print('x')</param></function>)")
.enable_thinking(false)
.reasoning_format(COMMON_REASONING_FORMAT_AUTO)
.parallel_tool_calls(true)
.tools({ python_tool })
.expect_tool_calls({ { "python", R"#({"code": "print('x')"})#", {} } })
.run();
// CDATA lets a string value carry characters that would otherwise close the tag.
tst.test(R"(<function name="html"><param name="markup"><![CDATA[<a href="/x">hi</a> </param>]]></param></function>)")
.enable_thinking(false)
.reasoning_format(COMMON_REASONING_FORMAT_AUTO)
.tools({ html_tool })
.expect_tool_calls({ { "html", R"#({"markup": "<a href=\"/x\">hi</a> </param>"})#", {} } })
.run();
tst.test(R"(I'm thinking</think><function name="python"><param name="code">print('hey')</param></function>)")
.enable_thinking(true)
.reasoning_format(COMMON_REASONING_FORMAT_AUTO)
.tools({ python_tool })
.expect_reasoning("I'm thinking")
.expect_tool_calls({ { "python", R"#({"code": "print('hey')"})#", {} } })
.run();
tst.test(R"(<function name="python"><param name="code">print('x')</param></function>
<function name="python"><param name="code">print('y')</param></function>)")
.enable_thinking(false)
.reasoning_format(COMMON_REASONING_FORMAT_AUTO)
.parallel_tool_calls(true)
.tools({ python_tool })
.expect_tool_calls({
{ "python", R"#({"code": "print('x')"})#", {} },
{ "python", R"#({"code": "print('y')"})#", {} },
})
.run();
tst.test(" thinking</think>Hello, world!\nWhat's up?")
.enable_thinking(true)
.reasoning_format(COMMON_REASONING_FORMAT_AUTO)
.messages({ message_user, message_assist_prefill_reasoning })
.add_generation_prompt(false)
.continue_final_message(COMMON_CHAT_CONTINUATION_REASONING)
.expect_reasoning("I'm thinking")
.expect_content("Hello, world!\nWhat's up?")
.run();
}
}
static void test_template_generation_prompt() {
@@ -5740,6 +5811,13 @@ static void test_template_generation_prompt() {
check(tmpls, continuation_content(), "<Assistant><think>I'm thinking</think>Hello, ");
check(tmpls, continuation_reasoning(), "<Assistant><think>I'm");
}
{
auto tmpls = read_templates("models/templates/openbmb-MiniCPM5-1B.jinja");
check(tmpls, basic(), "<|im_start|>assistant\n<think>\n");
check(tmpls, continuation_content(), "<|im_start|>assistant\n<think>\nI'm thinking\n</think>\n\nHello, ");
check(tmpls, continuation_reasoning(), "<|im_start|>assistant\n<think>\nI'm");
}
}
// Test the developer role to system workaround with a simple mock template
+30
View File
@@ -1584,6 +1584,36 @@ static void test_array_methods(testing & t) {
"6"
);
test_template(t, "array|min",
"{{ [tool_calls_count, tool_sep_count]|min }}",
{{"tool_calls_count", 2}, {"tool_sep_count", 1}},
"1"
);
test_template(t, "array|max",
"{{ [tool_calls_count, tool_sep_count]|max }}",
{{"tool_calls_count", 2}, {"tool_sep_count", 1}},
"2"
);
test_template(t, "array|min attribute",
"{{ items|min(attribute='x') }}",
{{"items", json::array({
json({{"x", 2}}),
json({{"x", 1}}),
})}},
"{'x': 1}"
);
test_template(t, "array|max attribute",
"{{ items|max(attribute='x') }}",
{{"items", json::array({
json({{"x", 2}}),
json({{"x", 1}}),
})}},
"{'x': 2}"
);
// not used by any chat templates
// test_template(t, "array.insert()",
// "{% set _ = arr.insert(1, 'x') %}{{ arr|join(',') }}",
+15
View File
@@ -1538,6 +1538,19 @@ private:
/* media_path */ params_base.media_path,
/* force_pure_content */ params_base.force_pure_content_parser
};
{
auto caps = common_chat_templates_get_caps(chat_params.tmpls.get());
auto it = params_base.default_template_kwargs.find("preserve_reasoning");
bool supported = caps.at("supports_preserve_reasoning");
bool enabled = it != params_base.default_template_kwargs.end();
if (supported && !enabled) {
SRV_INF("%s", "chat template supports preserving reasoning, consider enabling it via --reasoning-preserve\n");
}
if (!supported && enabled) {
SRV_WRN("%s", "chat template does NOT support preserving reasoning, --reasoning-preserve has no effect\n");
}
}
}
return true;
@@ -2450,6 +2463,8 @@ private:
server_slot * slot = get_slot_by_cmpl_id(task.params.control_cmpl_id);
if (slot == nullptr) {
SRV_WRN("control %s on unknown completion id=%s, no live slot\n",
task.params.control_action.c_str(), task.params.control_cmpl_id.c_str());
res->success = false;
res->message = "no active completion for this id";
queue_results.send(std::move(res));
+4 -1
View File
@@ -1983,7 +1983,10 @@ void server_models_routes::init_routes() {
cli.set_read_timeout(0, STREAM_LOOKUP_TIMEOUT_MS * 1000);
cli.set_write_timeout(0, STREAM_LOOKUP_TIMEOUT_MS * 1000);
auto resp = cli.Delete(child_path.c_str());
(void) resp; // best effort, 404 and network errors are equivalent to no op
(void) resp; // the child logs its own miss when the session is unknown there
} else {
SRV_WRN("router stop for unknown conv_id=%s, no owning child in the conv map\n",
conv_id.c_str());
}
// drop the tracking entry, the session is being torn down
models.conv_models.forget(conv_id);
+7
View File
@@ -218,6 +218,13 @@ void stream_session_manager::evict_and_cancel(const std::string & conversation_i
std::unique_lock<std::shared_mutex> lock(map_mu);
auto it = sessions.find(conversation_id);
if (it == sessions.end()) {
std::string live;
for (const auto & kv : sessions) {
if (!live.empty()) live += ", ";
live += kv.first;
}
SRV_WRN("stop on unknown stream session, conv_id=%s matched nothing, %zu live: [%s]\n",
conversation_id.c_str(), sessions.size(), live.c_str());
return;
}
s = it->second;
@@ -33,7 +33,7 @@
{#if !readonly && onRemove}
<div
class="absolute top-10 right-2 flex items-center justify-center opacity-0 transition-opacity group-focus-within:opacity-100 group-hover:opacity-100"
class="absolute top-10 right-2 flex items-center justify-center opacity-0 transition-opacity group-hover:opacity-100"
>
<ActionIcon icon={X} tooltip="Remove" stopPropagationOnClick onclick={() => onRemove?.()} />
</div>
@@ -56,7 +56,7 @@
<div class="relative flex h-6 items-center justify-between">
<div class="right-0 flex items-center gap-2 opacity-100 transition-opacity">
<div
class="pointer-events-auto inset-0 flex items-center gap-1 opacity-0 transition-all duration-150 group-focus-within:opacity-100 group-hover:opacity-100"
class="pointer-events-auto inset-0 flex items-center gap-1 opacity-0 transition-all duration-150 group-hover:opacity-100"
>
<ActionIcon icon={Edit} tooltip="Edit" onclick={editCtx.handleEdit} />
<ActionIcon icon={Trash2} tooltip="Delete" onclick={onDelete} />
@@ -39,6 +39,7 @@
depth = 0
}: Props = $props();
let renderActionsDropdown = $state(false);
let dropdownOpen = $state(false);
let isLoading = $derived(getAllLoadingChats().includes(conversation.id));
@@ -70,10 +71,26 @@
}
}
function handleMouseLeave() {
if (!dropdownOpen) {
renderActionsDropdown = false;
}
}
function handleMouseOver() {
renderActionsDropdown = true;
}
function handleSelect() {
onSelect?.(conversation.id);
}
$effect(() => {
if (!dropdownOpen) {
renderActionsDropdown = false;
}
});
onMount(() => {
document.addEventListener('edit-active-conversation', handleGlobalEditEvent as EventListener);
@@ -86,19 +103,23 @@
});
</script>
<div
class="conversation-item group relative flex min-h-9 w-full items-center justify-between space-x-3 rounded-lg py-1.5 transition-colors hover:bg-foreground/10 {isActive
<!-- svelte-ignore a11y_mouse_events_have_key_events -->
<button
class="group flex min-h-9 w-full cursor-pointer items-center justify-between space-x-3 rounded-lg py-1.5 text-left transition-colors hover:bg-foreground/10 {isActive
? 'bg-foreground/5 text-accent-foreground'
: ''} px-3"
onclick={handleSelect}
onmouseover={handleMouseOver}
onmouseleave={handleMouseLeave}
onfocusin={handleMouseOver}
onfocusout={(e) => {
if (!e.currentTarget.contains(e.relatedTarget as Node | null)) {
handleMouseLeave();
}
}}
>
<button
class="absolute inset-0 z-0 cursor-pointer rounded-lg focus:outline-none focus-visible:ring-2 focus-visible:ring-ring"
onclick={handleSelect}
aria-label={conversation.name}
>
</button>
<div
class="pointer-events-none relative z-10 flex min-w-0 flex-1 items-center gap-2"
class="flex min-w-0 flex-1 items-center gap-2"
style:padding-left="{depth * FORK_TREE_DEPTH_PADDING}px"
>
{#if depth > 0}
@@ -109,7 +130,7 @@
<a
{...props}
href={RouterService.chat(conversation.forkedFromConversationId)}
class="pointer-events-auto flex shrink-0 items-center text-muted-foreground transition-colors hover:text-foreground"
class="flex shrink-0 items-center text-muted-foreground transition-colors hover:text-foreground"
>
<GitBranch class="h-3.5 w-3.5" />
</a>
@@ -125,15 +146,18 @@
{#if isLoading}
<Tooltip.Root>
<Tooltip.Trigger>
<button
class="stop-button pointer-events-auto flex h-4 w-4 shrink-0 cursor-pointer items-center justify-center rounded text-muted-foreground transition-colors hover:text-foreground"
<div
class="stop-button flex h-4 w-4 shrink-0 cursor-pointer items-center justify-center rounded text-muted-foreground transition-colors hover:text-foreground"
onclick={handleStop}
onkeydown={(e) => e.key === 'Enter' && handleStop(e)}
role="button"
tabindex="0"
aria-label="Stop generation"
>
<Loader2 class="loading-icon h-3.5 w-3.5 animate-spin" />
<Square class="stop-icon hidden h-3 w-3 fill-current text-destructive" />
</button>
</div>
</Tooltip.Trigger>
<Tooltip.Content>
@@ -145,50 +169,52 @@
<TruncatedText text={conversation.name} class="text-sm font-medium" showTooltip={false} />
</div>
<div class="actions pointer-events-auto relative z-20 flex items-center">
<DropdownMenuActions
triggerIcon={MoreHorizontal}
triggerTooltip="More actions"
bind:open={dropdownOpen}
actions={[
{
icon: conversation.pinned ? PinOff : Pin,
label: conversation.pinned ? 'Unpin' : 'Pin',
onclick: (e: Event) => {
e.stopPropagation();
handleTogglePin();
}
},
{
icon: Pencil,
label: 'Edit',
onclick: handleEdit,
shortcut: ['shift', 'cmd', 'e']
},
{
icon: Download,
label: 'Export',
onclick: (e: Event) => {
e.stopPropagation();
conversationsStore.downloadConversation(conversation.id);
{#if renderActionsDropdown}
<div class="actions flex items-center">
<DropdownMenuActions
triggerIcon={MoreHorizontal}
triggerTooltip="More actions"
bind:open={dropdownOpen}
actions={[
{
icon: conversation.pinned ? PinOff : Pin,
label: conversation.pinned ? 'Unpin' : 'Pin',
onclick: (e: Event) => {
e.stopPropagation();
handleTogglePin();
}
},
shortcut: ['shift', 'cmd', 's']
},
{
icon: Trash2,
label: 'Delete',
onclick: handleDelete,
variant: 'destructive',
shortcut: ['shift', 'cmd', 'd'],
separator: true
}
]}
/>
</div>
</div>
{
icon: Pencil,
label: 'Edit',
onclick: handleEdit,
shortcut: ['shift', 'cmd', 'e']
},
{
icon: Download,
label: 'Export',
onclick: (e: Event) => {
e.stopPropagation();
conversationsStore.downloadConversation(conversation.id);
},
shortcut: ['shift', 'cmd', 's']
},
{
icon: Trash2,
label: 'Delete',
onclick: handleDelete,
variant: 'destructive',
shortcut: ['shift', 'cmd', 'd'],
separator: true
}
]}
/>
</div>
{/if}
</button>
<style>
.conversation-item {
button {
:global([data-slot='dropdown-menu-trigger']:not([data-state='open'])) {
opacity: 0;
}
@@ -213,8 +239,7 @@
}
}
&:is(:hover) .stop-button,
&:focus-within .stop-button {
&:is(:hover) .stop-button {
:global(.stop-icon) {
display: block;
}
+28 -8
View File
@@ -154,7 +154,13 @@ class ChatStore {
});
if (convId === conversationsStore.activeConversation?.id) this.currentResponse = response;
}
private clearChatStreaming(convId: string): void {
private clearChatStreaming(convId: string, messageId?: string): void {
// session aware: a stale generation must not wipe a newer one's streaming state on the
// same conversation, that would drop the frozen stop identity and stop the wrong session
if (messageId !== undefined) {
const cur = this.chatStreamingStates.get(convId);
if (cur && cur.messageId !== messageId) return;
}
this.chatStreamingStates.delete(convId);
if (convId === conversationsStore.activeConversation?.id) this.currentResponse = '';
}
@@ -1055,11 +1061,14 @@ class ChatStore {
modelOverride?: string | null,
firstUserMessageContent?: string
): Promise<void> {
let effectiveModel = modelOverride;
// the ::model suffix in the stream identity is only for router mode, where it routes to the
// owning child. in single-model mode the identity stays the bare conv id so that attach, stop
// and reattach all agree, regardless of fresh send vs regenerate passing a resolved model
let effectiveModel: string | null | undefined = undefined;
if (isRouterMode() && !effectiveModel) {
if (isRouterMode()) {
const conversationModel = this.getConversationModel(allMessages);
effectiveModel = selectedModelName() || conversationModel;
effectiveModel = modelOverride || selectedModelName() || conversationModel;
}
if (isRouterMode() && effectiveModel) {
@@ -1074,6 +1083,9 @@ class ChatStore {
let resolvedModel: string | null = null;
let modelPersisted = false;
const convId = assistantMessage.convId;
// freeze the POST identity from t0 so a stop cancels with the exact session key,
// never a stale or empty model resolved later
this.setChatStreaming(convId, streamedContent, currentMessageId, effectiveModel);
const recordModel = (modelName: string | null | undefined, persistImmediately = true): void => {
if (!modelName) return;
@@ -1103,7 +1115,7 @@ class ChatStore {
};
const updateStreamingUI = () => {
this.setChatStreaming(convId, streamedContent, currentMessageId);
this.setChatStreaming(convId, streamedContent, currentMessageId, effectiveModel);
const idx = conversationsStore.findMessageIndex(currentMessageId);
conversationsStore.updateMessageAtIndex(idx, { content: streamedContent });
};
@@ -1111,7 +1123,7 @@ class ChatStore {
const cleanupStreamingState = () => {
this.setStreamingActive(false);
this.setChatLoading(convId, false);
this.clearChatStreaming(convId);
this.clearChatStreaming(convId, currentMessageId);
this.setProcessingState(convId, null);
};
@@ -1128,7 +1140,7 @@ class ChatStore {
onReasoningChunk: (chunk: string) => {
streamedReasoningContent += chunk;
// mark streaming state so a stop mid-thinking can persist the partial reasoning
this.setChatStreaming(convId, streamedContent, currentMessageId);
this.setChatStreaming(convId, streamedContent, currentMessageId, effectiveModel);
const idx = conversationsStore.findMessageIndex(currentMessageId);
conversationsStore.updateMessageAtIndex(idx, {
reasoningContent: streamedReasoningContent
@@ -1405,7 +1417,7 @@ class ChatStore {
// detached drain keeps producing tokens until eos or max_tokens. use the frozen identity
// captured when the session started, not the live dropdown
const streamStateForStop = this.chatStreamingStates.get(convId);
const modelForStop = streamStateForStop?.model ?? selectedModelName();
const modelForStop = streamStateForStop?.model;
void ChatService.cancelServerStream(convId, modelForStop);
this.abortRequest(convId);
this.setChatLoading(convId, false);
@@ -1846,6 +1858,14 @@ class ChatStore {
updateStreamingContent(originalContent + appendedContent);
this.setChatReasoning(msg.convId, false);
},
onCompletionId: (id: string) => {
if (!id) return;
// refresh the message id so a later skip targets the live slot after a continue
conversationsStore.updateMessageAtIndex(conversationsStore.findMessageIndex(msg.id), {
completionId: id
});
DatabaseService.updateMessage(msg.id, { completionId: id }).catch(() => {});
},
onReasoningChunk: (chunk: string) => {
appendedReasoning += chunk;
hasReceivedContent = true;