jinja, chat: add --reasoning-preserve flag (#25105)

* jinja, chat: add --reasoning-preserve flag
* correct help message
Revert "ui: fix accessibility for hover-gated interactive elements assisted by claude(in debugging and tests) (#24727)" (#25098)
2026-06-29 02:33:03 +02:00 · 2026-06-28 23:33:51 +02:00 · 2026-06-28 21:30:03 +02:00 · 2026-06-28 21:06:43 +02:00 · 2026-06-28 20:31:48 +02:00
16 changed files with 235 additions and 108 deletions
@@ -3296,6 +3296,20 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
            params.sampling.reasoning_budget_message = value;
        }
    ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_THINK_BUDGET_MESSAGE"));
+    add_opt(common_arg(
+        {"--reasoning-preserve"},
+        {"--no-reasoning-preserve"},
+        "preserve reasoning trace in the full history, not just the last assistant message (default: template default)\n"
+        "compatible with certain templates having 'supports_preserve_reasoning' capability\n"
+        "example: https://docs.z.ai/guides/capabilities/thinking-mode#preserved-thinking",
+        [](common_params & params, bool value) {
+            if (value) {
+                params.default_template_kwargs["preserve_reasoning"] = "true";
+            } else {
+                params.default_template_kwargs["preserve_reasoning"] = "false";
+            }
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_REASONING_PRESERVE"));
    add_opt(common_arg(
        {"--chat-template"}, "JINJA_TEMPLATE",
        string_format(
@@ -912,6 +912,10 @@ static std::string common_chat_template_direct_apply_impl(
    if (inputs.add_generation_prompt) {
        inp["add_generation_prompt"] = true;
    }
+    if (inp.contains("preserve_reasoning") && inp["preserve_reasoning"].is_boolean()) {
+        bool enabled = inp["preserve_reasoning"].get<bool>();
+        jinja::caps_apply_preserve_reasoning(ctx, enabled);
+    }

    jinja::global_from_json(ctx, inp, inputs.mark_input);

@@ -16,22 +16,34 @@ using json = nlohmann::ordered_json;
 namespace jinja {

 using caps_json_fn = std::function<json()>;
-using caps_analyze_fn = std::function<void(bool, value &, value &)>;
+using caps_ctx_fn = std::function<void(context &)>;
+using caps_analyze_fn = std::function<void(bool, value &, value &, const std::string &)>;
+
+void caps_apply_preserve_reasoning(jinja::context & ctx, bool enabled) {
+    ctx.set_val("preserve_thinking",         mk_val<value_bool>(enabled));
+    ctx.set_val("clear_thinking",            mk_val<value_bool>(!enabled));
+    ctx.set_val("truncate_history_thinking", mk_val<value_bool>(!enabled));
+}

 static void caps_try_execute(jinja::program & prog,
                             const caps_json_fn & messages_fn,
+                             const caps_ctx_fn & ctx_fn,
                             const caps_json_fn & tools_fn,
                             const caps_analyze_fn & analyze_fn) {
    context ctx;
    ctx.is_get_stats = true;
    jinja::global_from_json(ctx, json{
        {"messages", messages_fn()},
-        {"tools", tools_fn()},
+        {"tools", tools_fn ? tools_fn() : json::array()},
        {"bos_token", ""},
        {"eos_token", ""},
        {"add_generation_prompt", true}
    }, true);

+    if (ctx_fn) {
+        ctx_fn(ctx);
+    }
+
    auto messages = ctx.get_val("messages");
    auto tools = ctx.get_val("tools");

@@ -49,7 +61,7 @@ static void caps_try_execute(jinja::program & prog,
        // ignore exceptions during capability analysis
    }

-    analyze_fn(success, messages, tools);
+    analyze_fn(success, messages, tools, result);
 }

 // for debugging only
@@ -109,11 +121,9 @@ caps caps_get(jinja::program & prog) {
                }
            });
        },
-        [&]() {
-            // tools
-            return json{nullptr};
-        },
-        [&](bool success, value & messages, value &) {
+        nullptr, // ctx_fn
+        nullptr, // tools_fn
+        [&](bool success, value & messages, value &, const std::string &) {
            auto & content = messages->at(0)->at("content");
            caps_print_stats(content, "messages[0].content");
            if (has_op(content, "selectattr") || has_op(content, "array_access")) {
@@ -145,11 +155,9 @@ caps caps_get(jinja::program & prog) {
                },
            });
        },
-        [&]() {
-            // tools
-            return json::array();
-        },
-        [&](bool, value & messages, value &) {
+        nullptr, // ctx_fn
+        nullptr, // tools_fn
+        [&](bool, value & messages, value &, const std::string &) {
            auto & content = messages->at(0)->at("content");
            caps_print_stats(content, "messages[0].content");
            if (!content->stats.used) {
@@ -201,6 +209,7 @@ caps caps_get(jinja::program & prog) {
                },
            });
        },
+        nullptr, // ctx_fn
        [&]() {
            // tools
            return json::array({
@@ -224,7 +233,7 @@ caps caps_get(jinja::program & prog) {
                },
            });
        },
-        [&](bool success, value & messages, value & tools) {
+        [&](bool success, value & messages, value & tools, const std::string &) {
            if (!success) {
                return; // Nothing can be inferred
            }
@@ -293,6 +302,7 @@ caps caps_get(jinja::program & prog) {
                    },
                });
            },
+            nullptr, // ctx_fn
            [&]() {
                // tools
                return json::array({
@@ -316,7 +326,7 @@ caps caps_get(jinja::program & prog) {
                    },
                });
            },
-            [&](bool success, value & messages, value & tools) {
+            [&](bool success, value & messages, value & tools, const std::string &) {
                if (!success) {
                    result.supports_tool_calls = false;
                    result.supports_tools = false;
@@ -394,6 +404,7 @@ caps caps_get(jinja::program & prog) {
                },
            });
        },
+        nullptr, // ctx_fn
        [&]() {
            // tools
            return json::array({
@@ -417,7 +428,7 @@ caps caps_get(jinja::program & prog) {
                },
            });
        },
-        [&](bool success, value & messages, value & /*tools*/) {
+        [&](bool success, value & messages, value &, const std::string &) {
            if (!success) {
                result.supports_parallel_tool_calls = false;
                return;
@@ -438,11 +449,22 @@ caps caps_get(jinja::program & prog) {
    JJ_DEBUG("%s\n", ">>> Running capability check: preserve reasoning");

    // case: preserve reasoning content in chat history
+    const std::string reasoning_placeholder = "<REASONING_CONTENT_PLACEHOLDER>";
    caps_try_execute(
        prog,
        [&]() {
            // messages
            return json::array({
+                {
+                    {"role", "user"},
+                    {"content", "User message"}
+                },
+                {
+                    {"role", "assistant"},
+                    {"content", "Assistant message"},
+                    // check of reasoning_content deeper in the history, not just the last assistant message
+                    {"reasoning_content", reasoning_placeholder}
+                },
                {
                    {"role", "user"},
                    {"content", "User message"}
@@ -458,14 +480,13 @@ caps caps_get(jinja::program & prog) {
                },
            });
        },
-        [&]() {
-            // tools
-            return json::array();
+        [&](context & ctx) {
+            caps_apply_preserve_reasoning(ctx, true);
        },
-        [&](bool, value & messages, value &) {
-            auto & content = messages->at(1)->at("reasoning_content");
-            caps_print_stats(content, "messages[1].reasoning_content");
-            if (content->stats.used) {
+        nullptr, // tools_fn
+        [&](bool, value &, value &, const std::string & output) {
+            // note: we cannot use stats here because the reasoning_content may be used for "if" condition test, but not actually outputted in the final result
+            if (output.find(reasoning_placeholder) != std::string::npos) {
                result.supports_preserve_reasoning = true;
            }
        }
@@ -12,7 +12,9 @@ struct caps {
    bool supports_tool_calls = true;
    bool supports_system_role = true;
    bool supports_parallel_tool_calls = true;
-    bool supports_preserve_reasoning = false; // support assistant message with reasoning_content
+
+    // supports preserve reasoning trace in the full history, not just the last assistant message
+    bool supports_preserve_reasoning = false;

    // one of the 2 content capabilities must be true
    bool supports_string_content = true;
@@ -29,4 +31,6 @@ struct caps {

 caps caps_get(jinja::program & prog);

+void caps_apply_preserve_reasoning(jinja::context & ctx, bool enabled);
+
 } // namespace jinja
@@ -73,7 +73,7 @@ class LlamaModel(TextModel):
            target_num_layers = target_config["num_hidden_layers"]
            target_layers = [2, target_num_layers // 2, target_num_layers - 3]
            logger.info(f"EAGLE-3: target_layers = {target_layers} (target model has {target_num_layers} layers)")
-            self.gguf_writer.add_array(f"{self.gguf_writer.arch}.target_layers", target_layers)
+            self.gguf_writer.add_target_layers(target_layers)

            # target_hidden_size: prefer eagle3 config, fallback to target config
            if eagle3_raw_config.get("target_hidden_size") is not None:
@@ -83,12 +83,12 @@ class LlamaModel(TextModel):
                target_hidden_size = target_config["hidden_size"]
                src = "target model config"
            logger.info(f"EAGLE-3: target_hidden_size = {target_hidden_size} (from {src})")
-            self.gguf_writer.add_uint32(f"{self.gguf_writer.arch}.target_hidden_size", target_hidden_size)
+            self.gguf_writer.add_target_hidden_size(target_hidden_size)

            # norm_before_residual (RedHat-style eagle3 specific)
            norm_before_residual = eagle3_raw_config.get("norm_before_residual", False)
            logger.info(f"EAGLE-3: norm_before_residual = {norm_before_residual}")
-            self.gguf_writer.add_bool(f"{self.gguf_writer.arch}.norm_before_residual", norm_before_residual)
+            self.gguf_writer.add_norm_before_residual(norm_before_residual)

    def set_vocab(self):
        # eagle3: use tokenizer from target model if provided
@@ -643,21 +643,21 @@ class DFlashModel(Qwen3Model):
        super().set_vocab()
        self.dir_model = original_dir

+        mask_token_id = self.hparams.get("dflash_config", {}).get("mask_token_id")
+        if mask_token_id is not None:
+            self.gguf_writer.add_mask_token_id(mask_token_id)
+
    def set_gguf_parameters(self):
        super().set_gguf_parameters()

        block_size = self.hparams.get("block_size", 16)
-        self.gguf_writer.add_uint32(f"{self.gguf_writer.arch}.block_size", block_size)
+        self.gguf_writer.add_block_size(block_size)
        dflash_config = self.hparams.get("dflash_config", {})

        target_layer_ids = dflash_config.get("target_layer_ids", [])
        if target_layer_ids:
            extract_layer_ids = [i + 1 for i in target_layer_ids]
-            self.gguf_writer.add_array(f"{self.gguf_writer.arch}.target_layers", extract_layer_ids)
-
-        mask_token_id = dflash_config.get("mask_token_id", None)
-        if mask_token_id is not None:
-            self.gguf_writer.add_mask_token_id(mask_token_id)
+            self.gguf_writer.add_target_layers(extract_layer_ids)

        use_sliding_window = self.hparams.get("use_sliding_window", False)
        sliding_window = self.hparams.get("sliding_window")
@@ -667,13 +667,9 @@ class DFlashModel(Qwen3Model):
            self.gguf_writer.add_sliding_window(sliding_window)
            self.gguf_writer.add_sliding_window_pattern(is_swa)

-    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
-        if name == "fc.weight":
-            yield (name, data_torch)
-            return
-        if name == "hidden_norm.weight":
-            yield (self.format_tensor_name(gguf.MODEL_TENSOR.ENC_OUTPUT_NORM), data_torch)
-            return
+    @classmethod
+    def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None:
+        name, gen = item
        if not name.startswith("model."):
            name = "model." + name
-        yield from super().modify_tensors(data_torch, name, bid)
+        return super().filter_tensors((name, gen))
@@ -156,6 +156,7 @@ class Keys:
        DENSE_FEAT_OUT_SIZE               = "{arch}.{dense}_feat_out"
        TARGET_LAYERS                     = "{arch}.target_layers"
        TARGET_HIDDEN_SIZE                = "{arch}.target_hidden_size"
+        BLOCK_SIZE                        = "{arch}.block_size"
        NORM_BEFORE_RESIDUAL              = "{arch}.norm_before_residual"

    class Attention:
@@ -940,6 +940,18 @@ class GGUFWriter:
    def add_sliding_window(self, value: int) -> None:
        self.add_uint32(Keys.Attention.SLIDING_WINDOW.format(arch=self.arch), value)

+    def add_block_size(self, value: int) -> None:
+        self.add_uint32(Keys.LLM.BLOCK_SIZE.format(arch=self.arch), value)
+
+    def add_target_layers(self, value: Sequence[int]) -> None:
+        self.add_array(Keys.LLM.TARGET_LAYERS.format(arch=self.arch), value)
+
+    def add_target_hidden_size(self, value: int) -> None:
+        self.add_uint32(Keys.LLM.TARGET_HIDDEN_SIZE.format(arch=self.arch), value)
+
+    def add_norm_before_residual(self, value: bool) -> None:
+        self.add_bool(Keys.LLM.NORM_BEFORE_RESIDUAL.format(arch=self.arch), value)
+
    def add_attention_scale(self, value: float) -> None:
        self.add_float32(Keys.Attention.SCALE.format(arch=self.arch), value)

@@ -1283,6 +1283,11 @@ class TensorNameMap:
        MODEL_TENSOR.ENC_OUTPUT_NORM: (
            "encoder.final_layer_norm", # t5
            "layer_norm",               # neobert
+            "model.hidden_norm",        # dflash
+        ),
+
+        MODEL_TENSOR.FC: (
+            "model.fc", # dflash
        ),

        MODEL_TENSOR.CLS: (
@@ -1538,6 +1538,19 @@ private:
                /* media_path            */ params_base.media_path,
                /* force_pure_content    */ params_base.force_pure_content_parser
            };
+
+            {
+                auto caps = common_chat_templates_get_caps(chat_params.tmpls.get());
+                auto it = params_base.default_template_kwargs.find("preserve_reasoning");
+                bool supported = caps.at("supports_preserve_reasoning");
+                bool enabled = it != params_base.default_template_kwargs.end();
+                if (supported && !enabled) {
+                    SRV_INF("%s", "chat template supports preserving reasoning, consider enabling it via --reasoning-preserve\n");
+                }
+                if (!supported && enabled) {
+                    SRV_WRN("%s", "chat template does NOT support preserving reasoning, --reasoning-preserve has no effect\n");
+                }
+            }
        }

        return true;
@@ -2450,6 +2463,8 @@ private:

                    server_slot * slot = get_slot_by_cmpl_id(task.params.control_cmpl_id);
                    if (slot == nullptr) {
+                        SRV_WRN("control %s on unknown completion id=%s, no live slot\n",
+                                task.params.control_action.c_str(), task.params.control_cmpl_id.c_str());
                        res->success = false;
                        res->message = "no active completion for this id";
                        queue_results.send(std::move(res));
@@ -1983,7 +1983,10 @@ void server_models_routes::init_routes() {
            cli.set_read_timeout(0, STREAM_LOOKUP_TIMEOUT_MS * 1000);
            cli.set_write_timeout(0, STREAM_LOOKUP_TIMEOUT_MS * 1000);
            auto resp = cli.Delete(child_path.c_str());
-            (void) resp; // best effort, 404 and network errors are equivalent to no op
+            (void) resp; // the child logs its own miss when the session is unknown there
+        } else {
+            SRV_WRN("router stop for unknown conv_id=%s, no owning child in the conv map\n",
+                    conv_id.c_str());
        }
        // drop the tracking entry, the session is being torn down
        models.conv_models.forget(conv_id);
@@ -218,6 +218,13 @@ void stream_session_manager::evict_and_cancel(const std::string & conversation_i
        std::unique_lock<std::shared_mutex> lock(map_mu);
        auto it = sessions.find(conversation_id);
        if (it == sessions.end()) {
+            std::string live;
+            for (const auto & kv : sessions) {
+                if (!live.empty()) live += ", ";
+                live += kv.first;
+            }
+            SRV_WRN("stop on unknown stream session, conv_id=%s matched nothing, %zu live: [%s]\n",
+                    conversation_id.c_str(), sessions.size(), live.c_str());
            return;
        }
        s = it->second;
@@ -33,7 +33,7 @@

 	{#if !readonly && onRemove}
 		<div
-			class="absolute top-10 right-2 flex items-center justify-center opacity-0 transition-opacity group-focus-within:opacity-100 group-hover:opacity-100"
+			class="absolute top-10 right-2 flex items-center justify-center opacity-0 transition-opacity group-hover:opacity-100"
 		>
 			<ActionIcon icon={X} tooltip="Remove" stopPropagationOnClick onclick={() => onRemove?.()} />
 		</div>
@@ -56,7 +56,7 @@
 			<div class="relative flex h-6 items-center justify-between">
 				<div class="right-0 flex items-center gap-2 opacity-100 transition-opacity">
 					<div
-						class="pointer-events-auto inset-0 flex items-center gap-1 opacity-0 transition-all duration-150 group-focus-within:opacity-100 group-hover:opacity-100"
+						class="pointer-events-auto inset-0 flex items-center gap-1 opacity-0 transition-all duration-150 group-hover:opacity-100"
 					>
 						<ActionIcon icon={Edit} tooltip="Edit" onclick={editCtx.handleEdit} />
 						<ActionIcon icon={Trash2} tooltip="Delete" onclick={onDelete} />
@@ -39,6 +39,7 @@
 		depth = 0
 	}: Props = $props();

+	let renderActionsDropdown = $state(false);
 	let dropdownOpen = $state(false);

 	let isLoading = $derived(getAllLoadingChats().includes(conversation.id));
@@ -70,10 +71,26 @@
 		}
 	}

+	function handleMouseLeave() {
+		if (!dropdownOpen) {
+			renderActionsDropdown = false;
+		}
+	}
+
+	function handleMouseOver() {
+		renderActionsDropdown = true;
+	}
+
 	function handleSelect() {
 		onSelect?.(conversation.id);
 	}

+	$effect(() => {
+		if (!dropdownOpen) {
+			renderActionsDropdown = false;
+		}
+	});
+
 	onMount(() => {
 		document.addEventListener('edit-active-conversation', handleGlobalEditEvent as EventListener);

@@ -86,19 +103,23 @@
 	});
 </script>

-<div
-	class="conversation-item group relative flex min-h-9 w-full items-center justify-between space-x-3 rounded-lg py-1.5 transition-colors hover:bg-foreground/10 {isActive
+<!-- svelte-ignore a11y_mouse_events_have_key_events -->
+<button
+	class="group flex min-h-9 w-full cursor-pointer items-center justify-between space-x-3 rounded-lg py-1.5 text-left transition-colors hover:bg-foreground/10 {isActive
 		? 'bg-foreground/5 text-accent-foreground'
 		: ''} px-3"
+	onclick={handleSelect}
+	onmouseover={handleMouseOver}
+	onmouseleave={handleMouseLeave}
+	onfocusin={handleMouseOver}
+	onfocusout={(e) => {
+		if (!e.currentTarget.contains(e.relatedTarget as Node | null)) {
+			handleMouseLeave();
+		}
+	}}
 >
-	<button
-		class="absolute inset-0 z-0 cursor-pointer rounded-lg focus:outline-none focus-visible:ring-2 focus-visible:ring-ring"
-		onclick={handleSelect}
-		aria-label={conversation.name}
-	>
-	</button>
 	<div
-		class="pointer-events-none relative z-10 flex min-w-0 flex-1 items-center gap-2"
+		class="flex min-w-0 flex-1 items-center gap-2"
 		style:padding-left="{depth * FORK_TREE_DEPTH_PADDING}px"
 	>
 		{#if depth > 0}
@@ -109,7 +130,7 @@
 						<a
 							{...props}
 							href={RouterService.chat(conversation.forkedFromConversationId)}
-							class="pointer-events-auto flex shrink-0 items-center text-muted-foreground transition-colors hover:text-foreground"
+							class="flex shrink-0 items-center text-muted-foreground transition-colors hover:text-foreground"
 						>
 							<GitBranch class="h-3.5 w-3.5" />
 						</a>
@@ -125,15 +146,18 @@
 		{#if isLoading}
 			<Tooltip.Root>
 				<Tooltip.Trigger>
-					<button
-						class="stop-button pointer-events-auto flex h-4 w-4 shrink-0 cursor-pointer items-center justify-center rounded text-muted-foreground transition-colors hover:text-foreground"
+					<div
+						class="stop-button flex h-4 w-4 shrink-0 cursor-pointer items-center justify-center rounded text-muted-foreground transition-colors hover:text-foreground"
 						onclick={handleStop}
+						onkeydown={(e) => e.key === 'Enter' && handleStop(e)}
+						role="button"
+						tabindex="0"
 						aria-label="Stop generation"
 					>
 						<Loader2 class="loading-icon h-3.5 w-3.5 animate-spin" />

 						<Square class="stop-icon hidden h-3 w-3 fill-current text-destructive" />
-					</button>
+					</div>
 				</Tooltip.Trigger>

 				<Tooltip.Content>
@@ -145,50 +169,52 @@
 		<TruncatedText text={conversation.name} class="text-sm font-medium" showTooltip={false} />
 	</div>

-	<div class="actions pointer-events-auto relative z-20 flex items-center">
-		<DropdownMenuActions
-			triggerIcon={MoreHorizontal}
-			triggerTooltip="More actions"
-			bind:open={dropdownOpen}
-			actions={[
-				{
-					icon: conversation.pinned ? PinOff : Pin,
-					label: conversation.pinned ? 'Unpin' : 'Pin',
-					onclick: (e: Event) => {
-						e.stopPropagation();
-						handleTogglePin();
-					}
-				},
-				{
-					icon: Pencil,
-					label: 'Edit',
-					onclick: handleEdit,
-					shortcut: ['shift', 'cmd', 'e']
-				},
-				{
-					icon: Download,
-					label: 'Export',
-					onclick: (e: Event) => {
-						e.stopPropagation();
-						conversationsStore.downloadConversation(conversation.id);
+	{#if renderActionsDropdown}
+		<div class="actions flex items-center">
+			<DropdownMenuActions
+				triggerIcon={MoreHorizontal}
+				triggerTooltip="More actions"
+				bind:open={dropdownOpen}
+				actions={[
+					{
+						icon: conversation.pinned ? PinOff : Pin,
+						label: conversation.pinned ? 'Unpin' : 'Pin',
+						onclick: (e: Event) => {
+							e.stopPropagation();
+							handleTogglePin();
+						}
 					},
-					shortcut: ['shift', 'cmd', 's']
-				},
-				{
-					icon: Trash2,
-					label: 'Delete',
-					onclick: handleDelete,
-					variant: 'destructive',
-					shortcut: ['shift', 'cmd', 'd'],
-					separator: true
-				}
-			]}
-		/>
-	</div>
-</div>
+					{
+						icon: Pencil,
+						label: 'Edit',
+						onclick: handleEdit,
+						shortcut: ['shift', 'cmd', 'e']
+					},
+					{
+						icon: Download,
+						label: 'Export',
+						onclick: (e: Event) => {
+							e.stopPropagation();
+							conversationsStore.downloadConversation(conversation.id);
+						},
+						shortcut: ['shift', 'cmd', 's']
+					},
+					{
+						icon: Trash2,
+						label: 'Delete',
+						onclick: handleDelete,
+						variant: 'destructive',
+						shortcut: ['shift', 'cmd', 'd'],
+						separator: true
+					}
+				]}
+			/>
+		</div>
+	{/if}
+</button>

 <style>
-	.conversation-item {
+	button {
 		:global([data-slot='dropdown-menu-trigger']:not([data-state='open'])) {
 			opacity: 0;
 		}
@@ -213,8 +239,7 @@
 			}
 		}

-		&:is(:hover) .stop-button,
-		&:focus-within .stop-button {
+		&:is(:hover) .stop-button {
 			:global(.stop-icon) {
 				display: block;
 			}
@@ -154,7 +154,13 @@ class ChatStore {
 		});
 		if (convId === conversationsStore.activeConversation?.id) this.currentResponse = response;
 	}
-	private clearChatStreaming(convId: string): void {
+	private clearChatStreaming(convId: string, messageId?: string): void {
+		// session aware: a stale generation must not wipe a newer one's streaming state on the
+		// same conversation, that would drop the frozen stop identity and stop the wrong session
+		if (messageId !== undefined) {
+			const cur = this.chatStreamingStates.get(convId);
+			if (cur && cur.messageId !== messageId) return;
+		}
 		this.chatStreamingStates.delete(convId);
 		if (convId === conversationsStore.activeConversation?.id) this.currentResponse = '';
 	}
@@ -1055,11 +1061,14 @@ class ChatStore {
 		modelOverride?: string | null,
 		firstUserMessageContent?: string
 	): Promise<void> {
-		let effectiveModel = modelOverride;
+		// the ::model suffix in the stream identity is only for router mode, where it routes to the
+		// owning child. in single-model mode the identity stays the bare conv id so that attach, stop
+		// and reattach all agree, regardless of fresh send vs regenerate passing a resolved model
+		let effectiveModel: string | null | undefined = undefined;

-		if (isRouterMode() && !effectiveModel) {
+		if (isRouterMode()) {
 			const conversationModel = this.getConversationModel(allMessages);
-			effectiveModel = selectedModelName() || conversationModel;
+			effectiveModel = modelOverride || selectedModelName() || conversationModel;
 		}

 		if (isRouterMode() && effectiveModel) {
@@ -1074,6 +1083,9 @@ class ChatStore {
 		let resolvedModel: string | null = null;
 		let modelPersisted = false;
 		const convId = assistantMessage.convId;
+		// freeze the POST identity from t0 so a stop cancels with the exact session key,
+		// never a stale or empty model resolved later
+		this.setChatStreaming(convId, streamedContent, currentMessageId, effectiveModel);

 		const recordModel = (modelName: string | null | undefined, persistImmediately = true): void => {
 			if (!modelName) return;
@@ -1103,7 +1115,7 @@ class ChatStore {
 		};

 		const updateStreamingUI = () => {
-			this.setChatStreaming(convId, streamedContent, currentMessageId);
+			this.setChatStreaming(convId, streamedContent, currentMessageId, effectiveModel);
 			const idx = conversationsStore.findMessageIndex(currentMessageId);
 			conversationsStore.updateMessageAtIndex(idx, { content: streamedContent });
 		};
@@ -1111,7 +1123,7 @@ class ChatStore {
 		const cleanupStreamingState = () => {
 			this.setStreamingActive(false);
 			this.setChatLoading(convId, false);
-			this.clearChatStreaming(convId);
+			this.clearChatStreaming(convId, currentMessageId);
 			this.setProcessingState(convId, null);
 		};

@@ -1128,7 +1140,7 @@ class ChatStore {
 			onReasoningChunk: (chunk: string) => {
 				streamedReasoningContent += chunk;
 				// mark streaming state so a stop mid-thinking can persist the partial reasoning
-				this.setChatStreaming(convId, streamedContent, currentMessageId);
+				this.setChatStreaming(convId, streamedContent, currentMessageId, effectiveModel);
 				const idx = conversationsStore.findMessageIndex(currentMessageId);
 				conversationsStore.updateMessageAtIndex(idx, {
 					reasoningContent: streamedReasoningContent
@@ -1405,7 +1417,7 @@ class ChatStore {
 		// detached drain keeps producing tokens until eos or max_tokens. use the frozen identity
 		// captured when the session started, not the live dropdown
 		const streamStateForStop = this.chatStreamingStates.get(convId);
-		const modelForStop = streamStateForStop?.model ?? selectedModelName();
+		const modelForStop = streamStateForStop?.model;
 		void ChatService.cancelServerStream(convId, modelForStop);
 		this.abortRequest(convId);
 		this.setChatLoading(convId, false);
@@ -1846,6 +1858,14 @@ class ChatStore {
 						updateStreamingContent(originalContent + appendedContent);
 						this.setChatReasoning(msg.convId, false);
 					},
+					onCompletionId: (id: string) => {
+						if (!id) return;
+						// refresh the message id so a later skip targets the live slot after a continue
+						conversationsStore.updateMessageAtIndex(conversationsStore.findMessageIndex(msg.id), {
+							completionId: id
+						});
+						DatabaseService.updateMessage(msg.id, { completionId: id }).catch(() => {});
+					},
 					onReasoningChunk: (chunk: string) => {
 						appendedReasoning += chunk;
 						hasReceivedContent = true;
Author	SHA1	Message	Date
Xuan-Son Nguyen	b3fed31b99	jinja, chat: add --reasoning-preserve flag (#25105) * jinja, chat: add --reasoning-preserve flag * correct help message	2026-06-28 23:33:51 +02:00
Aleksander Grygier	dbdaece23d	Revert "ui: fix accessibility for hover-gated interactive elements assisted by claude(in debugging and tests) (#24727)" (#25098)	2026-06-28 21:30:03 +02:00
Pascal	7cb8576e7c	ui: fix stop and reasoning skip in single-model mode (#25084)	2026-06-28 21:06:43 +02:00
Ruixiang Wang	fa72bc6826	dflash: refactor draft model conversion (#25110) * dflash: refactor draft model conversion * apply fix for eagle3 convert	2026-06-28 20:31:48 +02:00