ui: add sync blocks so display/behavior settings can be set via --ui-config-file (#25132)

* ui: add sync blocks so display/behavior settings can be set via --ui-config-file
* ui: remove enable thinking setting
ggml : fix broken CPU concat implementation for quantized types (#25247)
2026-07-04 21:23:03 +02:00 · 2026-07-04 16:12:27 +02:00 · 2026-07-04 13:37:37 +02:00 · 2026-07-03 23:12:11 +02:00 · 2026-07-03 19:03:51 +02:00 · 2026-07-03 17:32:48 +02:00
17 changed files with 270 additions and 227 deletions
@@ -2378,6 +2378,23 @@ static void func_args_not_string(json & messages) {
    }
 }

+// StepFun workaround: strip surrounding whitespace from a message's content,
+// reasoning_content and every "text" content part; other part types are untouched.
+// Must run on the messages rather than the rendered JSON: templates with string-only
+// content caps merge typed parts into one string, hiding the per-part whitespace.
+static void trim_all_content(std::vector<common_chat_msg> & messages) {
+    for (common_chat_msg & msg : messages) {
+        msg.content           = trim_whitespace(msg.content);
+        msg.reasoning_content = trim_whitespace(msg.reasoning_content);
+        for (auto & piece : msg.content_parts) {
+            if (piece.type != "text") {
+                continue;
+            }
+            piece.text = trim_whitespace(piece.text);
+        }
+    }
+}
+
 }

 // MiniCPM5 format:
@@ -2634,7 +2651,16 @@ static common_chat_params common_chat_templates_apply_jinja(const struct common_
        params.tools.is_array() && tmpls->template_tool_use ? *tmpls->template_tool_use : *tmpls->template_default;
    const auto & src             = tmpl.source();
    const auto & caps            = tmpl.original_caps();
-    params.messages              = render_message_to_json(inputs.messages, tmpl.original_caps());
+    std::vector<common_chat_msg>        trimmed_messages;
+    const std::vector<common_chat_msg> * messages_to_render = &inputs.messages;
+    if (src.find("You have access to the following functions in JSONSchema format") != std::string::npos) {
+        // StepFun: trim message contents (including typed content parts) before rendering,
+        // otherwise leftover whitespace drives the model into reasoning loops (issue #24181)
+        trimmed_messages   = inputs.messages;
+        workaround::trim_all_content(trimmed_messages);
+        messages_to_render = &trimmed_messages;
+    }
+    params.messages              = render_message_to_json(*messages_to_render, tmpl.original_caps());
    params.tool_choice           = inputs.tool_choice;
    params.reasoning_format      = inputs.reasoning_format;
    params.enable_thinking       = inputs.enable_thinking;
@@ -955,10 +955,11 @@ struct common_speculative_impl_draft_dflash : public common_speculative_impl {
        LOG_INF("%s: - block_size=%d, mask_token_id=%d, n_extract=%u\n", __func__, block_size, mask_token_id, target_layer_ids_n);

        // DFlash input is [id_last, <mask> * (block_size-1)], so it can draft at most block_size-1 tokens per step
-        if (this->params.n_max > block_size - 1) {
-            LOG_WRN("%s: requested draft size %d exceeds the trained DFlash block size %d -- clamping to %d draft tokens per step\n",
-                    __func__, this->params.n_max, block_size - 1, block_size - 1);
-            this->params.n_max = block_size - 1;
+        if (this->params.n_max > block_size - 1 || this->params.n_min > block_size - 1) {
+            LOG_WRN("%s: requested draft size (n_max=%d, n_min=%d) exceeds the trained DFlash block size %d -- clamping to %d\n",
+                    __func__, this->params.n_max, this->params.n_min, block_size, block_size - 1);
+            this->params.n_max = std::min(this->params.n_max, block_size - 1);
+            this->params.n_min = std::min(this->params.n_min, block_size - 1);
        }

        batch        = llama_batch_init(llama_n_batch(ctx_dft), 0,          n_seq);
@@ -968,7 +969,7 @@ struct common_speculative_impl_draft_dflash : public common_speculative_impl {
        for (auto & s : smpls) {
            common_params_sampling sparams;
            sparams.no_perf  = false;
-            sparams.top_k    = 1;
+            sparams.top_k    = 10;
            sparams.samplers = { COMMON_SAMPLER_TYPE_TOP_K };
            s.reset(common_sampler_init(model_dft, sparams));
        }
@@ -1173,10 +1174,18 @@ struct common_speculative_impl_draft_dflash : public common_speculative_impl {

                const llama_token id = cur_p->data[0].id;

+                if (cur_p->data[0].p < params.p_min) {
+                    break;
+                }
+
                common_sampler_accept(smpl, id, true);

                result.push_back(id);
            }
+
+            if (result.size() < (size_t) params.n_min) {
+                result.clear();
+            }
        }
    }

@@ -1913,7 +1913,11 @@ static void ggml_compute_forward_concat_any(
    GGML_ASSERT(dim >= 0 && dim < 4);

    int64_t o[4] = {0, 0, 0, 0};
-    o[dim] = src0->ne[dim];
+    if (dim == 0) {
+        o[dim] = src0->ne[dim]/ggml_blck_size(src0->type);
+    } else {
+        o[dim] = src0->ne[dim];
+    }

    const char * x;

@@ -1921,8 +1925,8 @@ static void ggml_compute_forward_concat_any(
    for (int i3 = 0; i3 < ne3; i3++) {
        for (int i2 = ith; i2 < ne2; i2 += nth) {
            for (int i1 = 0; i1 < ne1; i1++) {
-                for (int i0 = 0; i0 < ne0; i0++) {
-                    if (i0 < ne00 && i1 < ne01 && i2 < ne02 && i3 < ne03) {
+                for (int i0 = 0; i0 < ne0/ggml_blck_size(dst->type); i0++) {
+                    if (i0 < ne00/ggml_blck_size(src0->type) && i1 < ne01 && i2 < ne02 && i3 < ne03) {
                        x = (const char *)src0->data + (i0       )*nb00 + (i1       )*nb01 + (i2       )*nb02 + (i3       )*nb03;
                    } else {
                        x = (const char *)src1->data + (i0 - o[0])*nb10 + (i1 - o[1])*nb11 + (i2 - o[2])*nb12 + (i3 - o[3])*nb13;
@@ -2071,6 +2075,14 @@ void ggml_compute_forward_concat(
    ggml_tensor * dst) {

    const ggml_tensor * src0 = dst->src[0];
+    const ggml_tensor * src1 = dst->src[1];
+
+    if (ggml_is_quantized(src0->type)) {
+        GGML_ASSERT(ggml_is_contiguous(src0));
+        GGML_ASSERT(ggml_is_contiguous(src1));
+        GGML_ASSERT(src0->ne[0] % ggml_blck_size(src0->type) == 0);
+        GGML_ASSERT(src1->ne[0] % ggml_blck_size(src1->type) == 0);
+    }

    switch (src0->type) {
        case GGML_TYPE_F16:
@@ -312,6 +312,10 @@ static void launch_topk_moe_cuda(ggml_backend_cuda_context & ctx,
            ggml_cuda_kernel_launch(topk_moe_cuda<256, has_bias>, launch_params,
                logits, weights, ids, bias, n_rows, n_expert_used, clamp_val, scale_val, config);
            break;
+        case 288: // StepFun 3.7
+            ggml_cuda_kernel_launch(topk_moe_cuda<288, has_bias>, launch_params,
+                logits, weights, ids, bias, n_rows, n_expert_used, clamp_val, scale_val, config);
+            break;
        case 512:
            ggml_cuda_kernel_launch(topk_moe_cuda<512, has_bias>, launch_params,
                logits, weights, ids, bias, n_rows, n_expert_used, clamp_val, scale_val, config);
@@ -377,8 +381,10 @@ bool ggml_cuda_should_use_topk_moe(const ggml_tensor * gating_op,
                                   const ggml_tensor * weights,
                                   const ggml_tensor * logits,
                                   const ggml_tensor * ids) {
+    // must match an instantiation of launch_topk_moe_cuda: a power of 2 up to 512,
+    // or one of the non-power-of-2 expert counts of supported models
    const int n_expert = ids->nb[1] / ids->nb[0];
-    if (((n_expert & (n_expert - 1)) != 0 || n_expert > 512) && n_expert != 576) {
+    if (((n_expert & (n_expert - 1)) != 0 || n_expert > 512) && n_expert != 288 && n_expert != 576) {
        return false;
    }

@@ -1,80 +0,0 @@
-{% macro render_content(content) %}{% if content is none %}{{- '' }}{% elif content is string %}{{- content }}{% elif content is mapping %}{{- content['value'] if 'value' in content else content['text'] }}{% elif content is iterable %}{% for item in content %}{% if item.type == 'text' %}{{- item['value'] if 'value' in item else item['text'] }}{% elif item.type == 'image' %}<im_patch>{% endif %}{% endfor %}{% endif %}{% endmacro %}
-{{bos_token}}{%- if tools %}
-    {{- '<|im_start|>system\n' }}
-    {%- if messages[0].role == 'system' %}
-        {{- render_content(messages[0].content) + '\n\n' }}
-    {%- endif %}
-    {{- "# Tools\n\nYou have access to the following functions in JSONSchema format:\n\n<tools>" }}
-    {%- for tool in tools %}
-        {{- "\n" }}
-        {{- tool | tojson(ensure_ascii=False) }}
-    {%- endfor %}
-    {{- "\n</tools>\n\nIf you choose to call a function ONLY reply in the following format with NO suffix:\n\n<tool_call>\n<function=example_function_name>\n<parameter=example_parameter_1>\nvalue_1\n</parameter>\n<parameter=example_parameter_2>\nThis is the value for the second parameter\nthat can span\nmultiple lines\n</parameter>\n</function>\n</tool_call>\n\n<IMPORTANT>\nReminder:\n- Function calls MUST follow the specified format: an inner <function=...>\n...\n</function> block must be nested within <tool_call>\n...\n</tool_call> XML tags\n- Required parameters MUST be specified\n</IMPORTANT><|im_end|>\n" }}
-{%- else %}
-    {%- if messages[0].role == 'system' %}
-        {{- '<|im_start|>system\n' + render_content(messages[0].content) + '<|im_end|>\n' }}
-    {%- endif %}
-{%- endif %}
-{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
-{%- for message in messages[::-1] %}
-    {%- set index = (messages|length - 1) - loop.index0 %}
-    {%- if ns.multi_step_tool and message.role == "user" and render_content(message.content) is string and not(render_content(message.content).startswith('<tool_response>') and render_content(message.content).endswith('</tool_response>')) %}
-        {%- set ns.multi_step_tool = false %}
-        {%- set ns.last_query_index = index %}
-    {%- endif %}
-{%- endfor %}
-{%- for message in messages %}
-    {%- set content = render_content(message.content) %}
-    {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
-        {%- set role_name = 'observation' if (message.role == "system" and not loop.first and message.name == 'observation') else message.role %}
-        {{- '<|im_start|>' + role_name + '\n' + content + '<|im_end|>' + '\n' }}
-    {%- elif message.role == "assistant" %}
-        {%- if message.reasoning_content is string %}
-            {%- set reasoning_content = render_content(message.reasoning_content) %}
-        {%- else %}
-            {%- if '</think>' in content %}
-                {%- set reasoning_content = content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
-                {%- set content = content.split('</think>')[-1].lstrip('\n') %}
-            {%- else %}
-                {%- set reasoning_content = '' %}
-            {%- endif %}
-        {%- endif %}
-        {%- if loop.index0 > ns.last_query_index %}
-            {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content + '\n</think>\n' + content }}
-        {%- else %}
-            {{- '<|im_start|>' + message.role + '\n' + content }}
-        {%- endif %}
-        {%- if message.tool_calls %}
-            {%- for tool_call in message.tool_calls %}
-                {%- if tool_call.function is defined %}
-                    {%- set tool_call = tool_call.function %}
-                {%- endif %}
-                {{- '<tool_call>\n<function=' + tool_call.name + '>\n' }}
-                {%- if tool_call.arguments is defined %}
-                    {%- set arguments = tool_call.arguments %}
-                    {%- for args_name, args_value in arguments|items %}
-                        {{- '<parameter=' + args_name + '>\n' }}
-                        {%- set args_value = args_value | tojson(ensure_ascii=False) | safe if args_value is mapping or (args_value is sequence and args_value is not string) else args_value | string %}
-                        {{- args_value }}
-                        {{- '\n</parameter>\n' }}
-                    {%- endfor %}
-                {%- endif %}
-                {{- '</function>\n</tool_call>' }}
-            {%- endfor %}
-        {%- endif %}
-        {{- '<|im_end|>\n' }}
-    {%- elif message.role == "tool" %}
-        {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
-            {{- '<|im_start|>tool_response\n' }}
-        {%- endif %}
-        {{- '<tool_response>' }}
-        {{- content }}
-        {{- '</tool_response>' }}
-        {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
-            {{- '<|im_end|>\n' }}
-        {%- endif %}
-    {%- endif %}
-{%- endfor %}
-{%- if add_generation_prompt %}
-    {{- '<|im_start|>assistant\n<think>\n' }}
-{%- endif %}
@@ -8918,6 +8918,12 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
        }
    }

+    for (ggml_type type_a : { GGML_TYPE_Q4_0, GGML_TYPE_Q4_1, GGML_TYPE_Q5_0, GGML_TYPE_Q5_1, GGML_TYPE_Q8_0 }) {
+        for (int dim : { 0, 1, 2, 3, }) {
+            test_cases.emplace_back(new test_concat(type_a, {128, 12, 13, 14}, dim == 0 ? 256 : 7, dim, 0));
+        }
+    }
+
    for (ggml_sort_order order : {GGML_SORT_ORDER_ASC, GGML_SORT_ORDER_DESC}) {
        for (uint32_t i = 4; i <= 1024*1024; i *= 2) {
            test_cases.emplace_back(new test_argsort(GGML_TYPE_F32, {i-1, 1, 1, 1}));
@@ -9219,6 +9225,7 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
                    test_cases.emplace_back(new test_topk_moe({128, 1, 1, 1}, 128, with_norm, bias_probs, gate, scale_w));
                    test_cases.emplace_back(new test_topk_moe({129, 1, 1, 1}, 128, with_norm, bias_probs, gate, scale_w));
                    test_cases.emplace_back(new test_topk_moe({160, 4, 1, 1}, 160, with_norm, bias_probs, gate, scale_w));
+                    test_cases.emplace_back(new test_topk_moe({288, 22, 1, 1}, 8, with_norm, bias_probs, gate, scale_w)); // Used by StepFun 3.7
                }
            }
        }
@@ -1887,7 +1887,6 @@ static void test_role_markers_all_templates(testing & t) {
        { "Qwen-Qwen3-0.6B.jinja",                           "<|im_start|>user",       "<|im_start|>assistant"      },
        { "Qwen-QwQ-32B.jinja",                              "<|im_start|>user",       "<|im_start|>assistant"      },
        { "StepFun3.5-Flash.jinja",                          "<|im_start|>user",       "<|im_start|>assistant"      },
-        { "stepfun-ai-Step-3.5-Flash.jinja",                 "<|im_start|>user",       "<|im_start|>assistant"      },

        // DeepSeek family
        { "deepseek-ai-DeepSeek-R1-Distill-Llama-8B.jinja",  "<｜User｜>",                "<｜Assistant｜>"             },
@@ -3155,6 +3155,59 @@ static void test_template_output_peg_parsers(bool detailed_debug) {
                }
            }
        }
+
+        {
+            // StepFun trimming regression test (see https://github.com/ggml-org/llama.cpp/pull/25238)
+            auto tmpls = read_templates("models/templates/StepFun3.5-Flash.jinja");
+
+            common_chat_msg message_chatbot = simple_assist_msg("Let me check.\n\n", "I am thinking.\n\n");
+
+            {
+                common_chat_templates_inputs inputs;
+                inputs.messages              = { message_chatbot };
+                inputs.add_generation_prompt = true;
+
+                auto params = common_chat_templates_apply(tmpls.get(), inputs);
+
+                if (params.prompt.find("Let me check.\n\n") != std::string::npos) {
+                    throw std::runtime_error("StepFun 3.5: content not trimmed");
+                }
+
+                if (params.prompt.find("I am thinking.\n\n") != std::string::npos) {
+                    throw std::runtime_error("StepFun 3.5: reasoning_content not trimmed");
+                }
+            }
+
+            {
+                // Trimming must also reach typed (text) content parts, not just string content
+                // (see https://github.com/ggml-org/llama.cpp/pull/25238)
+                common_chat_msg message_parts;
+                message_parts.role          = "user";
+                message_parts.content_parts = {
+                    { /* .type = */ "text", /* .text = */ "First part.\n\n" },
+                    { /* .type = */ "media_marker", /* .text = */ "<__media__>" },
+                    { /* .type = */ "text", /* .text = */ "Second part.\n\n" },
+                };
+
+                common_chat_templates_inputs inputs;
+                inputs.messages              = { message_parts };
+                inputs.add_generation_prompt = true;
+
+                auto params = common_chat_templates_apply(tmpls.get(), inputs);
+
+                if (params.prompt.find("First part.\n\n") != std::string::npos ||
+                    params.prompt.find("Second part.\n\n") != std::string::npos) {
+                    throw std::runtime_error("StepFun 3.5: text content parts not trimmed");
+                }
+
+                // the trimmed text itself must still be present
+                if (params.prompt.find("First part.") == std::string::npos ||
+                    params.prompt.find("Second part.") == std::string::npos) {
+                    throw std::runtime_error("StepFun 3.5: text content parts missing after trim");
+                }
+            }
+        }
+
    }

    {
@@ -20,9 +20,9 @@
 		agenticInjectSteeringMessage
 	} from '$lib/stores/agentic.svelte';
 	import {
+		buildSiblingInfoMap,
 		copyToClipboard,
 		formatMessageForClipboard,
-		getMessageSiblings,
 		hasAgenticContent
 	} from '$lib/utils';

@@ -169,6 +169,8 @@
 		});
 	});

+	let siblingInfoByMessageId = $derived(buildSiblingInfoMap(allConversationMessages));
+
 	let displayMessages = $derived.by(() => {
 		if (!messages.length) {
 			return [];
@@ -223,18 +225,18 @@
 				}
 			}

-			const siblingInfo = getMessageSiblings(allConversationMessages, msg.id);
+			const siblingInfo = siblingInfoByMessageId.get(msg.id) ?? {
+				message: msg,
+				siblingIds: [msg.id],
+				currentIndex: 0,
+				totalSiblings: 1
+			};

 			result.push({
 				message: msg,
 				toolMessages,
 				isLastAssistantMessage: false,
-				siblingInfo: siblingInfo || {
-					message: msg,
-					siblingIds: [msg.id],
-					currentIndex: 0,
-					totalSiblings: 1
-				}
+				siblingInfo
 			});
 		}

@@ -37,3 +37,8 @@ export const MODEL_ACTIVATED_PARAMS_RE = /^[Aa]\d+(\.\d+)?[BbMmKkTt]$/;
 * Container format segments to exclude from tags (every model uses these).
 */
 export const MODEL_IGNORED_SEGMENTS = new Set(['GGUF', 'GGML']);
+
+/**
+ * Matches a trailing weight file extension, e.g. `model.gguf` -> `model`.
+ */
+export const MODEL_WEIGHT_EXTENSION_RE = /\.(gguf|ggml)$/i;
@@ -69,7 +69,6 @@ export const SETTINGS_KEYS = {
 	// Developer
 	DISABLE_REASONING_PARSING: 'disableReasoningParsing',
 	EXCLUDE_REASONING_FROM_CONTEXT: 'excludeReasoningFromContext',
-	ENABLE_THINKING: 'enableThinking',
 	SHOW_RAW_OUTPUT_SWITCH: 'showRawOutputSwitch',
 	// PY_INTERPRETER_ENABLED: 'pyInterpreterEnabled',
 	JS_SANDBOX_ENABLED: 'jsSandboxEnabled',
@@ -185,7 +185,11 @@ const SETTINGS_REGISTRY: Record<string, SettingsSectionEntry> = {
 				defaultValue: false,
 				type: SettingsFieldType.CHECKBOX,
 				section: SETTINGS_SECTION_SLUGS.GENERAL,
-				isExperimental: true
+				isExperimental: true,
+				sync: {
+					serverKey: SETTINGS_KEYS.TITLE_GENERATION_USE_LLM,
+					paramType: SyncableParameterType.BOOLEAN
+				}
 			},
 			{
 				key: SETTINGS_KEYS.TITLE_GENERATION_PROMPT,
@@ -193,7 +197,11 @@ const SETTINGS_REGISTRY: Record<string, SettingsSectionEntry> = {
 				help: 'Optional template for the title generation prompt. Use {{USER}} for the user message and {{ASSISTANT}} for the assistant message.',
 				defaultValue: TITLE_GENERATION.DEFAULT_PROMPT,
 				type: SettingsFieldType.TEXTAREA,
-				section: SETTINGS_SECTION_SLUGS.GENERAL
+				section: SETTINGS_SECTION_SLUGS.GENERAL,
+				sync: {
+					serverKey: SETTINGS_KEYS.TITLE_GENERATION_PROMPT,
+					paramType: SyncableParameterType.STRING
+				}
 			},
 			{
 				key: SETTINGS_KEYS.MAX_IMAGE_RESOLUTION,
@@ -201,7 +209,11 @@ const SETTINGS_REGISTRY: Record<string, SettingsSectionEntry> = {
 				help: 'Images larger than this will be resized before sending to server. Set to 0 to disable.',
 				defaultValue: 0,
 				type: SettingsFieldType.INPUT,
-				section: SETTINGS_SECTION_SLUGS.GENERAL
+				section: SETTINGS_SECTION_SLUGS.GENERAL,
+				sync: {
+					serverKey: SETTINGS_KEYS.MAX_IMAGE_RESOLUTION,
+					paramType: SyncableParameterType.NUMBER
+				}
 			}
 		]
 	},
@@ -385,7 +397,11 @@ const SETTINGS_REGISTRY: Record<string, SettingsSectionEntry> = {
 				help: 'Display the current build version in the bottom-right corner of the interface.',
 				defaultValue: false,
 				type: SettingsFieldType.CHECKBOX,
-				section: SETTINGS_SECTION_SLUGS.DISPLAY
+				section: SETTINGS_SECTION_SLUGS.DISPLAY,
+				sync: {
+					serverKey: SETTINGS_KEYS.SHOW_BUILD_VERSION,
+					paramType: SyncableParameterType.BOOLEAN
+				}
 			}
 		]
 	},
@@ -669,7 +685,11 @@ const SETTINGS_REGISTRY: Record<string, SettingsSectionEntry> = {
 				help: 'After each response, re-submit the conversation to pre-fill the server KV cache. Makes the next turn faster since the prompt is already encoded while you read the response.',
 				defaultValue: false,
 				type: SettingsFieldType.CHECKBOX,
-				section: SETTINGS_SECTION_SLUGS.DEVELOPER
+				section: SETTINGS_SECTION_SLUGS.DEVELOPER,
+				sync: {
+					serverKey: SETTINGS_KEYS.PRE_ENCODE_CONVERSATION,
+					paramType: SyncableParameterType.BOOLEAN
+				}
 			},
 			{
 				key: SETTINGS_KEYS.DISABLE_REASONING_PARSING,
@@ -677,7 +697,11 @@ const SETTINGS_REGISTRY: Record<string, SettingsSectionEntry> = {
 				help: 'Send reasoning_format=none so the server returns thinking tokens inline instead of extracting them into a separate field.',
 				defaultValue: false,
 				type: SettingsFieldType.CHECKBOX,
-				section: SETTINGS_SECTION_SLUGS.DEVELOPER
+				section: SETTINGS_SECTION_SLUGS.DEVELOPER,
+				sync: {
+					serverKey: SETTINGS_KEYS.DISABLE_REASONING_PARSING,
+					paramType: SyncableParameterType.BOOLEAN
+				}
 			},
 			{
 				key: SETTINGS_KEYS.EXCLUDE_REASONING_FROM_CONTEXT,
@@ -691,14 +715,6 @@ const SETTINGS_REGISTRY: Record<string, SettingsSectionEntry> = {
 					paramType: SyncableParameterType.BOOLEAN
 				}
 			},
-			{
-				key: SETTINGS_KEYS.ENABLE_THINKING,
-				label: 'Enable thinking',
-				help: 'Enable model thinking/reasoning for each request. When off, the model will skip the thinking phase and go straight to the response.',
-				defaultValue: false,
-				type: SettingsFieldType.CHECKBOX,
-				section: SETTINGS_SECTION_SLUGS.DEVELOPER
-			},
 			{
 				key: SETTINGS_KEYS.SHOW_RAW_OUTPUT_SWITCH,
 				label: 'Enable raw output toggle',
@@ -717,7 +733,11 @@ const SETTINGS_REGISTRY: Record<string, SettingsSectionEntry> = {
 				help: 'Expose a run_javascript tool to the model. Code runs in a Web Worker inside a sandboxed iframe with an opaque origin, isolated from the WebUI and its API, with a hard timeout.',
 				defaultValue: false,
 				type: SettingsFieldType.CHECKBOX,
-				section: SETTINGS_SECTION_SLUGS.DEVELOPER
+				section: SETTINGS_SECTION_SLUGS.DEVELOPER,
+				sync: {
+					serverKey: SETTINGS_KEYS.JS_SANDBOX_ENABLED,
+					paramType: SyncableParameterType.BOOLEAN
+				}
 			},
 			{
 				key: SETTINGS_KEYS.CUSTOM_JSON,
@@ -753,7 +773,11 @@ const SETTINGS_REGISTRY: Record<string, SettingsSectionEntry> = {
 				defaultValue: DEFAULT_MCP_CONFIG.requestTimeoutSeconds,
 				type: SettingsFieldType.INPUT,
 				section: SETTINGS_SECTION_SLUGS.MCP,
-				isPositiveInteger: true
+				isPositiveInteger: true,
+				sync: {
+					serverKey: SETTINGS_KEYS.MCP_REQUEST_TIMEOUT_SECONDS,
+					paramType: SyncableParameterType.NUMBER
+				}
 			}
 		]
 	}
@@ -551,13 +551,49 @@ const mcpDefaultEnabledMigration: Migration = {
 	}
 };

+const CONFIG_TYPES_MIGRATION_ID = 'config-type-normalization-v1';
+
+/** One-off migration: top-level "true"/"false" strings in the stored config become booleans. */
+const configTypesMigration: Migration = {
+	id: CONFIG_TYPES_MIGRATION_ID,
+	description: 'Coerce legacy string-encoded booleans in persisted config to real booleans',
+
+	async run(): Promise<void> {
+		const configRaw = localStorage.getItem(CONFIG_LOCALSTORAGE_KEY);
+		if (configRaw === null) return;
+
+		const config = JSON.parse(configRaw);
+		// A stored literal `null` parses fine, but Object.keys(null) would throw below.
+		if (config === null || typeof config !== 'object') return;
+		let changed = false;
+
+		// Pre-schema configs persisted booleans as the strings "true"/"false", which the
+		// strict server schema now rejects. Coerce those back to real booleans. No config
+		// string field holds exactly "true"/"false", so the match is unambiguous.
+		for (const key of Object.keys(config)) {
+			const value = config[key];
+			if (value !== 'true' && value !== 'false') continue;
+			config[key] = value === 'true';
+			changed = true;
+		}
+
+		if (changed) {
+			localStorage.setItem(CONFIG_LOCALSTORAGE_KEY, JSON.stringify(config));
+		}
+
+		if (import.meta.env.DEV && import.meta.env.VITE_DEBUG)
+			console.log(`[Migration] Config types: coerced string booleans (changed=${changed})`);
+	}
+};
+
 const migrations: Migration[] = [
 	localStorageMigration,
 	idxdbMigration,
 	legacyMessageMigration,
 	themeMigration,
 	customJsonKeyMigration,
-	mcpDefaultEnabledMigration
+	mcpDefaultEnabledMigration,
+	configTypesMigration
 ];

 export const MigrationService = {
@@ -1,5 +1,5 @@
 import { ServerModelStatus } from '$lib/enums';
-import { apiFetch, apiPost } from '$lib/utils';
+import { apiFetch, apiPost, normalizeModelName } from '$lib/utils';
 import type { ParsedModelId } from '$lib/types/models';
 import {
 	MODEL_QUANTIZATION_SEGMENT_RE,
@@ -7,6 +7,7 @@ import {
 	MODEL_PARAMS_RE,
 	MODEL_ACTIVATED_PARAMS_RE,
 	MODEL_IGNORED_SEGMENTS,
+	MODEL_WEIGHT_EXTENSION_RE,
 	MODEL_ID_NOT_FOUND,
 	MODEL_ID_ORG_SEPARATOR,
 	MODEL_ID_SEGMENT_SEPARATOR,
@@ -139,15 +140,19 @@ export class ModelsService {
 			tags: []
 		};

+		// strip directory path and weight extension so a bare `-m /path/file.gguf`
+		// parses like a clean repo id; the HF `org/model` form is preserved
+		const source = normalizeModelName(modelId).replace(MODEL_WEIGHT_EXTENSION_RE, '');
+
 		// 1. Extract colon-separated quantization (e.g. `model:Q4_K_M`)
-		const colonIdx = modelId.indexOf(MODEL_ID_QUANTIZATION_SEPARATOR);
+		const colonIdx = source.indexOf(MODEL_ID_QUANTIZATION_SEPARATOR);
 		let modelPath: string;

 		if (colonIdx !== MODEL_ID_NOT_FOUND) {
-			result.quantization = modelId.slice(colonIdx + 1) || null;
-			modelPath = modelId.slice(0, colonIdx);
+			result.quantization = source.slice(colonIdx + 1) || null;
+			modelPath = source.slice(0, colonIdx);
 		} else {
-			modelPath = modelId;
+			modelPath = source;
 		}

 		// 2. Extract org name (e.g. `org/model` -> org = "org")
@@ -114,14 +114,13 @@ class ConversationsStore {

 	/** Load thinking-enabled default from localStorage */
 	private static loadThinkingDefaults(): boolean {
-		if (typeof globalThis.localStorage === 'undefined') return false;
+		if (typeof globalThis.localStorage === 'undefined') return true;
 		try {
 			const raw = localStorage.getItem(THINKING_ENABLED_DEFAULT_LOCALSTORAGE_KEY);
-			if (!raw) return false;
-			const parsed = raw === 'true';
-			return typeof parsed === 'boolean' ? parsed : false;
+			if (!raw) return true;
+			return raw === 'true';
 		} catch {
-			return false;
+			return true;
 		}
 	}

@@ -333,7 +332,7 @@ class ConversationsStore {
 			}

 			this.pendingMcpServerOverrides = [];
-			this.pendingThinkingEnabled = false;
+			this.pendingThinkingEnabled = ConversationsStore.loadThinkingDefaults();
 			this.activeConversation = conversation;

 			if (conversation.currNode) {
@@ -92,18 +92,14 @@ export function filterByLeafNodeId(
 * Finds the leaf node (message with no children) for a given message branch.
 * Traverses down the tree following the last child until reaching a leaf.
 *
- * @param messages - All messages in the conversation
+ * @param nodeMap - Map of messages keyed by ID
 * @param messageId - Starting message ID to find leaf for
 * @returns The leaf node ID, or the original messageId if no children
 */
-export function findLeafNode(messages: readonly DatabaseMessage[], messageId: string): string {
-	const nodeMap = new Map<string, DatabaseMessage>();
-
-	// Build node map for quick lookups
-	for (const msg of messages) {
-		nodeMap.set(msg.id, msg);
-	}
-
+function findLeafNodeInMap(
+	nodeMap: ReadonlyMap<string, DatabaseMessage>,
+	messageId: string
+): string {
 	let currentNode: DatabaseMessage | undefined = nodeMap.get(messageId);
 	while (currentNode && currentNode.children.length > 0) {
 		// Follow the last child (most recent branch)
@@ -114,6 +110,22 @@ export function findLeafNode(messages: readonly DatabaseMessage[], messageId: st
 	return currentNode?.id ?? messageId;
 }

+/**
+ * Array-based entry point for {@link findLeafNodeInMap}: indexes `messages` by ID,
+ * then resolves the leaf of the branch that starts at `messageId`.
+ *
+ * The walk itself follows the last child of each node until a childless one is hit.
+ *
+ * @param messages - All messages in the conversation
+ * @param messageId - Starting message ID to find leaf for
+ * @returns The leaf node ID, or the original messageId if no children
+ */
+export function findLeafNode(messages: readonly DatabaseMessage[], messageId: string): string {
+	const byId = new Map<string, DatabaseMessage>();
+	for (const msg of messages) byId.set(msg.id, msg);
+	return findLeafNodeInMap(byId, messageId);
+}
+
 /**
 * Finds all descendant messages (children, grandchildren, etc.) of a given message.
 * This is used for cascading deletion to remove all messages in a branch.
@@ -156,21 +168,14 @@ export function findDescendantMessages(
 * Gets sibling information for a message, including all sibling IDs and current position.
 * Siblings are messages that share the same parent.
 *
- * @param messages - All messages in the conversation
+ * @param nodeMap - Map of messages keyed by ID
 * @param messageId - The message to get sibling info for
 * @returns Sibling information including leaf node IDs for navigation
 */
 export function getMessageSiblings(
-	messages: readonly DatabaseMessage[],
+	nodeMap: ReadonlyMap<string, DatabaseMessage>,
 	messageId: string
 ): ChatMessageSiblingInfo | null {
-	const nodeMap = new Map<string, DatabaseMessage>();
-
-	// Build node map for quick lookups
-	for (const msg of messages) {
-		nodeMap.set(msg.id, msg);
-	}
-
 	const message = nodeMap.get(messageId);
 	if (!message) {
 		return null;
@@ -203,7 +208,9 @@ export function getMessageSiblings(

 	// Convert sibling message IDs to their corresponding leaf node IDs
 	// This allows navigation between different conversation branches
-	const siblingLeafIds = siblingIds.map((siblingId: string) => findLeafNode(messages, siblingId));
+	const siblingLeafIds = siblingIds.map((siblingId: string) =>
+		findLeafNodeInMap(nodeMap, siblingId)
+	);

 	// Find current message's position among siblings
 	const currentIndex = siblingIds.indexOf(messageId);
@@ -217,85 +224,22 @@ export function getMessageSiblings(
 }

 /**
- * Creates a display-ready list of messages with sibling information for UI rendering.
- * This is the main function used by chat components to render conversation branches.
+ * Builds sibling information for every message in a conversation.
+ * A single node map is shared across all lookups for O(1) access.
 *
 * @param messages - All messages in the conversation
- * @param leafNodeId - Current leaf node being viewed
- * @returns Array of messages with sibling navigation info
+ * @returns Map of message ID to its sibling information
 */
-export function getMessageDisplayList(
-	messages: readonly DatabaseMessage[],
-	leafNodeId: string
-): ChatMessageSiblingInfo[] {
-	// Get the current conversation path
-	const currentPath = filterByLeafNodeId(messages, leafNodeId, true);
-	const result: ChatMessageSiblingInfo[] = [];
-
-	// Add sibling info for each message in the current path
-	for (const message of currentPath) {
-		if (message.type === 'root') {
-			continue; // Skip root messages in display
-		}
-
-		const siblingInfo = getMessageSiblings(messages, message.id);
-		if (siblingInfo) {
-			result.push(siblingInfo);
+export function buildSiblingInfoMap(
+	messages: readonly DatabaseMessage[]
+): Map<string, ChatMessageSiblingInfo> {
+	const nodeMap = new Map(messages.map((msg) => [msg.id, msg] as const));
+	const siblingMap = new Map<string, ChatMessageSiblingInfo>();
+	for (const msg of messages) {
+		const info = getMessageSiblings(nodeMap, msg.id);
+		if (info) {
+			siblingMap.set(msg.id, info);
 		}
 	}
-
-	return result;
-}
-
-/**
- * Checks if a message has multiple siblings (indicating branching at that point).
- *
- * @param messages - All messages in the conversation
- * @param messageId - The message to check
- * @returns True if the message has siblings
- */
-export function hasMessageSiblings(
-	messages: readonly DatabaseMessage[],
-	messageId: string
-): boolean {
-	const siblingInfo = getMessageSiblings(messages, messageId);
-	return siblingInfo ? siblingInfo.totalSiblings > 1 : false;
-}
-
-/**
- * Gets the next sibling message ID for navigation.
- *
- * @param messages - All messages in the conversation
- * @param messageId - Current message ID
- * @returns Next sibling's leaf node ID, or null if at the end
- */
-export function getNextSibling(
-	messages: readonly DatabaseMessage[],
-	messageId: string
-): string | null {
-	const siblingInfo = getMessageSiblings(messages, messageId);
-	if (!siblingInfo || siblingInfo.currentIndex >= siblingInfo.totalSiblings - 1) {
-		return null;
-	}
-
-	return siblingInfo.siblingIds[siblingInfo.currentIndex + 1];
-}
-
-/**
- * Gets the previous sibling message ID for navigation.
- *
- * @param messages - All messages in the conversation
- * @param messageId - Current message ID
- * @returns Previous sibling's leaf node ID, or null if at the beginning
- */
-export function getPreviousSibling(
-	messages: readonly DatabaseMessage[],
-	messageId: string
-): string | null {
-	const siblingInfo = getMessageSiblings(messages, messageId);
-	if (!siblingInfo || siblingInfo.currentIndex <= 0) {
-		return null;
-	}
-
-	return siblingInfo.siblingIds[siblingInfo.currentIndex - 1];
+	return siblingMap;
 }
@@ -26,10 +26,7 @@ export {
 	findLeafNode,
 	findDescendantMessages,
 	getMessageSiblings,
-	getMessageDisplayList,
-	hasMessageSiblings,
-	getNextSibling,
-	getPreviousSibling
+	buildSiblingInfoMap
 } from './branching';

 // Code
Author	SHA1	Message	Date
Pascal	665892536d	ui: add sync blocks so display/behavior settings can be set via --ui-config-file (#25132 ) * ui: add sync blocks so display/behavior settings can be set via --ui-config-file * ui: remove enable thinking setting	2026-07-04 16:12:27 +02:00
fairydreaming	ef2d770117	ggml : fix broken CPU concat implementation for quantized types (#25247 ) * ggml : fix broken CPU concat implementation for quantized types * tests : concat tests for quantized types --------- Co-authored-by: Stanisław Szymczyk <sszymczy@gmail.com>	2026-07-04 13:37:37 +02:00
Piotr Wilkin (ilintar)	2d973636e2	chat: trim messages sent to StepFun parser (fixes long reasoning loops) (#25238 ) * chat: trim messages sent to StepFun parser (fixes long reasoning loops) * add regression test; remove duplicate template * chat: trim StepFun content parts before rendering The StepFun trim workaround ran on the already-rendered messages, where typed content parts have been concatenated into a single string, so the per-part whitespace could no longer be reached. Move the trim ahead of rendering and apply it to content_parts text as well as the string content and reasoning_content. Adds a content-parts regression test. Co-Authored-By: Piotr Wilkin <ilintar@gmail.com> Assisted-By: Claude Fable 5 <noreply@anthropic.com> --------- Co-authored-by: tarruda <tpadilha84@gmail.com>	2026-07-03 23:12:11 +02:00
Nick Towle	d4cff114c0	ui: Improve performance when streaming (#25225 ) * ui: Improve performance when streaming * ui: build sibling info map in branching utils Moves the node map and sibling map construction from the .by block into buildSiblingInfoMap() in branching.ts. The map is built once per structural change and only read afterwards, so it does not need SvelteMap reactivity. Keeping the construction in plain TypeScript fixes the svelte/prefer-svelte-reactivity lint error and groups the branching logic where it already lives. --------- Co-authored-by: Pascal <admin@serveurperso.com>	2026-07-03 19:03:51 +02:00
Pascal	f113e02d5a	ui: strip path and weight extension from model id in single model mode (#25137 )	2026-07-03 17:32:48 +02:00
Ruixiang Wang	152d337fad	spec: support spec-draft-p-min in DFlash (#25246 ) * spec: support spec-draft-p-min in DFlash * dflash: add n_min guard * dflash: guard both n_min and n_max	2026-07-03 15:40:06 +02:00
Piotr Wilkin (ilintar)	75a48a9055	cuda: enable topk-moe fusion for 288 experts (#25267 ) * cuda: enable topk-moe fusion for 288 experts The topk-moe fusion only accepted power-of-2 expert counts (or the special-cased 576), so models with 288 experts (e.g. Step-3.7-Flash) fell back to the unfused per-layer routing chain: softmax/sigmoid, argsort, get_rows, sum_rows, div, clamp, scale. At batch size 1 that is ~330 extra tiny graph nodes per token. 288 is a multiple of the warp size, so the existing kernel already handles it; this adds the missing template instantiation and accepts 288 in the eligibility check. Measured on gfx1151 with Step-3.7-Flash IQ4_XS (llama-bench, -b 4096 -ub 4096 -fa 1 -dio 1 -ctk q8_0 -ctv q8_0; machine idle, before/after paired so pp4096 stays matched as a load control), reported as test: before → after: pp4096: 460.99 ± 0.45 → 462.47 ± 0.34 (unchanged); tg128: 19.10 ± 0.04 → 19.56 ± 0.03 (+2.4%); tg128 @ d30000: 12.68 ± 0.04 → 12.69 ± 0.03 (unchanged). Prompt processing is unaffected (the fusion only touches decode routing). The decode gain is ~+2.4% at shallow context and fades with depth: by 30k tokens each step is attention-bound over the KV cache, so removing the fixed routing overhead is no longer visible. Assisted-By: Claude Fable 5 <noreply@anthropic.com> * Update tests/test-backend-ops.cpp Co-authored-by: Oliver Simons <osimons@nvidia.com> * Add comment for case 288 in topk-moe.cu --------- Co-authored-by: Oliver Simons <osimons@nvidia.com>	2026-07-03 15:36:55 +02:00
Pascal	067de93718	ui: align persisted config with strict server schema and enable thinking by default (#25242 ) * ui: migrate legacy string-encoded booleans in persisted config * ui: enable thinking by default Fresh users and legacy conversations without a persisted thinking preference now default to enabled. The per-conversation toggle and the persisted localStorage choice keep taking precedence. Picks up the enable_thinking default from #24876.	2026-07-03 13:14:52 +02:00