llama : add guard for K/V rotation input when buffer is unallocated (#25215)

llm_graph_input_attn_kv::set_input and llm_graph_input_attn_kv_iswa::set_input call set_input_k_rot / set_input_v_rot whenever the rotation tensor pointer is non-null, but the tensor's buffer can be unallocated (NULL) when a graph only stores K/V without attending -- e.g. DFlash speculative decoding's KV-injection pass. set_input_k_rot then calls ggml_backend_buffer_is_host() on a NULL buffer and aborts with GGML_ASSERT(buffer).

Guard the four k_rot/v_rot inputs with the same "&& ->buffer" check that the adjacent kq_mask inputs already use in these two functions. When the buffer is unallocated there is no data to upload, so skipping is correct.

Fixes #25191

Signed-off-by: liminfei-amd <91481003+liminfei-amd@users.noreply.github.com>
ui: add sync blocks so display/behavior settings can be set via --ui-config-file (#25132)
2026-07-05 13:43:04 +02:00 · 2026-07-04 22:37:38 +02:00 · 2026-07-04 16:12:27 +02:00 · 2026-07-04 13:37:37 +02:00
5 changed files with 67 additions and 26 deletions
@@ -1913,7 +1913,11 @@ static void ggml_compute_forward_concat_any(
    GGML_ASSERT(dim >= 0 && dim < 4);

    int64_t o[4] = {0, 0, 0, 0};
-    o[dim] = src0->ne[dim];
+    if (dim == 0) {
+        o[dim] = src0->ne[dim]/ggml_blck_size(src0->type);
+    } else {
+        o[dim] = src0->ne[dim];
+    }

    const char * x;

@@ -1921,8 +1925,8 @@ static void ggml_compute_forward_concat_any(
    for (int i3 = 0; i3 < ne3; i3++) {
        for (int i2 = ith; i2 < ne2; i2 += nth) {
            for (int i1 = 0; i1 < ne1; i1++) {
-                for (int i0 = 0; i0 < ne0; i0++) {
-                    if (i0 < ne00 && i1 < ne01 && i2 < ne02 && i3 < ne03) {
+                for (int i0 = 0; i0 < ne0/ggml_blck_size(dst->type); i0++) {
+                    if (i0 < ne00/ggml_blck_size(src0->type) && i1 < ne01 && i2 < ne02 && i3 < ne03) {
                        x = (const char *)src0->data + (i0       )*nb00 + (i1       )*nb01 + (i2       )*nb02 + (i3       )*nb03;
                    } else {
                        x = (const char *)src1->data + (i0 - o[0])*nb10 + (i1 - o[1])*nb11 + (i2 - o[2])*nb12 + (i3 - o[3])*nb13;
@@ -2071,6 +2075,14 @@ void ggml_compute_forward_concat(
    ggml_tensor * dst) {

    const ggml_tensor * src0 = dst->src[0];
+    const ggml_tensor * src1 = dst->src[1];
+
+    if (ggml_is_quantized(src0->type)) {
+        GGML_ASSERT(ggml_is_contiguous(src0));
+        GGML_ASSERT(ggml_is_contiguous(src1));
+        GGML_ASSERT(src0->ne[0] % ggml_blck_size(src0->type) == 0);
+        GGML_ASSERT(src1->ne[0] % ggml_blck_size(src1->type) == 0);
+    }

    switch (src0->type) {
        case GGML_TYPE_F16:
@@ -494,11 +494,11 @@ void llm_graph_input_attn_kv::set_input(const llama_ubatch * ubatch) {
        mctx->set_input_kq_mask(self_kq_mask, ubatch, cparams.causal_attn);
    }

-    if (self_k_rot) {
+    if (self_k_rot && self_k_rot->buffer) {
        mctx->set_input_k_rot(self_k_rot);
    }

-    if (self_v_rot) {
+    if (self_v_rot && self_v_rot->buffer) {
        mctx->set_input_v_rot(self_v_rot);
    }
 }
@@ -592,19 +592,19 @@ void llm_graph_input_attn_kv_iswa::set_input(const llama_ubatch * ubatch) {
        mctx->get_swa()->set_input_kq_mask(self_kq_mask_swa, ubatch, cparams.causal_attn);
    }

-    if (self_k_rot) {
+    if (self_k_rot && self_k_rot->buffer) {
        mctx->get_base()->set_input_k_rot(self_k_rot);
    }

-    if (self_v_rot) {
+    if (self_v_rot && self_v_rot->buffer) {
        mctx->get_base()->set_input_v_rot(self_v_rot);
    }

-    if (self_k_rot_swa) {
+    if (self_k_rot_swa && self_k_rot_swa->buffer) {
        mctx->get_swa()->set_input_k_rot(self_k_rot_swa);
    }

-    if (self_v_rot_swa) {
+    if (self_v_rot_swa && self_v_rot_swa->buffer) {
        mctx->get_swa()->set_input_v_rot(self_v_rot_swa);
    }
 }
@@ -8918,6 +8918,12 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
        }
    }

+    for (ggml_type type_a : { GGML_TYPE_Q4_0, GGML_TYPE_Q4_1, GGML_TYPE_Q5_0, GGML_TYPE_Q5_1, GGML_TYPE_Q8_0 }) {
+        for (int dim : { 0, 1, 2, 3, }) {
+            test_cases.emplace_back(new test_concat(type_a, {128, 12, 13, 14}, dim == 0 ? 256 : 7, dim, 0));
+        }
+    }
+
    for (ggml_sort_order order : {GGML_SORT_ORDER_ASC, GGML_SORT_ORDER_DESC}) {
        for (uint32_t i = 4; i <= 1024*1024; i *= 2) {
            test_cases.emplace_back(new test_argsort(GGML_TYPE_F32, {i-1, 1, 1, 1}));
@@ -69,7 +69,6 @@ export const SETTINGS_KEYS = {
 	// Developer
 	DISABLE_REASONING_PARSING: 'disableReasoningParsing',
 	EXCLUDE_REASONING_FROM_CONTEXT: 'excludeReasoningFromContext',
-	ENABLE_THINKING: 'enableThinking',
 	SHOW_RAW_OUTPUT_SWITCH: 'showRawOutputSwitch',
 	// PY_INTERPRETER_ENABLED: 'pyInterpreterEnabled',
 	JS_SANDBOX_ENABLED: 'jsSandboxEnabled',
@@ -185,7 +185,11 @@ const SETTINGS_REGISTRY: Record<string, SettingsSectionEntry> = {
 				defaultValue: false,
 				type: SettingsFieldType.CHECKBOX,
 				section: SETTINGS_SECTION_SLUGS.GENERAL,
-				isExperimental: true
+				isExperimental: true,
+				sync: {
+					serverKey: SETTINGS_KEYS.TITLE_GENERATION_USE_LLM,
+					paramType: SyncableParameterType.BOOLEAN
+				}
 			},
 			{
 				key: SETTINGS_KEYS.TITLE_GENERATION_PROMPT,
@@ -193,7 +197,11 @@ const SETTINGS_REGISTRY: Record<string, SettingsSectionEntry> = {
 				help: 'Optional template for the title generation prompt. Use {{USER}} for the user message and {{ASSISTANT}} for the assistant message.',
 				defaultValue: TITLE_GENERATION.DEFAULT_PROMPT,
 				type: SettingsFieldType.TEXTAREA,
-				section: SETTINGS_SECTION_SLUGS.GENERAL
+				section: SETTINGS_SECTION_SLUGS.GENERAL,
+				sync: {
+					serverKey: SETTINGS_KEYS.TITLE_GENERATION_PROMPT,
+					paramType: SyncableParameterType.STRING
+				}
 			},
 			{
 				key: SETTINGS_KEYS.MAX_IMAGE_RESOLUTION,
@@ -201,7 +209,11 @@ const SETTINGS_REGISTRY: Record<string, SettingsSectionEntry> = {
 				help: 'Images larger than this will be resized before sending to server. Set to 0 to disable.',
 				defaultValue: 0,
 				type: SettingsFieldType.INPUT,
-				section: SETTINGS_SECTION_SLUGS.GENERAL
+				section: SETTINGS_SECTION_SLUGS.GENERAL,
+				sync: {
+					serverKey: SETTINGS_KEYS.MAX_IMAGE_RESOLUTION,
+					paramType: SyncableParameterType.NUMBER
+				}
 			}
 		]
 	},
@@ -385,7 +397,11 @@ const SETTINGS_REGISTRY: Record<string, SettingsSectionEntry> = {
 				help: 'Display the current build version in the bottom-right corner of the interface.',
 				defaultValue: false,
 				type: SettingsFieldType.CHECKBOX,
-				section: SETTINGS_SECTION_SLUGS.DISPLAY
+				section: SETTINGS_SECTION_SLUGS.DISPLAY,
+				sync: {
+					serverKey: SETTINGS_KEYS.SHOW_BUILD_VERSION,
+					paramType: SyncableParameterType.BOOLEAN
+				}
 			}
 		]
 	},
@@ -669,7 +685,11 @@ const SETTINGS_REGISTRY: Record<string, SettingsSectionEntry> = {
 				help: 'After each response, re-submit the conversation to pre-fill the server KV cache. Makes the next turn faster since the prompt is already encoded while you read the response.',
 				defaultValue: false,
 				type: SettingsFieldType.CHECKBOX,
-				section: SETTINGS_SECTION_SLUGS.DEVELOPER
+				section: SETTINGS_SECTION_SLUGS.DEVELOPER,
+				sync: {
+					serverKey: SETTINGS_KEYS.PRE_ENCODE_CONVERSATION,
+					paramType: SyncableParameterType.BOOLEAN
+				}
 			},
 			{
 				key: SETTINGS_KEYS.DISABLE_REASONING_PARSING,
@@ -677,7 +697,11 @@ const SETTINGS_REGISTRY: Record<string, SettingsSectionEntry> = {
 				help: 'Send reasoning_format=none so the server returns thinking tokens inline instead of extracting them into a separate field.',
 				defaultValue: false,
 				type: SettingsFieldType.CHECKBOX,
-				section: SETTINGS_SECTION_SLUGS.DEVELOPER
+				section: SETTINGS_SECTION_SLUGS.DEVELOPER,
+				sync: {
+					serverKey: SETTINGS_KEYS.DISABLE_REASONING_PARSING,
+					paramType: SyncableParameterType.BOOLEAN
+				}
 			},
 			{
 				key: SETTINGS_KEYS.EXCLUDE_REASONING_FROM_CONTEXT,
@@ -691,14 +715,6 @@ const SETTINGS_REGISTRY: Record<string, SettingsSectionEntry> = {
 					paramType: SyncableParameterType.BOOLEAN
 				}
 			},
-			{
-				key: SETTINGS_KEYS.ENABLE_THINKING,
-				label: 'Enable thinking',
-				help: 'Enable model thinking/reasoning for each request. When off, the model will skip the thinking phase and go straight to the response.',
-				defaultValue: false,
-				type: SettingsFieldType.CHECKBOX,
-				section: SETTINGS_SECTION_SLUGS.DEVELOPER
-			},
 			{
 				key: SETTINGS_KEYS.SHOW_RAW_OUTPUT_SWITCH,
 				label: 'Enable raw output toggle',
@@ -717,7 +733,11 @@ const SETTINGS_REGISTRY: Record<string, SettingsSectionEntry> = {
 				help: 'Expose a run_javascript tool to the model. Code runs in a Web Worker inside a sandboxed iframe with an opaque origin, isolated from the WebUI and its API, with a hard timeout.',
 				defaultValue: false,
 				type: SettingsFieldType.CHECKBOX,
-				section: SETTINGS_SECTION_SLUGS.DEVELOPER
+				section: SETTINGS_SECTION_SLUGS.DEVELOPER,
+				sync: {
+					serverKey: SETTINGS_KEYS.JS_SANDBOX_ENABLED,
+					paramType: SyncableParameterType.BOOLEAN
+				}
 			},
 			{
 				key: SETTINGS_KEYS.CUSTOM_JSON,
@@ -753,7 +773,11 @@ const SETTINGS_REGISTRY: Record<string, SettingsSectionEntry> = {
 				defaultValue: DEFAULT_MCP_CONFIG.requestTimeoutSeconds,
 				type: SettingsFieldType.INPUT,
 				section: SETTINGS_SECTION_SLUGS.MCP,
-				isPositiveInteger: true
+				isPositiveInteger: true,
+				sync: {
+					serverKey: SETTINGS_KEYS.MCP_REQUEST_TIMEOUT_SECONDS,
+					paramType: SyncableParameterType.NUMBER
+				}
 			}
 		]
 	}