Compare commits

...

3 Commits

Author SHA1 Message Date
liminfei-amd a4107133a6 llama : add guard for K/V rotation input when buffer is unallocated (#25215)
llm_graph_input_attn_kv::set_input and llm_graph_input_attn_kv_iswa::set_input
call set_input_k_rot / set_input_v_rot whenever the rotation tensor pointer is
non-null, but the tensor's buffer can be unallocated (NULL) when a graph only
stores K/V without attending -- e.g. DFlash speculative decoding's KV-injection
pass. set_input_k_rot then calls ggml_backend_buffer_is_host() on a NULL buffer
and aborts with GGML_ASSERT(buffer).

Guard the four k_rot/v_rot inputs with the same "&& ->buffer" check that the
adjacent kq_mask inputs already use in these two functions. When the buffer is
unallocated there is no data to upload, so skipping is correct.

Fixes #25191

Signed-off-by: liminfei-amd <91481003+liminfei-amd@users.noreply.github.com>
2026-07-04 22:37:38 +02:00
Pascal 665892536d ui: add sync blocks so display/behavior settings can be set via --ui-config-file (#25132)
* ui: add sync blocks so display/behavior settings can be set via --ui-config-file

* ui: remove enable thinking setting
2026-07-04 16:12:27 +02:00
fairydreaming ef2d770117 ggml : fix broken CPU concat implementation for quantized types (#25247)
* ggml : fix broken CPU concat implementation for quantized types

* tests : concat tests for quantized types

---------

Co-authored-by: Stanisław Szymczyk <sszymczy@gmail.com>
2026-07-04 13:37:37 +02:00
5 changed files with 67 additions and 26 deletions
+15 -3
View File
@@ -1913,7 +1913,11 @@ static void ggml_compute_forward_concat_any(
GGML_ASSERT(dim >= 0 && dim < 4);
int64_t o[4] = {0, 0, 0, 0};
o[dim] = src0->ne[dim];
if (dim == 0) {
o[dim] = src0->ne[dim]/ggml_blck_size(src0->type);
} else {
o[dim] = src0->ne[dim];
}
const char * x;
@@ -1921,8 +1925,8 @@ static void ggml_compute_forward_concat_any(
for (int i3 = 0; i3 < ne3; i3++) {
for (int i2 = ith; i2 < ne2; i2 += nth) {
for (int i1 = 0; i1 < ne1; i1++) {
for (int i0 = 0; i0 < ne0; i0++) {
if (i0 < ne00 && i1 < ne01 && i2 < ne02 && i3 < ne03) {
for (int i0 = 0; i0 < ne0/ggml_blck_size(dst->type); i0++) {
if (i0 < ne00/ggml_blck_size(src0->type) && i1 < ne01 && i2 < ne02 && i3 < ne03) {
x = (const char *)src0->data + (i0 )*nb00 + (i1 )*nb01 + (i2 )*nb02 + (i3 )*nb03;
} else {
x = (const char *)src1->data + (i0 - o[0])*nb10 + (i1 - o[1])*nb11 + (i2 - o[2])*nb12 + (i3 - o[3])*nb13;
@@ -2071,6 +2075,14 @@ void ggml_compute_forward_concat(
ggml_tensor * dst) {
const ggml_tensor * src0 = dst->src[0];
const ggml_tensor * src1 = dst->src[1];
if (ggml_is_quantized(src0->type)) {
GGML_ASSERT(ggml_is_contiguous(src0));
GGML_ASSERT(ggml_is_contiguous(src1));
GGML_ASSERT(src0->ne[0] % ggml_blck_size(src0->type) == 0);
GGML_ASSERT(src1->ne[0] % ggml_blck_size(src1->type) == 0);
}
switch (src0->type) {
case GGML_TYPE_F16:
+6 -6
View File
@@ -494,11 +494,11 @@ void llm_graph_input_attn_kv::set_input(const llama_ubatch * ubatch) {
mctx->set_input_kq_mask(self_kq_mask, ubatch, cparams.causal_attn);
}
if (self_k_rot) {
if (self_k_rot && self_k_rot->buffer) {
mctx->set_input_k_rot(self_k_rot);
}
if (self_v_rot) {
if (self_v_rot && self_v_rot->buffer) {
mctx->set_input_v_rot(self_v_rot);
}
}
@@ -592,19 +592,19 @@ void llm_graph_input_attn_kv_iswa::set_input(const llama_ubatch * ubatch) {
mctx->get_swa()->set_input_kq_mask(self_kq_mask_swa, ubatch, cparams.causal_attn);
}
if (self_k_rot) {
if (self_k_rot && self_k_rot->buffer) {
mctx->get_base()->set_input_k_rot(self_k_rot);
}
if (self_v_rot) {
if (self_v_rot && self_v_rot->buffer) {
mctx->get_base()->set_input_v_rot(self_v_rot);
}
if (self_k_rot_swa) {
if (self_k_rot_swa && self_k_rot_swa->buffer) {
mctx->get_swa()->set_input_k_rot(self_k_rot_swa);
}
if (self_v_rot_swa) {
if (self_v_rot_swa && self_v_rot_swa->buffer) {
mctx->get_swa()->set_input_v_rot(self_v_rot_swa);
}
}
+6
View File
@@ -8918,6 +8918,12 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
}
}
for (ggml_type type_a : { GGML_TYPE_Q4_0, GGML_TYPE_Q4_1, GGML_TYPE_Q5_0, GGML_TYPE_Q5_1, GGML_TYPE_Q8_0 }) {
for (int dim : { 0, 1, 2, 3, }) {
test_cases.emplace_back(new test_concat(type_a, {128, 12, 13, 14}, dim == 0 ? 256 : 7, dim, 0));
}
}
for (ggml_sort_order order : {GGML_SORT_ORDER_ASC, GGML_SORT_ORDER_DESC}) {
for (uint32_t i = 4; i <= 1024*1024; i *= 2) {
test_cases.emplace_back(new test_argsort(GGML_TYPE_F32, {i-1, 1, 1, 1}));
@@ -69,7 +69,6 @@ export const SETTINGS_KEYS = {
// Developer
DISABLE_REASONING_PARSING: 'disableReasoningParsing',
EXCLUDE_REASONING_FROM_CONTEXT: 'excludeReasoningFromContext',
ENABLE_THINKING: 'enableThinking',
SHOW_RAW_OUTPUT_SWITCH: 'showRawOutputSwitch',
// PY_INTERPRETER_ENABLED: 'pyInterpreterEnabled',
JS_SANDBOX_ENABLED: 'jsSandboxEnabled',
+40 -16
View File
@@ -185,7 +185,11 @@ const SETTINGS_REGISTRY: Record<string, SettingsSectionEntry> = {
defaultValue: false,
type: SettingsFieldType.CHECKBOX,
section: SETTINGS_SECTION_SLUGS.GENERAL,
isExperimental: true
isExperimental: true,
sync: {
serverKey: SETTINGS_KEYS.TITLE_GENERATION_USE_LLM,
paramType: SyncableParameterType.BOOLEAN
}
},
{
key: SETTINGS_KEYS.TITLE_GENERATION_PROMPT,
@@ -193,7 +197,11 @@ const SETTINGS_REGISTRY: Record<string, SettingsSectionEntry> = {
help: 'Optional template for the title generation prompt. Use {{USER}} for the user message and {{ASSISTANT}} for the assistant message.',
defaultValue: TITLE_GENERATION.DEFAULT_PROMPT,
type: SettingsFieldType.TEXTAREA,
section: SETTINGS_SECTION_SLUGS.GENERAL
section: SETTINGS_SECTION_SLUGS.GENERAL,
sync: {
serverKey: SETTINGS_KEYS.TITLE_GENERATION_PROMPT,
paramType: SyncableParameterType.STRING
}
},
{
key: SETTINGS_KEYS.MAX_IMAGE_RESOLUTION,
@@ -201,7 +209,11 @@ const SETTINGS_REGISTRY: Record<string, SettingsSectionEntry> = {
help: 'Images larger than this will be resized before sending to server. Set to 0 to disable.',
defaultValue: 0,
type: SettingsFieldType.INPUT,
section: SETTINGS_SECTION_SLUGS.GENERAL
section: SETTINGS_SECTION_SLUGS.GENERAL,
sync: {
serverKey: SETTINGS_KEYS.MAX_IMAGE_RESOLUTION,
paramType: SyncableParameterType.NUMBER
}
}
]
},
@@ -385,7 +397,11 @@ const SETTINGS_REGISTRY: Record<string, SettingsSectionEntry> = {
help: 'Display the current build version in the bottom-right corner of the interface.',
defaultValue: false,
type: SettingsFieldType.CHECKBOX,
section: SETTINGS_SECTION_SLUGS.DISPLAY
section: SETTINGS_SECTION_SLUGS.DISPLAY,
sync: {
serverKey: SETTINGS_KEYS.SHOW_BUILD_VERSION,
paramType: SyncableParameterType.BOOLEAN
}
}
]
},
@@ -669,7 +685,11 @@ const SETTINGS_REGISTRY: Record<string, SettingsSectionEntry> = {
help: 'After each response, re-submit the conversation to pre-fill the server KV cache. Makes the next turn faster since the prompt is already encoded while you read the response.',
defaultValue: false,
type: SettingsFieldType.CHECKBOX,
section: SETTINGS_SECTION_SLUGS.DEVELOPER
section: SETTINGS_SECTION_SLUGS.DEVELOPER,
sync: {
serverKey: SETTINGS_KEYS.PRE_ENCODE_CONVERSATION,
paramType: SyncableParameterType.BOOLEAN
}
},
{
key: SETTINGS_KEYS.DISABLE_REASONING_PARSING,
@@ -677,7 +697,11 @@ const SETTINGS_REGISTRY: Record<string, SettingsSectionEntry> = {
help: 'Send reasoning_format=none so the server returns thinking tokens inline instead of extracting them into a separate field.',
defaultValue: false,
type: SettingsFieldType.CHECKBOX,
section: SETTINGS_SECTION_SLUGS.DEVELOPER
section: SETTINGS_SECTION_SLUGS.DEVELOPER,
sync: {
serverKey: SETTINGS_KEYS.DISABLE_REASONING_PARSING,
paramType: SyncableParameterType.BOOLEAN
}
},
{
key: SETTINGS_KEYS.EXCLUDE_REASONING_FROM_CONTEXT,
@@ -691,14 +715,6 @@ const SETTINGS_REGISTRY: Record<string, SettingsSectionEntry> = {
paramType: SyncableParameterType.BOOLEAN
}
},
{
key: SETTINGS_KEYS.ENABLE_THINKING,
label: 'Enable thinking',
help: 'Enable model thinking/reasoning for each request. When off, the model will skip the thinking phase and go straight to the response.',
defaultValue: false,
type: SettingsFieldType.CHECKBOX,
section: SETTINGS_SECTION_SLUGS.DEVELOPER
},
{
key: SETTINGS_KEYS.SHOW_RAW_OUTPUT_SWITCH,
label: 'Enable raw output toggle',
@@ -717,7 +733,11 @@ const SETTINGS_REGISTRY: Record<string, SettingsSectionEntry> = {
help: 'Expose a run_javascript tool to the model. Code runs in a Web Worker inside a sandboxed iframe with an opaque origin, isolated from the WebUI and its API, with a hard timeout.',
defaultValue: false,
type: SettingsFieldType.CHECKBOX,
section: SETTINGS_SECTION_SLUGS.DEVELOPER
section: SETTINGS_SECTION_SLUGS.DEVELOPER,
sync: {
serverKey: SETTINGS_KEYS.JS_SANDBOX_ENABLED,
paramType: SyncableParameterType.BOOLEAN
}
},
{
key: SETTINGS_KEYS.CUSTOM_JSON,
@@ -753,7 +773,11 @@ const SETTINGS_REGISTRY: Record<string, SettingsSectionEntry> = {
defaultValue: DEFAULT_MCP_CONFIG.requestTimeoutSeconds,
type: SettingsFieldType.INPUT,
section: SETTINGS_SECTION_SLUGS.MCP,
isPositiveInteger: true
isPositiveInteger: true,
sync: {
serverKey: SETTINGS_KEYS.MCP_REQUEST_TIMEOUT_SECONDS,
paramType: SyncableParameterType.NUMBER
}
}
]
}