kv-cache: follow the source cache size when sharing cells (#24267)

A fitted target context can end up smaller than the draft default; the oversized assistant views then overflow the shared K/V tensors and trip the ggml_view_4d size assert during graph reserve.
2026-06-07 19:43:00 +02:00 · 2026-06-07 18:33:00 +03:00
1 changed files with 11 additions and 0 deletions
--- a/src/llama-kv-cache.cpp
+++ b/src/llama-kv-cache.cpp
@@ -97,6 +97,17 @@ llama_kv_cache::llama_kv_cache(
    model(model), hparams(hparams), v_trans(v_trans),
    n_seq_max(n_seq_max), n_stream(unified ? 1 : n_seq_max), n_pad(n_pad), n_swa(n_swa), swa_type(swa_type) {

+    // shared cells view the source cache's K/V tensors, so the cell count
+    // follows the source allocation: a fitted target can be smaller than the
+    // draft default and oversized views would overflow the source tensors
+    if (mem_other) {
+        const uint32_t size_other = static_cast<llama_kv_cache *>(mem_other)->get_size();
+        if (kv_size != size_other) {
+            LLAMA_LOG_WARN("%s: kv_size = %u overridden to %u to match the shared source cache\n", __func__, kv_size, size_other);
+            kv_size = size_other;
+        }
+    }
+
    GGML_ASSERT(kv_size % n_pad == 0);

    const uint32_t n_layer = hparams.n_layer_all;