imatrix : fix 3d activation handling for hybrid and recurrent models (#14994)

* imatrix : use a single count for dense 3d tensors
* imatrix : fix 3d activations when model tensor is 2d
* imatrix : fix 3d tensor counts
memory : handle kv_unified for hybrid models (#15050)
2026-04-30 14:54:19 +02:00 · 2025-08-03 21:49:13 +02:00 · 2025-08-03 21:43:07 +02:00 · 2025-08-03 21:38:18 +02:00 · 2025-08-03 16:56:25 +02:00
8 changed files with 63 additions and 29 deletions
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -852,6 +852,9 @@ class TextModel(ModelBase):
        if chkhsh == "2085e1638f6c377a0aa4ead21b27bb4cb941bf800df86ed391011769c1758dfb":
            # ref: https://huggingface.co/LGAI-EXAONE/EXAONE-4.0-32B
            res = "exaone4"
+        if chkhsh == "a1e163ecab2e718a4c829d1148b6e86824ec36163bb71941c3dca9cd5ac25756":
+            # ref: https://huggingface.co/JetBrains/Mellum-4b-base
+            res = "mellum"

        if res is None:
            logger.warning("\n")
@@ -6059,6 +6062,7 @@ class DeepseekModel(TextModel):

@ModelBase.register("DeepseekV2ForCausalLM")
@ModelBase.register("DeepseekV3ForCausalLM")
+@ModelBase.register("KimiVLForConditionalGeneration")
 class DeepseekV2Model(TextModel):
    model_arch = gguf.MODEL_ARCH.DEEPSEEK2

@@ -6161,6 +6165,13 @@ class DeepseekV2Model(TextModel):
    _experts: list[dict[str, Tensor]] | None = None

    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        # skip vision tensors and remove "language_model." for Kimi-VL
+        if "vision_tower" in name or "multi_modal_projector" in name:
+            return []
+
+        if name.startswith("language_model."):
+            name = name.replace("language_model.", "")
+
        # rename e_score_correction_bias tensors
        if name.endswith("e_score_correction_bias"):
            name = name.replace("e_score_correction_bias", "e_score_correction.bias")
--- a/convert_hf_to_gguf_update.py
+++ b/convert_hf_to_gguf_update.py
@@ -138,6 +138,7 @@ models = [
    {"name": "midm-2.0",         "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/K-intelligence/Midm-2.0-Base-Instruct", },
    {"name": "lfm2",             "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LiquidAI/LFM2-Tokenizer"},
    {"name": "exaone4",          "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LGAI-EXAONE/EXAONE-4.0-32B", },
+    {"name": "mellum",           "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/JetBrains/Mellum-4b-base", },
 ]

 # some models are known to be broken upstream, so we will skip them as exceptions
--- a/gguf-py/gguf/vocab.py
+++ b/gguf-py/gguf/vocab.py
@@ -312,7 +312,11 @@ class SpecialVocab:
        with open(config_file, encoding = 'utf-8') as f:
            config = json.load(f)
        for typ in self.special_token_types:
-            self._set_special_token(typ, config.get(f'{typ}_token_id'))
+            token_id = config.get(f'{typ}_token_id')
+            # If not found at root, check in text_config (for multimodal models like Kimi-VL)
+            if token_id is None and 'text_config' in config:
+                token_id = config['text_config'].get(f'{typ}_token_id')
+            self._set_special_token(typ, token_id)
        return True


--- a/src/llama-memory-hybrid.cpp
+++ b/src/llama-memory-hybrid.cpp
@@ -25,6 +25,7 @@ llama_memory_hybrid::llama_memory_hybrid(
                         /* common */
             uint32_t    n_seq_max,
                 bool    offload,
+                 bool    unified,
                         /* layer filters */
      layer_filter_cb && filter_attn,
      layer_filter_cb && filter_recr) :
@@ -38,7 +39,7 @@ llama_memory_hybrid::llama_memory_hybrid(
        type_v,
        v_trans,
        offload,
-        1,
+        unified,
        kv_size,
        n_seq_max,
        n_pad,
--- a/src/llama-memory-hybrid.h
+++ b/src/llama-memory-hybrid.h
@@ -39,6 +39,7 @@ public:
                             /* common */
                 uint32_t    n_seq_max,
                     bool    offload,
+                     bool    unified,
                             /* layer filters */
          layer_filter_cb && filter_attn = nullptr,
          layer_filter_cb && filter_recr = nullptr);
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -17598,6 +17598,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
                        /* recurrent_kv_size */ std::max((uint32_t) 1, cparams.n_seq_max),
                        /* n_seq_max         */ cparams.n_seq_max,
                        /* offload           */ cparams.offload_kqv,
+                        /* unified           */ cparams.kv_unified,
                        /* filter_attn       */ (arch == LLM_ARCH_FALCON_H1) ? [&](int32_t) { return true; } : (llama_memory_hybrid::layer_filter_cb)nullptr,
                        /* filter_recr       */ (arch == LLM_ARCH_FALCON_H1) ? [&](int32_t) { return true; } : (llama_memory_hybrid::layer_filter_cb)nullptr);
                } else {
--- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp
@@ -1856,7 +1856,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                    tokenizer_pre == "gigachat"   ||
                    tokenizer_pre == "jina-v2-es" ||
                    tokenizer_pre == "jina-v2-de" ||
-                    tokenizer_pre == "a.x-4.0") {
+                    tokenizer_pre == "a.x-4.0" ||
+                    tokenizer_pre == "mellum") {
                pre_type = LLAMA_VOCAB_PRE_TYPE_GPT2;
            } else if (
                    tokenizer_pre == "jina-v1-en" ||
--- a/tools/imatrix/imatrix.cpp
+++ b/tools/imatrix/imatrix.cpp
@@ -250,13 +250,6 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
    const char * data = is_host ? (const char *) src1->data : m_src1_data.data();
    GGML_ASSERT(src1->nb[0] == ggml_element_size(src1));

-    // TODO: 4d? (is that even used in practice?)
-    // the extra dimension would need to be stored somewhere to be reflected in the imatrix file
-    if (ggml_nrows(src1) != src1->ne[1] * src1->ne[2]) {
-        LOG_ERR("%s: tensor has more than 3 dimensions: %s", __func__, wname.c_str());
-        GGML_ASSERT(false);
-    }
-
    // this has been adapted to the new format of storing merged experts in a single 3d tensor
    // ref: https://github.com/ggml-org/llama.cpp/pull/6387
    if (t->op == GGML_OP_MUL_MAT_ID) {
@@ -272,6 +265,12 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *

        GGML_ASSERT(ids->ne[1] == src1->ne[2]);

+        // the extra dimension would need to be stored somewhere to be reflected in the imatrix file
+        if (ggml_nrows(src1) != src1->ne[1] * src1->ne[2]) {
+            LOG_ERR("%s: tensor has more than 3 dimensions: %s", __func__, wname.c_str());
+            GGML_ASSERT(false);
+        }
+
        m_ids.resize(ggml_nbytes(ids));
        ggml_backend_tensor_get(ids, m_ids.data(), 0, ggml_nbytes(ids));

@@ -335,29 +334,40 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
        }
    } else {
        auto & e = m_stats[wname];
-        const int64_t n_mat = src1->ne[2] * src1->ne[3];
+        const int64_t n_mat = src0->ne[2] * src0->ne[3];

+        // use a single count per dense tensor
+        // (necessary when merging older GGUF-imatrix files with 3d tensors)
+        if (e.counts.size() > 1) {
+            bool all_equal = true;
+            for (size_t i = 1; i < e.counts.size(); ++i) {
+                if (e.counts[0] != e.counts[i]) {
+                    all_equal = false;
+                    break;
+                }
+            }
+            if (all_equal) {
+                e.counts.resize(1);
+            }
+        }
        if (e.values.empty()) {
            e.values.resize(src1->ne[0] * n_mat, 0);
-            e.counts.resize(n_mat, 0);
+            e.counts.resize(1, 0);
        }
        else if (e.values.size() != (size_t)(src1->ne[0] * n_mat)) {
            LOG_ERR("%s: inconsistent size for %s (%d vs %d)\n", __func__, wname.c_str(), (int)e.values.size(), (int)(src1->ne[0] * n_mat));
            exit(1); //GGML_ABORT("fatal error");
        }
-        else if (e.counts.size() != (size_t)n_mat) {
-            LOG_ERR("%s: inconsistent expert count for %s (%d vs %d)\n", __func__, wname.c_str(), (int)e.counts.size(), (int)n_mat);
-            exit(1); //GGML_ABORT("fatal error");
-        }
        LOG_DBGV(2, "%s[%d]: %32s, %s, %5d x %5d x %5d, %d\n", __func__, m_last_chunk, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[1], (int)src1->ne[2], (int)src1->type);
+
        for (int64_t i3 = 0; i3 < src1->ne[3]; ++i3) {
            for (int64_t i2 = 0; i2 < src1->ne[2]; ++i2) {
-                const int64_t mat_id = i3 * src1->ne[2] + i2;
+                // handle 3D+ tensors, but flatten 3D+ activations when model tensor is 2D
+                const int64_t mat_id = (i3 % src0->ne[3]) * src0->ne[2] + (i2 % src0->ne[2]);
                const int64_t mat_start = mat_id * src1->ne[0];

                for (int64_t row = 0; row < src1->ne[1]; ++row) {
-                    const float * x = (const float *) (data + row * src1->nb[1] + i2 * src1->nb[2] + i3 * src1->ne[3]);
-                    e.counts[mat_id]++;
+                    const float * x = (const float *) (data + row * src1->nb[1] + i2 * src1->nb[2] + i3 * src1->nb[3]);
                    for (int64_t j = 0; j < src1->ne[0]; ++j) {
                        e.values[mat_start + j] += x[j] * x[j];
                        if (!std::isfinite((float)e.values[j])) {
@@ -366,16 +376,20 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
                        }
                    }
                }
-                const int32_t n_chunk = e.counts[mat_id] / chunk_size;
-                if (n_chunk > m_last_chunk) {
-                    const int32_t chunk_step = n_chunk - m_last_chunk;
-                    m_last_chunk = n_chunk;
-                    if ((m_last_chunk % m_params.n_out_freq) / chunk_step == 0) {
-                        save_imatrix();
-                    }
-                    if (m_params.n_save_freq > 0 && (m_last_chunk % m_params.n_save_freq) / chunk_step == 0) {
-                        save_imatrix(m_last_chunk);
-                    }
+            }
+        }
+        // only 1 count in practice, except when a tensor is used for both MUL_MAT_ID and MUL_MAT
+        for (size_t i = 0; i < e.counts.size(); ++i) {
+            e.counts[i] += ggml_nrows(src1) / n_mat;
+            const int32_t n_chunk = e.counts[i] / chunk_size;
+            if (n_chunk > m_last_chunk) {
+                const int32_t chunk_step = n_chunk - m_last_chunk;
+                m_last_chunk = n_chunk;
+                if ((m_last_chunk % m_params.n_out_freq) / chunk_step == 0) {
+                    save_imatrix();
+                }
+                if (m_params.n_save_freq > 0 && (m_last_chunk % m_params.n_save_freq) / chunk_step == 0) {
+                    save_imatrix(m_last_chunk);
                }
            }
        }
Author	SHA1	Message	Date
compilade	0a2f5496be	imatrix : fix 3d activation handling for hybrid and recurrent models (#14994) * imatrix : use a single count for dense 3d tensors * imatrix : fix 3d activations when model tensor is 2d * imatrix : fix 3d tensor counts	2025-08-03 21:49:13 +02:00
compilade	11a3811164	memory : handle kv_unified for hybrid models (#15050)	2025-08-03 21:43:07 +02:00
Csaba Kecskemeti	97366dc6ab	vocab : JetBrains Mellum pre-tokenizer (#15045)	2025-08-03 21:38:18 +02:00
Gabriel Larson	83bc2f288c	model : add text-only support for Kimi-VL (and find special tokens in text_config) (#15051) * basic kimi-vl textmodel conversion * check config["text_config"] for special tokens	2025-08-03 16:56:25 +02:00