vocab : JetBrains Mellum pre-tokenizer (#15045)

model : add text-only support for Kimi-VL (and find special tokens in text_config) (#15051)
* basic kimi-vl textmodel conversion
* check config["text_config"] for special tokens
2026-04-30 06:44:18 +02:00 · 2025-08-03 21:38:18 +02:00 · 2025-08-03 16:56:25 +02:00
4 changed files with 19 additions and 2 deletions
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -852,6 +852,9 @@ class TextModel(ModelBase):
        if chkhsh == "2085e1638f6c377a0aa4ead21b27bb4cb941bf800df86ed391011769c1758dfb":
            # ref: https://huggingface.co/LGAI-EXAONE/EXAONE-4.0-32B
            res = "exaone4"
+        if chkhsh == "a1e163ecab2e718a4c829d1148b6e86824ec36163bb71941c3dca9cd5ac25756":
+            # ref: https://huggingface.co/JetBrains/Mellum-4b-base
+            res = "mellum"

        if res is None:
            logger.warning("\n")
@@ -6059,6 +6062,7 @@ class DeepseekModel(TextModel):

@ModelBase.register("DeepseekV2ForCausalLM")
@ModelBase.register("DeepseekV3ForCausalLM")
+@ModelBase.register("KimiVLForConditionalGeneration")
 class DeepseekV2Model(TextModel):
    model_arch = gguf.MODEL_ARCH.DEEPSEEK2

@@ -6161,6 +6165,13 @@ class DeepseekV2Model(TextModel):
    _experts: list[dict[str, Tensor]] | None = None

    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        # skip vision tensors and remove "language_model." for Kimi-VL
+        if "vision_tower" in name or "multi_modal_projector" in name:
+            return []
+
+        if name.startswith("language_model."):
+            name = name.replace("language_model.", "")
+
        # rename e_score_correction_bias tensors
        if name.endswith("e_score_correction_bias"):
            name = name.replace("e_score_correction_bias", "e_score_correction.bias")
--- a/convert_hf_to_gguf_update.py
+++ b/convert_hf_to_gguf_update.py
@@ -138,6 +138,7 @@ models = [
    {"name": "midm-2.0",         "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/K-intelligence/Midm-2.0-Base-Instruct", },
    {"name": "lfm2",             "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LiquidAI/LFM2-Tokenizer"},
    {"name": "exaone4",          "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LGAI-EXAONE/EXAONE-4.0-32B", },
+    {"name": "mellum",           "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/JetBrains/Mellum-4b-base", },
 ]

 # some models are known to be broken upstream, so we will skip them as exceptions
--- a/gguf-py/gguf/vocab.py
+++ b/gguf-py/gguf/vocab.py
@@ -312,7 +312,11 @@ class SpecialVocab:
        with open(config_file, encoding = 'utf-8') as f:
            config = json.load(f)
        for typ in self.special_token_types:
-            self._set_special_token(typ, config.get(f'{typ}_token_id'))
+            token_id = config.get(f'{typ}_token_id')
+            # If not found at root, check in text_config (for multimodal models like Kimi-VL)
+            if token_id is None and 'text_config' in config:
+                token_id = config['text_config'].get(f'{typ}_token_id')
+            self._set_special_token(typ, token_id)
        return True


--- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp
@@ -1856,7 +1856,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                    tokenizer_pre == "gigachat"   ||
                    tokenizer_pre == "jina-v2-es" ||
                    tokenizer_pre == "jina-v2-de" ||
-                    tokenizer_pre == "a.x-4.0") {
+                    tokenizer_pre == "a.x-4.0" ||
+                    tokenizer_pre == "mellum") {
                pre_type = LLAMA_VOCAB_PRE_TYPE_GPT2;
            } else if (
                    tokenizer_pre == "jina-v1-en" ||
Author	SHA1	Message	Date
Csaba Kecskemeti	97366dc6ab	vocab : JetBrains Mellum pre-tokenizer (#15045)	2025-08-03 21:38:18 +02:00
Gabriel Larson	83bc2f288c	model : add text-only support for Kimi-VL (and find special tokens in text_config) (#15051); basic kimi-vl textmodel conversion; check config["text_config"] for special tokens	2025-08-03 16:56:25 +02:00