Compare commits

...

1 Commits
b9480 ... b9481

Author SHA1 Message Date
Hans Florian
bfb4308b05 model : support granite multilingual embeddings R2 (ibm-granite/granite-embedding-{97,311}m-multilingual-r2) (#22716)
* Add support for the ibm-granite/granite-embedding-{97m,311m}-multilingual-r2 embedding models:

* Added a version of the gpt4o tokenizer that has a fixed regex (better handling of marks), and different token merging setting for the 97m model
* Reused gemma4 tokenizer for the 311m model

* granite-embedding-*-multilingual-r2 : add support SwiGLU FFN for Granite Embedding Multilingual R2

* added new GGUF key <arch>.hidden_activation (LLM_KV_HIDDEN_ACT) + writer
* added a forward declaration of llm_ffn_op_type to llama-hparams.h
* added llm_ffn_op in hparams
* added LLM_FFN_NONE = 0 sentinel to llm_ffn_op_type (value-initialization), modern-bert: explicitly assigns LLM_FFN_GEGLU before reading GGUF (unchanged).
* centralized hidden_act mapping in llama-model.cpp, added llm_ffn_op_type_from_string() helper, mirroring rope_scaling_type/llama_rope_scaling_type_from_string()
* modern-bert reads the GGUF key (when present) and uses the resulting op in its FFN graph

* Added granite-embedding-{97m,311m}-multilingual-r2 to the converter code

* Added the hashes for the granite embedding multilingual R2 models
* Set the hidden_activation in the GGUF if the field is present in config.json (such as for the granite embedding models)
2026-06-02 17:55:11 +02:00
14 changed files with 140 additions and 57 deletions

View File

@@ -1657,6 +1657,12 @@ class TextModel(ModelBase):
if chkhsh == "36f3066e97b7f3994b379aaacde306c1444c6ae84e81a5ae3cd2b7ed3b8c42d4":
# ref: https://huggingface.co/openbmb/MiniCPM5-1B
res = "minicpm5"
if chkhsh == "f241072145675bf8322086f115aebad05e9f869557a238bf2150a2a417d1bf60":
# ref: https://huggingface.co/ibm-granite/granite-embedding-97m-multilingual-r2
res = "granite-embed-multi-97m"
if chkhsh == "789696f5946cc0fc59371f39f6097cafed196b3acded6140432f26bbb1ae1669":
# ref: https://huggingface.co/ibm-granite/granite-embedding-311m-multilingual-r2
res = "granite-embed-multi-311m"
if res is None:
logger.warning("\n")

View File

@@ -603,6 +603,12 @@ class ModernBertModel(BertModel):
self.gguf_writer.add_sliding_window_pattern(sliding_window_pattern)
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
self.gguf_writer.add_vocab_size(self.hparams["vocab_size"])
# FFN activation: ModernBert uses a GLU pair (ffn_up output is 2*n_ff). The
# original ModernBERT uses GELU (-> GeGLU); some derivatives such as IBM
# Granite Embedding 97m R2 use SiLU (-> SwiGLU). Persist this so the
# llama.cpp graph can pick the matching activation.
if hidden_act := self.hparams.get("hidden_activation"):
self.gguf_writer.add_hidden_act(hidden_act)
@classmethod
def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None:

View File

@@ -158,6 +158,8 @@ models = [
{"name": "sarvam-moe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/sarvamai/sarvam-30b", },
{"name": "talkie", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/lewtun/talkie-1930-13b-it-hf", },
{"name": "minicpm5", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/openbmb/MiniCPM5-1B"},
{"name": "granite-embed-multi-97m", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/ibm-granite/granite-embedding-97m-multilingual-r2", },
{"name": "granite-embed-multi-311m", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/ibm-granite/granite-embedding-311m-multilingual-r2", },
]
# some models are known to be broken upstream, so we will skip them as exceptions

View File

@@ -150,6 +150,7 @@ class Keys:
EMBD_LENGTH_PER_LAYER_INP = "{arch}.embedding_length_per_layer_input"
SWIGLU_CLAMP_EXP = "{arch}.swiglu_clamp_exp"
SWIGLU_CLAMP_SHEXP = "{arch}.swiglu_clamp_shexp"
HIDDEN_ACT = "{arch}.hidden_activation"
DENSE_FEAT_IN_SIZE = "{arch}.{dense}_feat_in"
DENSE_FEAT_OUT_SIZE = "{arch}.{dense}_feat_out"

View File

@@ -853,6 +853,9 @@ class GGUFWriter:
def add_swiglu_clamp_shexp(self, values: Sequence[float]) -> None:
self.add_array(Keys.LLM.SWIGLU_CLAMP_SHEXP.format(arch=self.arch), values)
def add_hidden_act(self, value: str) -> None:
self.add_string(Keys.LLM.HIDDEN_ACT.format(arch=self.arch), value)
def add_expert_group_scale(self, value: float) -> None:
self.add_float32(Keys.LLM.EXPERT_GROUP_SCALE.format(arch=self.arch), value)

View File

@@ -195,6 +195,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
{ LLM_KV_MOE_LATENT_SIZE, "%s.moe_latent_size" },
{ LLM_KV_NEXTN_PREDICT_LAYERS, "%s.nextn_predict_layers" },
{ LLM_KV_NUM_DEEPSTACK_LAYERS, "%s.n_deepstack_layers" },
{ LLM_KV_HIDDEN_ACT, "%s.hidden_activation" },
{ LLM_KV_POOLING_TYPE, "%s.pooling_type" },
{ LLM_KV_LOGIT_SCALE, "%s.logit_scale" },
{ LLM_KV_DECODER_START_TOKEN_ID, "%s.decoder_start_token_id" },

View File

@@ -199,6 +199,7 @@ enum llm_kv {
LLM_KV_MOE_LATENT_SIZE,
LLM_KV_NEXTN_PREDICT_LAYERS,
LLM_KV_NUM_DEEPSTACK_LAYERS,
LLM_KV_HIDDEN_ACT,
LLM_KV_POOLING_TYPE,
LLM_KV_LOGIT_SCALE,
LLM_KV_DECODER_START_TOKEN_ID,

View File

@@ -36,7 +36,8 @@ enum llm_graph_type {
LLM_GRAPH_TYPE_DECODER_MTP,
};
enum llm_ffn_op_type {
enum llm_ffn_op_type : int {
LLM_FFN_NONE = 0, // sentinel: unset; archs must assign before use
LLM_FFN_SILU,
LLM_FFN_GELU,
LLM_FFN_RELU,

View File

@@ -23,6 +23,9 @@ enum llama_swa_type {
LLAMA_SWA_TYPE_SYMMETRIC = 3,
};
// forward declaration; full definition in llama-graph.h
enum llm_ffn_op_type : int;
struct llama_hparams_posnet {
uint32_t n_embd;
uint32_t n_layer;
@@ -227,6 +230,14 @@ struct llama_hparams {
enum llama_rope_scaling_type rope_scaling_type_train = LLAMA_ROPE_SCALING_TYPE_NONE;
// Resolved FFN gated activation flavor for archs that read
// `<arch>.hidden_activation` from the GGUF (e.g. ModernBert derivatives).
// Defaults to LLM_FFN_NONE (sentinel = 0); the mapping from the GGUF
// string to a real op is done at hparam-load time via
// llm_ffn_op_type_from_string() in llama-model.cpp, mirroring how
// rope_scaling_type_train is handled.
enum llm_ffn_op_type llm_ffn_op;
// Step35: optional per-layer clamps for (Swi)GLU
std::array<float, LLAMA_MAX_LAYERS> swiglu_clamp_exp; // clamping for expert FFN
std::array<float, LLAMA_MAX_LAYERS> swiglu_clamp_shexp; // shared expert

View File

@@ -822,6 +822,28 @@ static llama_rope_scaling_type llama_rope_scaling_type_from_string(const std::st
return LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED;
}
// Maps the GGUF `<arch>.hidden_activation` string to the FFN op type used by the
// graph builders. Only gated activations that map cleanly to llm_ffn_op_type are
// listed; unrecognized values fall back to GeGLU, which matches the historical
// default for ModernBert-style architectures.
static const std::map<std::string, llm_ffn_op_type> LLM_FFN_OP_TYPES_FROM_STRING = {
{ "gelu", LLM_FFN_GEGLU },
{ "geglu", LLM_FFN_GEGLU },
{ "silu", LLM_FFN_SWIGLU },
{ "swish", LLM_FFN_SWIGLU },
{ "swiglu", LLM_FFN_SWIGLU },
{ "relu", LLM_FFN_RELU },
{ "reglu", LLM_FFN_REGLU },
};
llm_ffn_op_type llm_ffn_op_type_from_string(const std::string & name, llm_ffn_op_type fallback) {
const auto it = LLM_FFN_OP_TYPES_FROM_STRING.find(name);
if (it != LLM_FFN_OP_TYPES_FROM_STRING.end()) {
return it->second;
}
return fallback;
}
// CPU: ACCEL -> GPU host -> CPU extra -> CPU
static buft_list_t make_cpu_buft_list(const std::vector<llama_device> & devices, bool use_extra_bufts, bool no_host) {
buft_list_t buft_list;

View File

@@ -145,6 +145,10 @@ enum llm_type {
std::string llama_rope_scaling_type_name(llama_rope_scaling_type rope_scaling_type);
// Map a GGUF activation-name string to llm_ffn_op_type. Returns `fallback` if
// the string is empty or not recognized.
llm_ffn_op_type llm_ffn_op_type_from_string(const std::string & name, llm_ffn_op_type fallback);
struct llama_layer_posnet {
// resnet
struct ggml_tensor * norm1 = nullptr;

View File

@@ -432,6 +432,15 @@ struct llm_tokenizer_bpe : llm_tokenizer {
"[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))*((?=[\\p{L}])([^A-Z]))+(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?|[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))+((?=[\\p{L}])([^A-Z]))*(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
};
break;
case LLAMA_VOCAB_PRE_TYPE_GRANITE_EMB_MULTI:
// Same lookaheads as GPT4O but with \p{M} added so combining marks
// (diacritics) attach to their base letters. Avoids excessive
// backtracking on scripts that use them heavily (Bengali, Hindi,
// Telugu, Thai, ...). See PR #22716 for benchmarks.
regex_exprs = {
"[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}\\p{M}])([^a-z]))*((?=[\\p{L}\\p{M}])([^A-Z]))+(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?|[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}\\p{M}])([^a-z]))+((?=[\\p{L}\\p{M}])([^A-Z]))*(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
};
break;
case LLAMA_VOCAB_PRE_TYPE_TINY_AYA:
regex_exprs = {
// original regex from tokenizer.json: "\\d{1,3}(?=(?:\\d{3})*\\b)"
@@ -2142,7 +2151,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
tokenizer_pre == "jais-2") {
pre_type = LLAMA_VOCAB_PRE_TYPE_JAIS2;
} else if (
tokenizer_pre == "gemma4") {
tokenizer_pre == "gemma4" ||
tokenizer_pre == "granite-embed-multi-311m") {
pre_type = LLAMA_VOCAB_PRE_TYPE_GEMMA4;
escape_whitespaces = true;
} else if (
@@ -2252,6 +2262,11 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
tokenizer_pre == "talkie") {
pre_type = LLAMA_VOCAB_PRE_TYPE_GPT4O;
clean_spaces = false;
} else if (
tokenizer_pre == "granite-embed-multi-97m") {
pre_type = LLAMA_VOCAB_PRE_TYPE_GRANITE_EMB_MULTI;
clean_spaces = false;
ignore_merges = true;
} else if (
tokenizer_pre == "tiny_aya") {
pre_type = LLAMA_VOCAB_PRE_TYPE_TINY_AYA;

View File

@@ -8,60 +8,61 @@
// pre-tokenization types
enum llama_vocab_pre_type {
LLAMA_VOCAB_PRE_TYPE_DEFAULT = 0,
LLAMA_VOCAB_PRE_TYPE_LLAMA3 = 1,
LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM = 2,
LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER = 3,
LLAMA_VOCAB_PRE_TYPE_FALCON = 4,
LLAMA_VOCAB_PRE_TYPE_MPT = 5,
LLAMA_VOCAB_PRE_TYPE_STARCODER = 6,
LLAMA_VOCAB_PRE_TYPE_GPT2 = 7,
LLAMA_VOCAB_PRE_TYPE_REFACT = 8,
LLAMA_VOCAB_PRE_TYPE_COMMAND_R = 9,
LLAMA_VOCAB_PRE_TYPE_STABLELM2 = 10,
LLAMA_VOCAB_PRE_TYPE_QWEN2 = 11,
LLAMA_VOCAB_PRE_TYPE_OLMO = 12,
LLAMA_VOCAB_PRE_TYPE_DBRX = 13,
LLAMA_VOCAB_PRE_TYPE_SMAUG = 14,
LLAMA_VOCAB_PRE_TYPE_PORO = 15,
LLAMA_VOCAB_PRE_TYPE_CHATGLM3 = 16,
LLAMA_VOCAB_PRE_TYPE_CHATGLM4 = 17,
LLAMA_VOCAB_PRE_TYPE_VIKING = 18,
LLAMA_VOCAB_PRE_TYPE_JAIS = 19,
LLAMA_VOCAB_PRE_TYPE_TEKKEN = 20,
LLAMA_VOCAB_PRE_TYPE_SMOLLM = 21,
LLAMA_VOCAB_PRE_TYPE_CODESHELL = 22,
LLAMA_VOCAB_PRE_TYPE_BLOOM = 23,
LLAMA_VOCAB_PRE_TYPE_GPT3_FINNISH = 24,
LLAMA_VOCAB_PRE_TYPE_EXAONE = 25,
LLAMA_VOCAB_PRE_TYPE_CHAMELEON = 26,
LLAMA_VOCAB_PRE_TYPE_MINERVA = 27,
LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM = 28,
LLAMA_VOCAB_PRE_TYPE_GPT4O = 29,
LLAMA_VOCAB_PRE_TYPE_SUPERBPE = 30,
LLAMA_VOCAB_PRE_TYPE_TRILLION = 31,
LLAMA_VOCAB_PRE_TYPE_BAILINGMOE = 32,
LLAMA_VOCAB_PRE_TYPE_LLAMA4 = 33,
LLAMA_VOCAB_PRE_TYPE_PIXTRAL = 34,
LLAMA_VOCAB_PRE_TYPE_SEED_CODER = 35,
LLAMA_VOCAB_PRE_TYPE_HUNYUAN = 36,
LLAMA_VOCAB_PRE_TYPE_KIMI_K2 = 37,
LLAMA_VOCAB_PRE_TYPE_HUNYUAN_DENSE = 38,
LLAMA_VOCAB_PRE_TYPE_GROK_2 = 39,
LLAMA_VOCAB_PRE_TYPE_GRANITE_DOCLING = 40,
LLAMA_VOCAB_PRE_TYPE_MINIMAX_M2 = 41,
LLAMA_VOCAB_PRE_TYPE_AFMOE = 42,
LLAMA_VOCAB_PRE_TYPE_SOLAR_OPEN = 43,
LLAMA_VOCAB_PRE_TYPE_YOUTU = 44,
LLAMA_VOCAB_PRE_TYPE_EXAONE_MOE = 45,
LLAMA_VOCAB_PRE_TYPE_QWEN35 = 46,
LLAMA_VOCAB_PRE_TYPE_TINY_AYA = 47,
LLAMA_VOCAB_PRE_TYPE_JOYAI_LLM = 48,
LLAMA_VOCAB_PRE_TYPE_JAIS2 = 49,
LLAMA_VOCAB_PRE_TYPE_GEMMA4 = 50,
LLAMA_VOCAB_PRE_TYPE_SARVAM_MOE = 51,
LLAMA_VOCAB_PRE_TYPE_MINICPM5 = 52,
LLAMA_VOCAB_PRE_TYPE_WHITESPACE = 53,
LLAMA_VOCAB_PRE_TYPE_DEFAULT = 0,
LLAMA_VOCAB_PRE_TYPE_LLAMA3 = 1,
LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM = 2,
LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER = 3,
LLAMA_VOCAB_PRE_TYPE_FALCON = 4,
LLAMA_VOCAB_PRE_TYPE_MPT = 5,
LLAMA_VOCAB_PRE_TYPE_STARCODER = 6,
LLAMA_VOCAB_PRE_TYPE_GPT2 = 7,
LLAMA_VOCAB_PRE_TYPE_REFACT = 8,
LLAMA_VOCAB_PRE_TYPE_COMMAND_R = 9,
LLAMA_VOCAB_PRE_TYPE_STABLELM2 = 10,
LLAMA_VOCAB_PRE_TYPE_QWEN2 = 11,
LLAMA_VOCAB_PRE_TYPE_OLMO = 12,
LLAMA_VOCAB_PRE_TYPE_DBRX = 13,
LLAMA_VOCAB_PRE_TYPE_SMAUG = 14,
LLAMA_VOCAB_PRE_TYPE_PORO = 15,
LLAMA_VOCAB_PRE_TYPE_CHATGLM3 = 16,
LLAMA_VOCAB_PRE_TYPE_CHATGLM4 = 17,
LLAMA_VOCAB_PRE_TYPE_VIKING = 18,
LLAMA_VOCAB_PRE_TYPE_JAIS = 19,
LLAMA_VOCAB_PRE_TYPE_TEKKEN = 20,
LLAMA_VOCAB_PRE_TYPE_SMOLLM = 21,
LLAMA_VOCAB_PRE_TYPE_CODESHELL = 22,
LLAMA_VOCAB_PRE_TYPE_BLOOM = 23,
LLAMA_VOCAB_PRE_TYPE_GPT3_FINNISH = 24,
LLAMA_VOCAB_PRE_TYPE_EXAONE = 25,
LLAMA_VOCAB_PRE_TYPE_CHAMELEON = 26,
LLAMA_VOCAB_PRE_TYPE_MINERVA = 27,
LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM = 28,
LLAMA_VOCAB_PRE_TYPE_GPT4O = 29,
LLAMA_VOCAB_PRE_TYPE_SUPERBPE = 30,
LLAMA_VOCAB_PRE_TYPE_TRILLION = 31,
LLAMA_VOCAB_PRE_TYPE_BAILINGMOE = 32,
LLAMA_VOCAB_PRE_TYPE_LLAMA4 = 33,
LLAMA_VOCAB_PRE_TYPE_PIXTRAL = 34,
LLAMA_VOCAB_PRE_TYPE_SEED_CODER = 35,
LLAMA_VOCAB_PRE_TYPE_HUNYUAN = 36,
LLAMA_VOCAB_PRE_TYPE_KIMI_K2 = 37,
LLAMA_VOCAB_PRE_TYPE_HUNYUAN_DENSE = 38,
LLAMA_VOCAB_PRE_TYPE_GROK_2 = 39,
LLAMA_VOCAB_PRE_TYPE_GRANITE_DOCLING = 40,
LLAMA_VOCAB_PRE_TYPE_MINIMAX_M2 = 41,
LLAMA_VOCAB_PRE_TYPE_AFMOE = 42,
LLAMA_VOCAB_PRE_TYPE_SOLAR_OPEN = 43,
LLAMA_VOCAB_PRE_TYPE_YOUTU = 44,
LLAMA_VOCAB_PRE_TYPE_EXAONE_MOE = 45,
LLAMA_VOCAB_PRE_TYPE_QWEN35 = 46,
LLAMA_VOCAB_PRE_TYPE_TINY_AYA = 47,
LLAMA_VOCAB_PRE_TYPE_JOYAI_LLM = 48,
LLAMA_VOCAB_PRE_TYPE_JAIS2 = 49,
LLAMA_VOCAB_PRE_TYPE_GEMMA4 = 50,
LLAMA_VOCAB_PRE_TYPE_SARVAM_MOE = 51,
LLAMA_VOCAB_PRE_TYPE_MINICPM5 = 52,
LLAMA_VOCAB_PRE_TYPE_WHITESPACE = 53,
LLAMA_VOCAB_PRE_TYPE_GRANITE_EMB_MULTI = 54,
};
struct LLM_KV;

View File

@@ -14,6 +14,14 @@ void llama_model_modern_bert::load_arch_hparams(llama_model_loader & ml) {
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
// Some ModernBert derivatives (e.g. IBM Granite Embedding 97m R2) use
// SiLU/SwiGLU in the FFN instead of the default GELU/GeGLU.
hparams.llm_ffn_op = LLM_FFN_GEGLU;
std::string hidden_act;
if (ml.get_key(LLM_KV_HIDDEN_ACT, hidden_act, false)) {
hparams.llm_ffn_op = llm_ffn_op_type_from_string(hidden_act, LLM_FFN_GEGLU);
}
switch (hparams.n_layer) {
case 12:
type = LLM_TYPE_47M; break; // granite-embedding-small
@@ -144,7 +152,8 @@ llama_model_modern_bert::graph::graph(const llama_model & model, const llm_graph
NULL, NULL, NULL,
model.layers[il].ffn_down, NULL, NULL,
NULL,
LLM_FFN_GEGLU, LLM_FFN_SEQ, il);
hparams.llm_ffn_op,
LLM_FFN_SEQ, il);
// attentions bypass the intermediate layer
cur = ggml_add(ctx0, cur, ffn_inp);