Compare commits

...

2 Commits
b9010 ... b9012

Author SHA1 Message Date
Julien Denize
048a490f76 convert : Mistral format yarn apply_scale support (#22612)
* [BUGFIX] Mistral format apply_scale support.

* Update convert_hf_to_gguf.py

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* fix misunderstood boolean parameters

---------

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
2026-05-03 21:51:21 +02:00
JM Robles
db44417b02 convert : apply Q/K RoPE permutation in NVFP4 repack path (#22611)
Llama-architecture q_proj/k_proj weights need an axis-0 row permutation
to match GGML's RoPE convention. The BF16 path applies this in
LlamaModel.modify_tensors via LlamaModel.permute, but the NVFP4 path
bypasses modify_tensors and writes weights directly through
ModelBase._repack_nvfp4. Without the permutation, attention heads end
up scrambled at inference and the model produces gibberish.

This change overrides _repack_nvfp4 on LlamaModel and applies the same
permutation to both the nibble-packed weight and the per-block scale
before delegating to ModelBase._repack_nvfp4 via super(). Reuses the
existing LlamaModel.permute static helper and respects the existing
undo_permute flag, so subclasses (Mistral, Granite, Llama4, etc.)
inherit the fix automatically.

Verified on TinyLlama-1.1B reproducer: perplexity drops from 4419
(gibberish) to 43.9, matching the BF16-dequantized baseline (44.0).
Also verified end-to-end on ALIA-40b-instruct-2601 (BSC, Llama
architecture) with multilingual generation in Spanish/Catalan/Basque/
Galician all coherent with the fix applied.

Co-authored-by: Chema <chema@montevive.ai>
2026-05-03 18:22:00 +03:00
2 changed files with 18 additions and 3 deletions

View File

@@ -2889,6 +2889,20 @@ class LlamaModel(TextModel):
.swapaxes(1, 2)
.reshape(weights.shape))
def _repack_nvfp4(self, name: str, weight: Tensor, scale: Tensor, scale2: Tensor, input_scale: Tensor):
# Mirror the BF16 Q/K RoPE permutation site in modify_tensors; the NVFP4 path bypasses it.
if self.undo_permute:
n_head = self.find_hparam(["n_heads", "num_attention_heads"], optional=True)
n_kv_head = self.find_hparam(["n_kv_heads", "num_key_value_heads"], optional=True)
if n_head is not None:
if name.endswith("q_proj.weight"):
weight = LlamaModel.permute(weight, n_head, n_head)
scale = LlamaModel.permute(scale, n_head, n_head)
elif name.endswith("k_proj.weight"):
weight = LlamaModel.permute(weight, n_head, n_kv_head)
scale = LlamaModel.permute(scale, n_head, n_kv_head)
super()._repack_nvfp4(name, weight, scale, scale2, input_scale)
_experts: list[dict[str, Tensor]] | None = None
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
@@ -12702,11 +12716,12 @@ class MistralModel(LlamaModel):
def set_mistral_config(gguf_writer: gguf.GGUFWriter, hparams: dict):
if "yarn" in hparams:
yarn_params = hparams["yarn"]
mscale_all_dim = 1.0 if not yarn_params["apply_scale"] else 0.0
gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
gguf_writer.add_rope_scaling_factor(yarn_params["factor"])
gguf_writer.add_rope_scaling_yarn_beta_fast(yarn_params["beta"])
gguf_writer.add_rope_scaling_yarn_beta_slow(yarn_params["alpha"])
gguf_writer.add_rope_scaling_yarn_log_mul(1.0) # mscale_all_dim
gguf_writer.add_rope_scaling_yarn_log_mul(mscale_all_dim)
gguf_writer.add_rope_scaling_orig_ctx_len(yarn_params["original_max_position_embeddings"])
if "llama_4_scaling" in hparams:

View File

@@ -1994,7 +1994,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
}
}
if (ml.get_key(LLM_KV_ROPE_SCALING_YARN_LOG_MUL, hparams.rope_yarn_log_mul, 0.0f)) {
if (ml.get_key(LLM_KV_ROPE_SCALING_YARN_LOG_MUL, hparams.rope_yarn_log_mul, false)) {
// [TAG_DEEPSEEK2_YARN_LOG_MUL_FIX]
// cancel the factor from the convert script
hparams.rope_yarn_log_mul /= 0.1f;
@@ -2868,7 +2868,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
ml.get_key(LLM_KV_ROPE_SCALING_YARN_BETA_FAST, hparams.yarn_beta_fast, false);
ml.get_key(LLM_KV_ROPE_SCALING_YARN_BETA_SLOW, hparams.yarn_beta_slow, false);
ml.get_key(LLM_KV_ROPE_SCALING_YARN_LOG_MUL, hparams.rope_yarn_log_mul, 0.0f);
ml.get_key(LLM_KV_ROPE_SCALING_YARN_LOG_MUL, hparams.rope_yarn_log_mul, false);
hparams.f_attn_temp_offset = 0.0f;