Compare commits

...

2 Commits
b9510 ... b9512

Author SHA1 Message Date
forforever73
0dbfa66a1f return filter to save memory (#24125)
Co-authored-by: lvyichen <lvyichen@stepfun.com>
2026-06-04 15:56:33 +02:00
Pedro Cuenca
e8023568d0 convert: Fix Gemma 4 Unified conversion (#24118)
* Fix Gemma 4 Unified conversion

* Set audio hidden size to audio_embed_dim
2026-06-04 15:21:38 +02:00
2 changed files with 20 additions and 4 deletions

View File

@@ -798,7 +798,8 @@ class Gemma4VisionAudioModel(MmprojModel):
# remap audio hparams
if self.hparams_audio:
self.hparams_audio["feat_in"] = self.hparams_audio.get("input_feat_size", 128)
-self.hparams_audio["intermediate_size"] = self.hparams_audio["hidden_size"] * 4
+if "hidden_size" in self.hparams_audio:
+self.hparams_audio["intermediate_size"] = self.hparams_audio["hidden_size"] * 4
else:
self.has_audio_encoder = False
@@ -872,7 +873,7 @@ class Gemma4UnifiedVisionAudioModel(Gemma4VisionAudioModel):
assert self.hparams_audio is not None
text_embd_dim = self.hparams_vision["mm_embed_dim"]
self.hparams_vision["hidden_size"] = text_embd_dim
-self.hparams_audio["hidden_size"] = text_embd_dim
+self.hparams_audio["hidden_size"] = self.hparams_audio["audio_embed_dim"]
# this is a transformer-less vision tower, the params below are redundant but set to avoid error
self.hparams_vision["intermediate_size"] = 0
self.hparams_vision["num_layers"] = 0
@@ -897,7 +898,10 @@ class Gemma4UnifiedVisionAudioModel(Gemma4VisionAudioModel):
# ggml im2col outputs in RR..GG..BB.. (CHW) order, but weight expects RGBRGB.. (HWC).
# Permute columns so column i aligns with CHW input position i.
assert self.hparams_vision is not None
-p = self.hparams_vision["model_patch_size"]
+if "model_patch_size" in self.hparams_vision:
+p = self.hparams_vision["model_patch_size"]
+else:
+p = self.hparams_vision["patch_size"] * self.hparams_vision["pooling_kernel_size"]
i = torch.arange(p * p * 3)
ch = i // (p * p)
row = (i % (p * p)) // p
@@ -908,7 +912,10 @@ class Gemma4UnifiedVisionAudioModel(Gemma4VisionAudioModel):
elif "patch_ln1.weight" in name or "patch_ln1.bias" in name:
# same permutation for patch_ln1 as patch_dense to align with CHW input order
assert self.hparams_vision is not None
-p = self.hparams_vision["model_patch_size"]
+if "model_patch_size" in self.hparams_vision:
+p = self.hparams_vision["model_patch_size"]
+else:
+p = self.hparams_vision["patch_size"] * self.hparams_vision["pooling_kernel_size"]
i = torch.arange(p * p * 3)
ch = i // (p * p)
row = (i % (p * p)) // p

View File

@@ -2112,6 +2112,15 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
filter = [n_main](int32_t il) { return (uint32_t)il >= n_main; };
}
+if (arch == LLM_ARCH_STEP35 && hparams.nextn_predict_layers > 0) {
+const uint32_t n_main = hparams.n_layer - hparams.nextn_predict_layers;
+if (params.ctx_type == LLAMA_CONTEXT_TYPE_MTP) {
+filter = [n_main](int32_t il) { return (uint32_t)il >= n_main; };
+} else {
+filter = [n_main](int32_t il) { return (uint32_t)il < n_main; };
+}
+}
if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) {
GGML_ASSERT(hparams.is_swa_any());