Compare commits

..

3 Commits
b9565 ... b9568

Author SHA1 Message Date
Max Krasnyansky
7d2b45b4f7 mtp: support for gemma-4 E2B and E4B assistants (#24282)
* models: update converter to support smaller assistants

* models: add masked_embd tensors to gemma4-assist arch

* gemma-4: remove temp debug for conversion

* gemma-4-mtp: filter out masked_embedding tensors during conversion
2026-06-08 13:48:52 -07:00
Aldehir Rojas
42a0afd594 server : do not parse when flushing http headers (#24281) 2026-06-08 13:32:41 -05:00
Pascal
a66d50588b graph: guard iswa kq_mask on its own buffer (#24294)
A SWA-only draft head (e.g. StepFun MTP) leaves the base sub-cache
empty, so its kq_mask buffer stays null and asserts at load. Guard
each mask on its own buffer in set_input and can_reuse, base and swa.

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2026-06-08 19:20:28 +02:00
8 changed files with 50 additions and 4 deletions

View File

@@ -789,6 +789,16 @@ class Gemma4UnifiedModel(Gemma4Model):
class Gemma4AssistantModel(Gemma4Model):
model_arch = gguf.MODEL_ARCH.GEMMA4_ASSISTANT
@classmethod
def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None:
name, gen = item
if "masked_embedding" in name:
logger.debug(f"Skipping get tensor {name!r} in safetensors so that convert can end normally.")
return None
return super().filter_tensors(item)
def set_gguf_parameters(self):
super().set_gguf_parameters()
self.gguf_writer.add_embedding_length_out(self.hparams["backbone_hidden_size"])

View File

@@ -538,6 +538,8 @@ class VISION_PROJECTOR_TYPE(IntEnum):
class MODEL_TENSOR(IntEnum):
TOKEN_EMBD = auto()
TOKEN_EMBD_NORM = auto()
MASKED_EMBD_CENTROIDS= auto()
MASKED_EMBD_ORDERING = auto()
TOKEN_TYPES = auto()
POS_EMBD = auto()
OUTPUT = auto()
@@ -1087,6 +1089,8 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
MODEL_TENSOR.TOKEN_EMBD: "token_embd",
MODEL_TENSOR.TOKEN_EMBD_NORM: "token_embd_norm",
MODEL_TENSOR.TOKEN_TYPES: "token_types",
MODEL_TENSOR.MASKED_EMBD_CENTROIDS: "masked_embd_centroids",
MODEL_TENSOR.MASKED_EMBD_ORDERING: "masked_embd_ordering",
MODEL_TENSOR.POS_EMBD: "position_embd",
MODEL_TENSOR.OUTPUT_NORM: "output_norm",
MODEL_TENSOR.OUTPUT: "output",
@@ -2586,6 +2590,8 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
MODEL_ARCH.GEMMA4_ASSISTANT: [
MODEL_TENSOR.ROPE_FREQS,
MODEL_TENSOR.TOKEN_EMBD,
MODEL_TENSOR.MASKED_EMBD_CENTROIDS,
MODEL_TENSOR.MASKED_EMBD_ORDERING,
MODEL_TENSOR.OUTPUT_NORM,
MODEL_TENSOR.NEXTN_PROJ_PRE,
MODEL_TENSOR.NEXTN_PROJ_POST,

View File

@@ -37,6 +37,14 @@ class TensorNameMap:
"model.embed", # talkie
),
# Masked embeddings
MODEL_TENSOR.MASKED_EMBD_CENTROIDS: (
"masked_embedding.centroids", # gemma-4 E2B/E4B assistants
),
MODEL_TENSOR.MASKED_EMBD_ORDERING: (
"masked_embedding.token_ordering", # gemma-4 E2B/E4B assistants
),
# Token type embeddings
MODEL_TENSOR.TOKEN_TYPES: (
"embeddings.token_type_embeddings", # bert nomic-bert

View File

@@ -559,6 +559,8 @@ static const std::map<llm_tensor, const char *> LLM_TENSOR_NAMES = {
{ LLM_TENSOR_INDEXER_PROJ, "blk.%d.indexer.proj" },
{ LLM_TENSOR_INDEXER_ATTN_K, "blk.%d.indexer.attn_k" },
{ LLM_TENSOR_INDEXER_ATTN_Q_B, "blk.%d.indexer.attn_q_b" },
{ LLM_TENSOR_MASKED_EMBD_CENTROIDS, "masked_embd_centroids" },
{ LLM_TENSOR_MASKED_EMBD_ORDERING, "masked_embd_ordering" },
};
// declare information about the model weight tensors:
@@ -783,6 +785,8 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
// latent projections feed ggml_mul_mat, the buft probe must use MUL_MAT to keep them on GPU
{LLM_TENSOR_FFN_LATENT_DOWN, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
{LLM_TENSOR_FFN_LATENT_UP, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
{LLM_TENSOR_MASKED_EMBD_CENTROIDS, {LLM_TENSOR_LAYER_INPUT, GGML_OP_NONE}},
{LLM_TENSOR_MASKED_EMBD_ORDERING, {LLM_TENSOR_LAYER_INPUT, GGML_OP_NONE}},
};
LLM_KV::LLM_KV(llm_arch arch, const char * suffix) : arch(arch), suffix(suffix) {}

View File

@@ -566,8 +566,11 @@ enum llm_tensor {
LLM_TENSOR_NEXTN_HNORM,
LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD,
LLM_TENSOR_NEXTN_SHARED_HEAD_NORM,
LLM_TENSOR_MASKED_EMBD_CENTROIDS,
LLM_TENSOR_MASKED_EMBD_ORDERING,
};
enum llm_tensor_layer {
LLM_TENSOR_LAYER_INPUT,
LLM_TENSOR_LAYER_REPEATING,

View File

@@ -567,7 +567,10 @@ void llm_graph_input_attn_kv_iswa::set_input(const llama_ubatch * ubatch) {
mctx->get_base()->set_input_v_idxs(self_v_idxs, ubatch);
}
mctx->get_base()->set_input_kq_mask(self_kq_mask, ubatch, cparams.causal_attn);
// the kq mask guards on its own buffer: shared cells leave idxs unbacked while the mask stays live
if (self_kq_mask && self_kq_mask->buffer) {
mctx->get_base()->set_input_kq_mask(self_kq_mask, ubatch, cparams.causal_attn);
}
// swa tensors may not be allocated if there are no SWA attention layers
if (self_k_idxs_swa && self_k_idxs_swa->buffer) {
@@ -575,7 +578,9 @@ void llm_graph_input_attn_kv_iswa::set_input(const llama_ubatch * ubatch) {
mctx->get_swa()->set_input_v_idxs(self_v_idxs_swa, ubatch);
}
mctx->get_swa()->set_input_kq_mask(self_kq_mask_swa, ubatch, cparams.causal_attn);
if (self_kq_mask_swa && self_kq_mask_swa->buffer) {
mctx->get_swa()->set_input_kq_mask(self_kq_mask_swa, ubatch, cparams.causal_attn);
}
if (self_k_rot) {
mctx->get_base()->set_input_k_rot(self_k_rot);
@@ -607,7 +612,9 @@ bool llm_graph_input_attn_kv_iswa::can_reuse(const llm_graph_params & params) {
//res &= self_v_idxs->ne[0] == params.ubatch.n_tokens; // TODO: need to move this to the unified cache and check there
}
res &= can_reuse_kq_mask(self_kq_mask, mctx->get_base(), params.ubatch, params.cparams);
if (self_kq_mask && self_kq_mask->buffer) {
res &= can_reuse_kq_mask(self_kq_mask, mctx->get_base(), params.ubatch, params.cparams);
}
// swa tensors may not be allocated if there are no SWA attention layers
if (self_k_idxs_swa && self_k_idxs_swa->buffer) {
@@ -615,7 +622,9 @@ bool llm_graph_input_attn_kv_iswa::can_reuse(const llm_graph_params & params) {
//res &= self_v_idxs_swa->ne[0] == params.ubatch.n_tokens; // TODO: need to move this to the unified cache and check there
}
res &= can_reuse_kq_mask(self_kq_mask_swa, mctx->get_swa(), params.ubatch, params.cparams);
if (self_kq_mask_swa && self_kq_mask_swa->buffer) {
res &= can_reuse_kq_mask(self_kq_mask_swa, mctx->get_swa(), params.ubatch, params.cparams);
}
return res;
}

View File

@@ -39,6 +39,9 @@ void llama_model_gemma4_assistant::load_arch_tensors(llama_model_loader &) {
output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
create_tensor(tn(LLM_TENSOR_MASKED_EMBD_CENTROIDS, "weight"), {}, TENSOR_NOT_REQUIRED);
create_tensor(tn(LLM_TENSOR_MASKED_EMBD_ORDERING), {}, TENSOR_NOT_REQUIRED);
const int64_t n_embd_backbone = hparams.n_embd_inp();
nextn_proj_post = create_tensor(tn(LLM_TENSOR_NEXTN_PROJ_POST, "weight"), { n_embd, n_embd_backbone }, 0);

View File

@@ -1393,6 +1393,9 @@ json server_task_result_cmpl_final::to_json_anthropic_stream() {
//
void server_task_result_cmpl_partial::update(task_result_state & state) {
is_updated = true;
if (is_begin) {
return; // begin marker only flushes headers, skip parsing
}
state.update_chat_msg(content, true, oaicompat_msg_diffs);
// Copy current state for use in to_json_*() (reflects state BEFORE this chunk)