mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2026-06-06 11:03:01 +02:00
Compare commits
3 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
c4a278d68e | ||
|
|
64086f2b2f | ||
|
|
6effcecd0b |
@@ -253,6 +253,7 @@ MMPROJ_MODEL_MAP: dict[str, str] = {
|
||||
"Glm4vMoeForConditionalGeneration": "qwen3vl",
|
||||
"GlmOcrForConditionalGeneration": "qwen3vl",
|
||||
"GlmasrModel": "ultravox",
|
||||
"Granite4VisionForConditionalGeneration": "granite",
|
||||
"GraniteSpeechForConditionalGeneration": "granite",
|
||||
"HunYuanVLForConditionalGeneration": "hunyuan",
|
||||
"Idefics3ForConditionalGeneration": "smolvlm",
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
from typing import Any, Callable, Iterable, TYPE_CHECKING
|
||||
|
||||
import torch
|
||||
@@ -13,7 +14,7 @@ from .llama import LlamaModel
|
||||
from .mamba import Mamba2Model
|
||||
|
||||
|
||||
@ModelBase.register("GraniteForCausalLM", "GraniteSpeechForConditionalGeneration")
|
||||
@ModelBase.register("GraniteForCausalLM")
|
||||
class GraniteModel(LlamaModel):
|
||||
"""Conversion for IBM's GraniteForCausalLM"""
|
||||
model_arch = gguf.MODEL_ARCH.GRANITE
|
||||
@@ -46,11 +47,29 @@ class GraniteModel(LlamaModel):
|
||||
self.gguf_writer.add_logit_scale(logits_scale)
|
||||
logger.info("gguf: (granite) logits_scale = %s", logits_scale)
|
||||
|
||||
# If being used as the base for Granite4 Vision, add deepstack_layer_arr
|
||||
if self.hparams.get("spatial_target_layers") or self.hparams.get("deepstack_layer_map"):
|
||||
normalized_projector_map = Granite4VisionMmprojModel.get_normalized_projector_map(self.hparams)
|
||||
deepstack_mapping_arr = [-1 for _ in range(self.block_count)] # Populate with -1 sentinels
|
||||
for proj_idx, (_, llm_layer, _, _) in enumerate(normalized_projector_map):
|
||||
# Skip the first projector which is handled as the base embedding
|
||||
# stream like normal
|
||||
if proj_idx == 0:
|
||||
continue
|
||||
deepstack_mapping_arr[llm_layer] = proj_idx
|
||||
self.gguf_writer.add_deepstack_mapping(deepstack_mapping_arr)
|
||||
|
||||
@classmethod
def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None:
    """Drop multimodal tensors so only the remaining weights are converted here.

    Args:
        item: ``(tensor_name, loader)`` pair; only the name is inspected.

    Returns:
        ``None`` when the name starts with ``encoder.`` or contains one of
        the multimodal markers below; otherwise whatever the parent class'
        ``filter_tensors`` returns for ``item``.
    """
    name, _ = item

    # Skip multimodal tensors
    multimodal_markers = ("image_", "layerwise_projectors", "spatial_projectors")
    if name.startswith("encoder.") or any(marker in name for marker in multimodal_markers):
        return None

    return super().filter_tensors(item)
|
||||
|
||||
|
||||
@@ -241,7 +260,8 @@ class GraniteHybridModel(Mamba2Model, GraniteMoeModel):
|
||||
assert self.d_inner % d_head == 0, f"SSM inner size {self.d_inner} not a multiple of head dim {d_head}"
|
||||
|
||||
def set_vocab(self):
    """Choose the vocab padding multiple, then delegate to ``Mamba2Model.set_vocab``.

    ``pad_vocab_size_multiple`` is written into ``self.hparams`` before the
    delegate call: 8 when ``self._ssm_layers`` is truthy, 1 otherwise.
    """
    # For models with no ssm layers, don't pad for mamba2
    self.hparams["pad_vocab_size_multiple"] = 8 if self._ssm_layers else 1
    Mamba2Model.set_vocab(self)
|
||||
|
||||
|
||||
@@ -326,3 +346,133 @@ class GraniteSpeechMmprojModel(MmprojModel):
|
||||
data_torch = data_torch.squeeze(1)
|
||||
|
||||
yield from super().modify_tensors(data_torch, name, bid)
|
||||
|
||||
|
||||
@ModelBase.register("Granite4VisionForConditionalGeneration")
class Granite4VisionMmprojModel(MmprojModel):
    """mmproj conversion for ``Granite4VisionForConditionalGeneration`` checkpoints.

    The checkpoint carries several projectors ("layerwise" and "spatial").
    They are flattened into one ordered list (see
    ``get_normalized_projector_map``); the position of a projector in that
    list is used as the block id (``bid``) of its tensors.
    """

    has_vision_encoder = True
    has_audio_encoder = False

    @staticmethod
    def get_normalized_projector_map(global_config: dict) -> list[tuple[int, int, str, int]]:
        """Normalize both deepstack and spatial projector maps to a single list.

        Each entry has the form ``(vision_layer, llm_layer, <type>, type_index)``
        where ``<type>`` is ``"layerwise"`` or ``"spatial"`` and ``type_index``
        is the projector's index within its own type. Negative layer indices
        are resolved against the corresponding ``num_hidden_layers``.

        The result is used to populate:
          - vision feature layers (mmproj hparam): ordered list of all
            ``vision_layer`` values, in the same order as the stacked
            projector tensors.
            NOTE: values may appear multiple times for spatial projectors.
          - the tensor prefix map (mmproj tensors): mapping from tensor name
            prefixes to the index of the projector in the stacked tensors.
          - the deepstack mapping (llm hparam): per-text-layer array
            indicating which projector output is injected at that layer
            (-1 if none).

        Args:
            global_config: model config dict. Reads ``deepstack_layer_map``
                (``[[vis_layer, llm_layer], ...]``), ``spatial_target_layers``
                (``[llm_layer, ...]``), ``spatial_vision_layer`` (default -1),
                and ``num_hidden_layers`` of ``text_config`` / ``vision_config``.

        Returns:
            Entries sorted by ``llm_layer``; the sort is stable, so entries
            for the same layer keep layerwise-before-spatial order.
        """
        deepstack_map = global_config.get("deepstack_layer_map", [])  # [[vis_layer, llm_layer], ...]
        spatial_layers = global_config.get("spatial_target_layers", [])  # [llm_layer, ...]
        n_text_layers = global_config["text_config"]["num_hidden_layers"]
        n_vision_layers = global_config["vision_config"]["num_hidden_layers"]

        normalized_projector_map: list[tuple[int, int, str, int]] = []

        if deepstack_map:
            for deepstack_idx, (vision_layer, llm_layer) in enumerate(sorted(deepstack_map)):
                if vision_layer < 0:
                    vision_layer = n_vision_layers + vision_layer
                if llm_layer < 0:
                    llm_layer = n_text_layers + llm_layer
                normalized_projector_map.append((vision_layer, llm_layer, "layerwise", deepstack_idx))

        if spatial_layers:
            spatial_vision_layer = global_config.get("spatial_vision_layer", -1)
            if spatial_vision_layer < 0:
                spatial_vision_layer = n_vision_layers + spatial_vision_layer
            for spatial_idx, llm_layer in enumerate(spatial_layers):
                # Resolve negative targets the same way as for deepstack
                # entries; otherwise they would sort ahead of every real
                # layer and end up with the wrong projector index.
                if llm_layer < 0:
                    llm_layer = n_text_layers + llm_layer
                normalized_projector_map.append((spatial_vision_layer, llm_layer, "spatial", spatial_idx))

        return sorted(normalized_projector_map, key=lambda entry: entry[1])

    def __init__(self, *args, **kwargs):
        """Build the per-projector lookup tables from ``self.global_config``."""
        super().__init__(*args, **kwargs)
        normalized_projector_map = self.get_normalized_projector_map(self.global_config)
        self._n_proj = len(normalized_projector_map)

        # checkpoint tensor-name prefix -> index in the flattened projector list
        self._tensor_prefix_map = {
            f"model.{proj_type}_projectors.{type_idx}": proj_idx
            for proj_idx, (_, _, proj_type, type_idx) in enumerate(normalized_projector_map)
        }
        # vision encoder layer feeding each projector (same order as above)
        self._vision_feature_layers = [vision_layer for vision_layer, _, _, _ in normalized_projector_map]
        # per projector: index within the spatial projectors, or -1 for layerwise ones
        self._spatial_offsets = [
            type_idx if proj_type == "spatial" else -1
            for _, _, proj_type, type_idx in normalized_projector_map
        ]

    def set_gguf_parameters(self):
        """Write the Granite 4 Vision mmproj metadata on top of the parent's."""
        assert self.hparams_vision is not None
        super().set_gguf_parameters()

        self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.GRANITE4_VISION)

        # SigLIP encoder hparams
        self.gguf_writer.add_vision_attention_layernorm_eps(self.hparams.get("layer_norm_eps", 1e-6))
        self.gguf_writer.add_vision_use_gelu(True)

        # Preprocessor
        self.gguf_writer.add_vision_preproc_image_size(self.hparams.get("image_size", 384))

        # QFormer projector config: "downsample_rate" is "<query_side>/<window_side>"
        ds_rate = self.global_config["downsample_rate"]
        ds_parts = ds_rate.split("/")
        assert len(ds_parts) == 2, f"Invalid 'downsample_rate' value: {ds_rate}"
        query_side, window_side = [int(p) for p in ds_parts]
        self.gguf_writer.add_vision_projector_query_side(query_side)
        self.gguf_writer.add_vision_projector_window_side(window_side)

        # Set vision feature layers
        self.gguf_writer.add_vision_feature_layers(self._vision_feature_layers)

        # Set the spatial offsets per projector
        self.gguf_writer.add_vision_spatial_offsets(self._spatial_offsets)

        # Add flattened image grid pinpoints (resolution candidates internally)
        if pinpoints := self.global_config.get("image_grid_pinpoints"):
            # Flatten with h, w -> w, h inversion
            pinpoints = [val for h, w in pinpoints for val in (w, h)]
            self.gguf_writer.add_vision_image_grid_pinpoints(pinpoints)

    @classmethod
    def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None:
        """Drop ``vision_model.head`` and ``lm_head`` tensors; defer the rest to the parent."""
        name, _ = item
        if "vision_model.head" in name or name.startswith("lm_head"):
            return None
        return super().filter_tensors(item)

    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
        """Rename projector tensors so their block id is the flattened projector index.

        Tensors that do not belong to a projector are passed to the parent
        unchanged. For projector tensors the per-type index in the name is
        replaced (one numeric id) or removed in favour of the nested id slot
        (two numeric ids), and the parent is called with ``bid`` set to the
        flattened projector index.

        Raises:
            ValueError: if a projector tensor carries a non-zero nested index,
                which the single-``bid`` naming scheme cannot represent.
        """
        # Detect projector tensors and bin them. The "." after the prefix
        # keeps projector 1 from also claiming projectors 10, 11, ...
        projector_idx = next(
            (
                proj_idx
                for prefix, proj_idx in self._tensor_prefix_map.items()
                if name.startswith(prefix + ".")
            ),
            None,
        )
        if projector_idx is None:
            yield from super().modify_tensors(data_torch, name, bid)
            return

        # If this projector tensor has a block id within the projector,
        # alias the bid to projector_idx
        #
        # TODO: currently, none of the Granite 4 Vision models have
        #   projectors with multiple QFormer layers, so the `layer.{}` index
        #   is always 0. This allows us to simply map to a single `bid` that
        #   matches the projector index. If this changes, we'll need a
        #   convention that merges the two IDs.
        id_matches = list(re.finditer(r"\.([0-9]+)\.", name))
        assert 1 <= len(id_matches) <= 2, "Must have at least 1 and at most 2 ids in tensor names"

        proj_match = id_matches[0]
        if len(id_matches) == 1:
            # Only the projector id is present: swap it for the flattened index
            new_name = name[:proj_match.start(1)] + str(projector_idx) + name[proj_match.end(1):]
        else:
            nested_match = id_matches[1]
            if int(nested_match.group(1)) != 0:
                # Fail loudly instead of silently folding several nested
                # blocks of one projector onto the same output tensor name.
                raise ValueError(
                    f"Unsupported non-zero nested index in projector tensor {name!r}"
                )
            # Drop the projector id segment and put the flattened index in
            # the nested id slot
            new_name = (
                name[:proj_match.start(0)]
                + name[proj_match.end(1):nested_match.start(1)]
                + str(projector_idx)
                + name[nested_match.end(1):]
            )

        yield from super().modify_tensors(data_torch, new_name, projector_idx)
|
||||
|
||||
@@ -311,6 +311,10 @@ def parse_args() -> argparse.Namespace:
|
||||
"--base-model-id", type=str,
|
||||
help="the model ID of the base model, if it is not available locally or in the adapter config. If specified, it will ignore --base and load the base model config from the Hugging Face hub (Example: 'meta-llama/Llama-3.2-1B-Instruct')",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--trust-remote-code", default=False, action="store_true",
|
||||
help="trust remote code in the model",
|
||||
)
|
||||
parser.add_argument(
|
||||
"lora_path", type=Path,
|
||||
help="directory containing Hugging Face PEFT LoRA config (adapter_model.json) and weights (adapter_model.safetensors or adapter_model.bin)",
|
||||
@@ -319,11 +323,11 @@ def parse_args() -> argparse.Namespace:
|
||||
return parser.parse_args()
|
||||
|
||||
|
||||
def load_hparams_from_hf(hf_model_id: str) -> tuple[dict[str, Any], Path | None]:
|
||||
def load_hparams_from_hf(hf_model_id: str, trust_remote_code: bool) -> tuple[dict[str, Any], Path | None]:
|
||||
from huggingface_hub import try_to_load_from_cache
|
||||
|
||||
# normally, adapter does not come with base model config, we need to load it from AutoConfig
|
||||
config = AutoConfig.from_pretrained(hf_model_id)
|
||||
config = AutoConfig.from_pretrained(hf_model_id, trust_remote_code=trust_remote_code)
|
||||
cache_dir = try_to_load_from_cache(hf_model_id, "config.json")
|
||||
cache_dir = Path(cache_dir).parent if isinstance(cache_dir, str) else None
|
||||
|
||||
@@ -372,13 +376,13 @@ if __name__ == '__main__':
|
||||
# load base model
|
||||
if base_model_id is not None:
|
||||
logger.info(f"Loading base model from Hugging Face: {base_model_id}")
|
||||
hparams, dir_base_model = load_hparams_from_hf(base_model_id)
|
||||
hparams, dir_base_model = load_hparams_from_hf(base_model_id, args.trust_remote_code)
|
||||
elif dir_base_model is None:
|
||||
if "base_model_name_or_path" in lparams:
|
||||
model_id = lparams["base_model_name_or_path"]
|
||||
logger.info(f"Loading base model from Hugging Face: {model_id}")
|
||||
try:
|
||||
hparams, dir_base_model = load_hparams_from_hf(model_id)
|
||||
hparams, dir_base_model = load_hparams_from_hf(model_id, args.trust_remote_code)
|
||||
except OSError as e:
|
||||
logger.error(f"Failed to load base model config: {e}")
|
||||
logger.error("Please try downloading the base model and add its path to --base")
|
||||
@@ -393,7 +397,9 @@ if __name__ == '__main__':
|
||||
|
||||
with torch.inference_mode():
|
||||
# Prefer the architecture list nested under "text_config" when it is
# present and non-empty, otherwise fall back to the top-level list.
# NOTE(review): a nested "architectures" may be serialized as null in some
# configs -- `or` covers that as well as a missing key; confirm upstream.
text_archs = (hparams.get("text_config") or {}).get("architectures")
model_arch = (text_archs or hparams["architectures"])[0]
logger.info("Using model architecture: %s", model_arch)
try:
    model_class = get_model_class(model_arch)
except NotImplementedError:
    # Report the architecture that was actually looked up
    logger.error(f"Model {model_arch} is not supported")
    sys.exit(1)
|
||||
|
||||
@@ -128,6 +128,7 @@ class Keys:
|
||||
MOE_LATENT_SIZE = "{arch}.moe_latent_size"
|
||||
NEXTN_PREDICT_LAYERS = "{arch}.nextn_predict_layers"
|
||||
NUM_DEEPSTACK_LAYERS = "{arch}.n_deepstack_layers"
|
||||
DEEPSTACK_MAPPING = "{arch}.deepstack_mapping"
|
||||
POOLING_TYPE = "{arch}.pooling_type"
|
||||
LOGIT_SCALE = "{arch}.logit_scale"
|
||||
DECODER_START_TOKEN_ID = "{arch}.decoder_start_token_id"
|
||||
@@ -325,6 +326,8 @@ class Keys:
|
||||
WA_PATTERN_MODE = "clip.vision.wa_pattern_mode" # used by mimovl, per-layer -1/0/1
|
||||
IS_DEEPSTACK_LAYERS = "clip.vision.is_deepstack_layers"
|
||||
WINDOW_SIZE = "clip.vision.window_size"
|
||||
FEATURE_LAYERS = "clip.vision.feature_layer" # Granite4 Vision
|
||||
IMAGE_GRID_PINPOINTS = "clip.vision.image_grid_pinpoints" # Granite4 Vision
|
||||
|
||||
class Attention:
|
||||
HEAD_COUNT = "clip.vision.attention.head_count"
|
||||
@@ -333,6 +336,9 @@ class Keys:
|
||||
|
||||
class Projector:
|
||||
SCALE_FACTOR = "clip.vision.projector.scale_factor"
|
||||
QUERY_SIDE = "clip.vision.projector.query_side"
|
||||
WINDOW_SIDE = "clip.vision.projector.window_side"
|
||||
SPATIAL_OFFSETS = "clip.vision.projector.spatial_offsets"
|
||||
|
||||
class SAM:
|
||||
BLOCK_COUNT = "clip.vision.sam.block_count"
|
||||
@@ -821,6 +827,31 @@ class MODEL_TENSOR(IntEnum):
|
||||
V_RESMPL_QUERY_768 = auto() # Deepseek-OCR-2
|
||||
V_RESMPL_QUERY_1024 = auto() # Deepseek-OCR-2
|
||||
|
||||
# qformer projector (vision) - Granite4 Vision
|
||||
V_QF_PROJ_QUERY = auto()
|
||||
V_QF_PROJ_NORM = auto()
|
||||
V_QF_PROJ_LINEAR = auto()
|
||||
V_QF_SELF_ATTN_Q = auto()
|
||||
V_QF_SELF_ATTN_K = auto()
|
||||
V_QF_SELF_ATTN_V = auto()
|
||||
V_QF_SELF_ATTN_O = auto()
|
||||
V_QF_SELF_ATTN_NORM = auto()
|
||||
V_QF_CROSS_ATTN_Q = auto()
|
||||
V_QF_CROSS_ATTN_K = auto()
|
||||
V_QF_CROSS_ATTN_V = auto()
|
||||
V_QF_CROSS_ATTN_O = auto()
|
||||
V_QF_CROSS_ATTN_NORM = auto()
|
||||
V_QF_FFN_UP = auto()
|
||||
V_QF_FFN_DOWN = auto()
|
||||
V_QF_FFN_NORM = auto()
|
||||
V_PROJ_NORM = auto()
|
||||
# multi-projector (bid => projector id) - Granite4 vision
|
||||
V_MULTI_PROJ_IMG_POS = auto()
|
||||
V_MULTI_PROJ_QUERY = auto()
|
||||
V_MULTI_PROJ_NORM = auto()
|
||||
V_MULTI_PROJ_LINEAR = auto()
|
||||
V_MULTI_PROJ_POST_NORM = auto()
|
||||
|
||||
# audio (mtmd)
|
||||
A_ENC_EMBD_POS = auto()
|
||||
A_ENC_EMBD_NORM = auto()
|
||||
@@ -885,7 +916,7 @@ class MODEL_TENSOR(IntEnum):
|
||||
A_CTC_OUT = auto()
|
||||
A_CTC_OUT_MID = auto()
|
||||
A_ENC_ATTN_REL_POS_EMB = auto()
|
||||
# qformer projector
|
||||
# audio qformer projector
|
||||
A_QF_PROJ_QUERY = auto()
|
||||
A_QF_PROJ_NORM = auto()
|
||||
A_QF_PROJ_LINEAR = auto()
|
||||
@@ -1337,10 +1368,33 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
|
||||
MODEL_TENSOR.V_SAM_NECK: "v.sam.neck.{bid}",
|
||||
MODEL_TENSOR.V_SAM_NET_2: "v.sam.net_2",
|
||||
MODEL_TENSOR.V_SAM_NET_3: "v.sam.net_3",
|
||||
MODEL_TENSOR.V_ENC_EMBD_IMGNL: "v.image_newline", # Deepseek-OCR
|
||||
MODEL_TENSOR.V_ENC_EMBD_IMGNL: "v.image_newline", # Deepseek-OCR, Granite4Vision
|
||||
MODEL_TENSOR.V_ENC_EMBD_VSEP: "v.view_seperator", # Deepseek-OCR
|
||||
MODEL_TENSOR.V_RESMPL_QUERY_768: "v.resample_query_768", # Deepseek-OCR-2 qwen2
|
||||
MODEL_TENSOR.V_RESMPL_QUERY_1024: "v.resample_query_1024", # Deepseek-OCR-2 qwen2
|
||||
# Granite4 Vision
|
||||
# qformer layers (bid => proj_id)
|
||||
# NOTE: Names align with A_QF_*
|
||||
MODEL_TENSOR.V_QF_SELF_ATTN_Q: "v.proj_blk.{bid}.self_attn_q",
|
||||
MODEL_TENSOR.V_QF_SELF_ATTN_K: "v.proj_blk.{bid}.self_attn_k",
|
||||
MODEL_TENSOR.V_QF_SELF_ATTN_V: "v.proj_blk.{bid}.self_attn_v",
|
||||
MODEL_TENSOR.V_QF_SELF_ATTN_O: "v.proj_blk.{bid}.self_attn_out",
|
||||
MODEL_TENSOR.V_QF_SELF_ATTN_NORM: "v.proj_blk.{bid}.self_attn_norm",
|
||||
MODEL_TENSOR.V_QF_CROSS_ATTN_Q: "v.proj_blk.{bid}.cross_attn_q",
|
||||
MODEL_TENSOR.V_QF_CROSS_ATTN_K: "v.proj_blk.{bid}.cross_attn_k",
|
||||
MODEL_TENSOR.V_QF_CROSS_ATTN_V: "v.proj_blk.{bid}.cross_attn_v",
|
||||
MODEL_TENSOR.V_QF_CROSS_ATTN_O: "v.proj_blk.{bid}.cross_attn_out",
|
||||
MODEL_TENSOR.V_QF_CROSS_ATTN_NORM: "v.proj_blk.{bid}.cross_attn_norm",
|
||||
MODEL_TENSOR.V_QF_FFN_UP: "v.proj_blk.{bid}.ffn_up",
|
||||
MODEL_TENSOR.V_QF_FFN_DOWN: "v.proj_blk.{bid}.ffn_down",
|
||||
MODEL_TENSOR.V_QF_FFN_NORM: "v.proj_blk.{bid}.ffn_norm",
|
||||
# multi-projector (bid => projector ID)
|
||||
MODEL_TENSOR.V_MULTI_PROJ_IMG_POS: "v.proj_blk.{bid}.img_pos",
|
||||
MODEL_TENSOR.V_MULTI_PROJ_QUERY: "v.proj_blk.{bid}.query",
|
||||
MODEL_TENSOR.V_MULTI_PROJ_NORM: "v.proj_blk.{bid}.norm",
|
||||
MODEL_TENSOR.V_MULTI_PROJ_LINEAR: "v.proj_blk.{bid}.linear",
|
||||
MODEL_TENSOR.V_MULTI_PROJ_POST_NORM: "v.proj_blk.{bid}.post_norm",
|
||||
|
||||
# audio (mtmd)
|
||||
# note: all audio tensor names must use prefix "a." or "mm.a."
|
||||
MODEL_TENSOR.A_ENC_EMBD_POS: "a.position_embd",
|
||||
@@ -1522,6 +1576,29 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
|
||||
MODEL_TENSOR.V_SAM_NET_3,
|
||||
MODEL_TENSOR.V_RESMPL_QUERY_768,
|
||||
MODEL_TENSOR.V_RESMPL_QUERY_1024,
|
||||
MODEL_TENSOR.V_PROJ_NORM,
|
||||
MODEL_TENSOR.V_QF_PROJ_QUERY,
|
||||
MODEL_TENSOR.V_QF_PROJ_NORM,
|
||||
MODEL_TENSOR.V_QF_PROJ_LINEAR,
|
||||
MODEL_TENSOR.V_QF_SELF_ATTN_Q,
|
||||
MODEL_TENSOR.V_QF_SELF_ATTN_K,
|
||||
MODEL_TENSOR.V_QF_SELF_ATTN_V,
|
||||
MODEL_TENSOR.V_QF_SELF_ATTN_O,
|
||||
MODEL_TENSOR.V_QF_SELF_ATTN_NORM,
|
||||
MODEL_TENSOR.V_QF_CROSS_ATTN_Q,
|
||||
MODEL_TENSOR.V_QF_CROSS_ATTN_K,
|
||||
MODEL_TENSOR.V_QF_CROSS_ATTN_V,
|
||||
MODEL_TENSOR.V_QF_CROSS_ATTN_O,
|
||||
MODEL_TENSOR.V_QF_CROSS_ATTN_NORM,
|
||||
MODEL_TENSOR.V_QF_FFN_UP,
|
||||
MODEL_TENSOR.V_QF_FFN_DOWN,
|
||||
MODEL_TENSOR.V_QF_FFN_NORM,
|
||||
MODEL_TENSOR.V_QF_PROJ_NORM,
|
||||
MODEL_TENSOR.V_MULTI_PROJ_IMG_POS,
|
||||
MODEL_TENSOR.V_MULTI_PROJ_QUERY,
|
||||
MODEL_TENSOR.V_MULTI_PROJ_LINEAR,
|
||||
MODEL_TENSOR.V_MULTI_PROJ_NORM,
|
||||
MODEL_TENSOR.V_MULTI_PROJ_POST_NORM,
|
||||
# audio
|
||||
MODEL_TENSOR.A_ENC_EMBD_POS,
|
||||
MODEL_TENSOR.A_ENC_EMBD_NORM,
|
||||
@@ -4388,6 +4465,7 @@ class VisionProjectorType:
|
||||
MINICPMV4_6 = "minicpmv4_6"
|
||||
GRANITE_SPEECH = "granite_speech" # audio
|
||||
MIMOVL = "mimovl"
|
||||
GRANITE4_VISION = "granite4_vision"
|
||||
|
||||
|
||||
# Items here are (block size, type size)
|
||||
|
||||
@@ -959,8 +959,13 @@ class GGUFWriter:
|
||||
self.add_uint32(Keys.LLM.POOLING_TYPE.format(arch=self.arch), value.value)
|
||||
|
||||
def add_num_deepstack_layers(self, count: int) -> None:
|
||||
"""Add scalar deepstack layer count (qwen3vl format)"""
|
||||
self.add_uint32(Keys.LLM.NUM_DEEPSTACK_LAYERS.format(arch=self.arch), count)
|
||||
|
||||
def add_deepstack_mapping(self, layers: Sequence[int]) -> None:
|
||||
"""Add per-layer deepstack projector indices (Granite4 Vision format)"""
|
||||
self.add_array(Keys.LLM.DEEPSTACK_MAPPING.format(arch=self.arch), list(layers))
|
||||
|
||||
def add_rope_dimension_count(self, count: int) -> None:
|
||||
self.add_uint32(Keys.Rope.DIMENSION_COUNT.format(arch=self.arch), count)
|
||||
|
||||
@@ -1184,6 +1189,15 @@ class GGUFWriter:
|
||||
def add_vision_preproc_image_size(self, value: int) -> None:
|
||||
self.add_uint32(Keys.ClipVision.PREPROC_IMAGE_SIZE, value)
|
||||
|
||||
def add_vision_projector_query_side(self, value: int) -> None:
|
||||
self.add_uint32(Keys.ClipVision.Projector.QUERY_SIDE, value)
|
||||
|
||||
def add_vision_projector_window_side(self, value: int) -> None:
|
||||
self.add_uint32(Keys.ClipVision.Projector.WINDOW_SIDE, value)
|
||||
|
||||
def add_vision_spatial_offsets(self, layers: Sequence[int]) -> None:
|
||||
self.add_array(Keys.ClipVision.Projector.SPATIAL_OFFSETS, layers)
|
||||
|
||||
def add_vision_image_mean(self, values: Sequence[float]) -> None:
|
||||
self.add_array(Keys.ClipVision.IMAGE_MEAN, values)
|
||||
|
||||
@@ -1240,6 +1254,12 @@ class GGUFWriter:
|
||||
def add_vision_window_size(self, value: int) -> None:
|
||||
self.add_uint32(Keys.ClipVision.WINDOW_SIZE, value)
|
||||
|
||||
def add_vision_feature_layers(self, layers: Sequence[int]) -> None:
|
||||
self.add_array(Keys.ClipVision.FEATURE_LAYERS, layers)
|
||||
|
||||
def add_vision_image_grid_pinpoints(self, layers: Sequence[int]) -> None:
    """Store the image grid pinpoints array under ``Keys.ClipVision.IMAGE_GRID_PINPOINTS``.

    Args:
        layers: the pinpoint values, passed to ``add_array`` as-is.
            NOTE(review): the Granite4 Vision converter passes a flat
            ``[w0, h0, w1, h1, ...]`` list, hence the flat annotation --
            confirm no caller relies on nested pairs.
    """
    self.add_array(Keys.ClipVision.IMAGE_GRID_PINPOINTS, layers)
|
||||
|
||||
def add_vision_sam_layers_count(self, value: int) -> None:
|
||||
self.add_uint32(Keys.ClipVision.SAM.BLOCK_COUNT, value)
|
||||
|
||||
|
||||
@@ -1408,6 +1408,7 @@ class TensorNameMap:
|
||||
),
|
||||
|
||||
MODEL_TENSOR.V_ENC_EMBD_PATCH: (
|
||||
"model.vision_tower.vision_model.embeddings.patch_embedding", # Granite4Vision
|
||||
"vision_tower.vision_model.embeddings.patch_embedding",
|
||||
"model.vision_tower.embeddings.patch_embedding", # minicpmv4_6
|
||||
"model.vision_tower.embeddings.patch_embeddings.projection", # Intern-S1
|
||||
@@ -1439,6 +1440,7 @@ class TensorNameMap:
|
||||
),
|
||||
|
||||
MODEL_TENSOR.V_ENC_EMBD_POS: (
|
||||
"model.vision_tower.vision_model.embeddings.position_embedding", # Granite4Vision
|
||||
"vision_tower.vision_model.embeddings.position_embedding",
|
||||
"model.vision_tower.embeddings.position_embedding", # minicpmv4_6
|
||||
"model.vision_tower.embeddings.position_embeddings", # Intern-S1
|
||||
@@ -1456,8 +1458,9 @@ class TensorNameMap:
|
||||
"model.vision_embedder.pos_embedding", # gemma4 unified
|
||||
),
|
||||
|
||||
# TODO: I think these should all be moved to mapping_cfg?
|
||||
MODEL_TENSOR.V_ENC_EMBD_IMGNL: (
|
||||
"model.image_newline", # Deepseek-OCR
|
||||
"model.image_newline", # Deepseek-OCR, Granite4Vision
|
||||
"vit.perceive.image_newline", # HunyuanVL
|
||||
),
|
||||
|
||||
@@ -1477,6 +1480,7 @@ class TensorNameMap:
|
||||
),
|
||||
|
||||
MODEL_TENSOR.V_ENC_ATTN_Q: (
|
||||
"model.vision_tower.vision_model.encoder.layers.{bid}.self_attn.q_proj", # Granite4Vision
|
||||
"vision_tower.vision_model.encoder.layers.{bid}.self_attn.q_proj",
|
||||
"model.vision_tower.encoder.layers.{bid}.self_attn.q_proj", # minicpmv4_6
|
||||
"model.vision_tower.encoder.layer.{bid}.attention.q_proj", # Intern-S1
|
||||
@@ -1502,6 +1506,7 @@ class TensorNameMap:
|
||||
),
|
||||
|
||||
MODEL_TENSOR.V_ENC_ATTN_K: (
|
||||
"model.vision_tower.vision_model.encoder.layers.{bid}.self_attn.k_proj", # Granite4Vision
|
||||
"vision_tower.vision_model.encoder.layers.{bid}.self_attn.k_proj",
|
||||
"model.vision_tower.encoder.layers.{bid}.self_attn.k_proj", # minicpmv4_6
|
||||
"model.vision_tower.encoder.layer.{bid}.attention.k_proj", # Intern-S1
|
||||
@@ -1527,6 +1532,7 @@ class TensorNameMap:
|
||||
),
|
||||
|
||||
MODEL_TENSOR.V_ENC_ATTN_V: (
|
||||
"model.vision_tower.vision_model.encoder.layers.{bid}.self_attn.v_proj", # Granite4Vision
|
||||
"vision_tower.vision_model.encoder.layers.{bid}.self_attn.v_proj",
|
||||
"model.vision_tower.encoder.layers.{bid}.self_attn.v_proj", # minicpmv4_6
|
||||
"model.vision_tower.encoder.layer.{bid}.attention.v_proj", # Intern-S1
|
||||
@@ -1545,6 +1551,7 @@ class TensorNameMap:
|
||||
),
|
||||
|
||||
MODEL_TENSOR.V_ENC_INPUT_NORM: (
|
||||
"model.vision_tower.vision_model.encoder.layers.{bid}.layer_norm1", # Granite4Vision
|
||||
"vision_tower.vision_model.encoder.layers.{bid}.layer_norm1",
|
||||
"model.vision_tower.encoder.layers.{bid}.layer_norm1", # minicpmv4_6
|
||||
"vision_tower.vision_model.encoder.layers.{bid}.norm1", # InternVL
|
||||
@@ -1567,6 +1574,7 @@ class TensorNameMap:
|
||||
),
|
||||
|
||||
MODEL_TENSOR.V_ENC_ATTN_O: (
|
||||
"model.vision_tower.vision_model.encoder.layers.{bid}.self_attn.out_proj", # Granite4Vision
|
||||
"vision_tower.vision_model.encoder.layers.{bid}.self_attn.out_proj",
|
||||
"model.vision_tower.encoder.layers.{bid}.self_attn.out_proj", # minicpmv4_6
|
||||
"vision_tower.vision_model.encoder.layers.{bid}.attn.proj", # InternVL
|
||||
@@ -1595,6 +1603,7 @@ class TensorNameMap:
|
||||
),
|
||||
|
||||
MODEL_TENSOR.V_ENC_POST_ATTN_NORM: (
|
||||
"model.vision_tower.vision_model.encoder.layers.{bid}.layer_norm2", # Granite4Vision
|
||||
"vision_tower.vision_model.encoder.layers.{bid}.layer_norm2",
|
||||
"model.vision_tower.encoder.layers.{bid}.layer_norm2", # minicpmv4_6
|
||||
"vision_tower.vision_model.encoder.layers.{bid}.norm2", # InternVL
|
||||
@@ -1618,6 +1627,7 @@ class TensorNameMap:
|
||||
),
|
||||
|
||||
MODEL_TENSOR.V_ENC_FFN_UP: (
|
||||
"model.vision_tower.vision_model.encoder.layers.{bid}.mlp.fc1", # Granite4Vision
|
||||
"vision_tower.vision_model.encoder.layers.{bid}.mlp.fc1",
|
||||
"model.vision_tower.encoder.layers.{bid}.mlp.fc1", # minicpmv4_6
|
||||
"model.vision_tower.encoder.layer.{bid}.mlp.fc1", # Intern-S1
|
||||
@@ -1649,6 +1659,7 @@ class TensorNameMap:
|
||||
),
|
||||
|
||||
MODEL_TENSOR.V_ENC_FFN_DOWN: (
|
||||
"model.vision_tower.vision_model.encoder.layers.{bid}.mlp.fc2", # Granite4Vision
|
||||
"vision_tower.vision_model.encoder.layers.{bid}.mlp.fc2",
|
||||
"model.vision_tower.encoder.layers.{bid}.mlp.fc2", # minicpmv4_6
|
||||
"model.vision_tower.encoder.layer.{bid}.mlp.fc2", # Intern-S1
|
||||
@@ -1706,6 +1717,7 @@ class TensorNameMap:
|
||||
),
|
||||
|
||||
MODEL_TENSOR.V_POST_NORM: (
|
||||
"model.vision_tower.vision_model.post_layernorm", # Granite4Vision
|
||||
"vision_tower.vision_model.post_layernorm",
|
||||
"model.vision_tower.post_layernorm", # minicpmv4_6
|
||||
"model.vision_model.post_layernorm", # SmolVLM
|
||||
@@ -1952,6 +1964,82 @@ class TensorNameMap:
|
||||
"model.vision_tower.std_scale", # gemma4
|
||||
),
|
||||
|
||||
# For these tensors, bid => projector ID
|
||||
MODEL_TENSOR.V_MULTI_PROJ_IMG_POS: (
|
||||
"model.layerwise_projectors.{bid}.image_positions", # Granite4 Vision
|
||||
"model.spatial_projectors.{bid}.image_positions", # Granite4 Vision
|
||||
),
|
||||
MODEL_TENSOR.V_MULTI_PROJ_QUERY: (
|
||||
"model.layerwise_projectors.{bid}.query", # Granite4 Vision
|
||||
"model.spatial_projectors.{bid}.query", # Granite4 Vision
|
||||
),
|
||||
MODEL_TENSOR.V_MULTI_PROJ_LINEAR: (
|
||||
"model.layerwise_projectors.{bid}.out_linear", # Granite4 Vision
|
||||
"model.spatial_projectors.{bid}.out_linear", # Granite4 Vision
|
||||
),
|
||||
MODEL_TENSOR.V_MULTI_PROJ_NORM: (
|
||||
"model.layerwise_projectors.{bid}.norm", # Granite4 Vision
|
||||
"model.spatial_projectors.{bid}.norm", # Granite4 Vision
|
||||
),
|
||||
MODEL_TENSOR.V_MULTI_PROJ_POST_NORM: (
|
||||
"model.layerwise_projectors.{bid}.qformer.layernorm", # Granite4 Vision
|
||||
"model.spatial_projectors.{bid}.qformer.layernorm", # Granite4 Vision
|
||||
),
|
||||
|
||||
# For these tensors, bid => proj-id
|
||||
MODEL_TENSOR.V_QF_SELF_ATTN_Q: (
|
||||
"model.layerwise_projectors.qformer.encoder.layer.{bid}.attention.attention.query", # Granite4 Vision
|
||||
"model.spatial_projectors.qformer.encoder.layer.{bid}.attention.attention.query", # Granite4 Vision
|
||||
),
|
||||
MODEL_TENSOR.V_QF_SELF_ATTN_K: (
|
||||
"model.layerwise_projectors.qformer.encoder.layer.{bid}.attention.attention.key", # Granite4 Vision
|
||||
"model.spatial_projectors.qformer.encoder.layer.{bid}.attention.attention.key", # Granite4 Vision
|
||||
),
|
||||
MODEL_TENSOR.V_QF_SELF_ATTN_V: (
|
||||
"model.layerwise_projectors.qformer.encoder.layer.{bid}.attention.attention.value", # Granite4 Vision
|
||||
"model.spatial_projectors.qformer.encoder.layer.{bid}.attention.attention.value", # Granite4 Vision
|
||||
),
|
||||
MODEL_TENSOR.V_QF_SELF_ATTN_O: (
|
||||
"model.layerwise_projectors.qformer.encoder.layer.{bid}.attention.output.dense", # Granite4 Vision
|
||||
"model.spatial_projectors.qformer.encoder.layer.{bid}.attention.output.dense", # Granite4 Vision
|
||||
),
|
||||
MODEL_TENSOR.V_QF_SELF_ATTN_NORM: (
|
||||
"model.layerwise_projectors.qformer.encoder.layer.{bid}.attention.output.LayerNorm", # Granite4 Vision
|
||||
"model.spatial_projectors.qformer.encoder.layer.{bid}.attention.output.LayerNorm", # Granite4 Vision
|
||||
),
|
||||
MODEL_TENSOR.V_QF_CROSS_ATTN_Q: (
|
||||
"model.layerwise_projectors.qformer.encoder.layer.{bid}.crossattention.attention.query", # Granite4 Vision
|
||||
"model.spatial_projectors.qformer.encoder.layer.{bid}.crossattention.attention.query", # Granite4 Vision
|
||||
),
|
||||
MODEL_TENSOR.V_QF_CROSS_ATTN_K: (
|
||||
"model.layerwise_projectors.qformer.encoder.layer.{bid}.crossattention.attention.key", # Granite4 Vision
|
||||
"model.spatial_projectors.qformer.encoder.layer.{bid}.crossattention.attention.key", # Granite4 Vision
|
||||
),
|
||||
MODEL_TENSOR.V_QF_CROSS_ATTN_V: (
|
||||
"model.layerwise_projectors.qformer.encoder.layer.{bid}.crossattention.attention.value", # Granite4 Vision
|
||||
"model.spatial_projectors.qformer.encoder.layer.{bid}.crossattention.attention.value", # Granite4 Vision
|
||||
),
|
||||
MODEL_TENSOR.V_QF_CROSS_ATTN_O: (
|
||||
"model.layerwise_projectors.qformer.encoder.layer.{bid}.crossattention.output.dense", # Granite4 Vision
|
||||
"model.spatial_projectors.qformer.encoder.layer.{bid}.crossattention.output.dense", # Granite4 Vision
|
||||
),
|
||||
MODEL_TENSOR.V_QF_CROSS_ATTN_NORM: (
|
||||
"model.layerwise_projectors.qformer.encoder.layer.{bid}.crossattention.output.LayerNorm", # Granite4 Vision
|
||||
"model.spatial_projectors.qformer.encoder.layer.{bid}.crossattention.output.LayerNorm", # Granite4 Vision
|
||||
),
|
||||
MODEL_TENSOR.V_QF_FFN_UP: (
|
||||
"model.layerwise_projectors.qformer.encoder.layer.{bid}.intermediate_query.dense", # Granite4 Vision
|
||||
"model.spatial_projectors.qformer.encoder.layer.{bid}.intermediate_query.dense", # Granite4 Vision
|
||||
),
|
||||
MODEL_TENSOR.V_QF_FFN_DOWN: (
|
||||
"model.layerwise_projectors.qformer.encoder.layer.{bid}.output_query.dense", # Granite4 Vision
|
||||
"model.spatial_projectors.qformer.encoder.layer.{bid}.output_query.dense", # Granite4 Vision
|
||||
),
|
||||
MODEL_TENSOR.V_QF_FFN_NORM: (
|
||||
"model.layerwise_projectors.qformer.encoder.layer.{bid}.output_query.LayerNorm", # Granite4 Vision
|
||||
"model.spatial_projectors.qformer.encoder.layer.{bid}.output_query.LayerNorm", # Granite4 Vision
|
||||
),
|
||||
|
||||
# audio (mtmd)
|
||||
|
||||
MODEL_TENSOR.A_ENC_EMBD_POS: (
|
||||
|
||||
@@ -196,6 +196,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
|
||||
{ LLM_KV_MOE_LATENT_SIZE, "%s.moe_latent_size" },
|
||||
{ LLM_KV_NEXTN_PREDICT_LAYERS, "%s.nextn_predict_layers" },
|
||||
{ LLM_KV_NUM_DEEPSTACK_LAYERS, "%s.n_deepstack_layers" },
|
||||
{ LLM_KV_DEEPSTACK_MAPPING, "%s.deepstack_mapping" },
|
||||
{ LLM_KV_HIDDEN_ACT, "%s.hidden_activation" },
|
||||
{ LLM_KV_POOLING_TYPE, "%s.pooling_type" },
|
||||
{ LLM_KV_LOGIT_SCALE, "%s.logit_scale" },
|
||||
|
||||
@@ -200,6 +200,7 @@ enum llm_kv {
|
||||
LLM_KV_MOE_LATENT_SIZE,
|
||||
LLM_KV_NEXTN_PREDICT_LAYERS,
|
||||
LLM_KV_NUM_DEEPSTACK_LAYERS,
|
||||
LLM_KV_DEEPSTACK_MAPPING,
|
||||
LLM_KV_HIDDEN_ACT,
|
||||
LLM_KV_POOLING_TYPE,
|
||||
LLM_KV_LOGIT_SCALE,
|
||||
|
||||
@@ -1859,7 +1859,12 @@ ggml_tensor * llm_graph_context::build_inp_embd(ggml_tensor * tok_embd) const {
|
||||
res->t_inp_embd = cur;
|
||||
|
||||
// For Granite architecture
|
||||
if (hparams.f_embedding_scale != 0.0f) {
|
||||
// NOTE: Only apply scale to token inputs. Raw embeddings are assumed to be
|
||||
// multimodal inputs that should not be scaled.
|
||||
if (ubatch.token && hparams.f_embedding_scale != 0.0f) {
|
||||
if (!ggml_is_contiguous(cur)) {
|
||||
cur = ggml_cont(ctx0, cur);
|
||||
}
|
||||
cur = ggml_scale(ctx0, cur, hparams.f_embedding_scale);
|
||||
}
|
||||
|
||||
|
||||
@@ -219,8 +219,18 @@ struct llama_hparams {
|
||||
uint32_t indexer_top_k = 0;
|
||||
|
||||
// qwen3vl deepstack
|
||||
// When parsed from GGUF, this implies the first N layers consume the first
|
||||
// N deepstack embeddings. Use deepstack_mapping_arr if you need a more
|
||||
// complex mapping. If using deepstack_mapping_arr, also make sure to set
|
||||
// n_deepstack_layers to the number of unique deepstack layers so that
|
||||
// n_embd_imp is accurate (see granite.cpp).
|
||||
uint32_t n_deepstack_layers = 0;
|
||||
|
||||
// deepstack layer array (Granite4 Vision)
|
||||
// -1 => no deepstack
|
||||
// >=0 => input embedding index for deepstack injection
|
||||
std::array<int32_t, LLAMA_MAX_LAYERS> deepstack_mapping_arr;
|
||||
|
||||
// gemma4 per-layer embedding
|
||||
uint32_t n_embd_per_layer = 0;
|
||||
|
||||
|
||||
@@ -393,6 +393,7 @@ namespace GGUFMeta {
|
||||
}
|
||||
|
||||
template bool llama_model_loader::get_arr<std::vector<std::string>>(enum llm_kv kid, std::vector<std::string> & result, bool required);
|
||||
template bool llama_model_loader::get_arr<std::array<int32_t, 512>>(enum llm_kv kid, std::array<int32_t, 512> & result, bool required);
|
||||
|
||||
template<typename T>
|
||||
bool llama_model_loader::get_key(const std::string & key, T & result, bool required) {
|
||||
|
||||
@@ -229,6 +229,7 @@ void llama_model_saver::add_kv_from_model() {
|
||||
add_kv(LLM_KV_MOE_EVERY_N_LAYERS, hparams.moe_every_n_layers);
|
||||
add_kv(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.n_layer_nextn);
|
||||
add_kv(LLM_KV_NUM_DEEPSTACK_LAYERS, hparams.n_deepstack_layers);
|
||||
add_kv(LLM_KV_DEEPSTACK_MAPPING, hparams.deepstack_mapping_arr);
|
||||
add_kv(LLM_KV_POOLING_TYPE, uint32_t(hparams.pooling_type));
|
||||
add_kv(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale);
|
||||
add_kv(LLM_KV_DECODER_START_TOKEN_ID, hparams.dec_start_token_id);
|
||||
|
||||
@@ -553,10 +553,12 @@ struct ggml_backend_meta_split_state llama_meta_device_get_split_state(const str
|
||||
};
|
||||
|
||||
auto get_split_granularity = [&](int64_t blck_size, uint32_t il, const std::vector<std::pair<int64_t, uint32_t>> & segments) -> std::vector<int64_t> {
|
||||
// for better performance it may make sense to round up blck_size to a higher power of 2 so that more efficient kernels can be used
|
||||
if (hparams.is_recr(il)) {
|
||||
// linear attention
|
||||
const int64_t head_dim = hparams.ssm_d_state;
|
||||
const int64_t granularity_qkv = std::lcm(blck_size, head_dim);
|
||||
const int64_t head_dim = hparams.ssm_d_state;
|
||||
const int64_t blck_size_perf = std::lcm(blck_size, 128);
|
||||
const int64_t granularity_qkv = std::lcm(blck_size_perf, head_dim);
|
||||
if (std::regex_match(tensor_name, pattern_qkv_weight) || std::regex_match(tensor_name, pattern_attn_gate_weight) ||
|
||||
std::regex_match(tensor_name, pattern_ssm_conv1d) || std::regex_match(tensor_name, pattern_ssm_out_weight)) {
|
||||
return std::vector<int64_t>(segments.size(), granularity_qkv);
|
||||
@@ -578,17 +580,24 @@ struct ggml_backend_meta_split_state llama_meta_device_get_split_state(const str
|
||||
// regular attention
|
||||
const uint32_t n_gqa = hparams.n_gqa(il);
|
||||
const uint32_t n_embd_q = n_gqa * hparams.n_embd_head_k(il);
|
||||
if (std::regex_match(tensor_name, pattern_attn_sinks)) {
|
||||
GGML_ASSERT(segments.size() == 1);
|
||||
return {std::lcm(n_embd_q, blck_size)/n_embd_q * n_gqa};
|
||||
|
||||
// to handle head sizes like 80, only increase granularity while it doesn't cause underutilization
|
||||
int64_t blck_size_perf = blck_size;
|
||||
while (blck_size_perf < 128 && blck_size_perf*ud->n_devices < n_embd_q) {
|
||||
blck_size_perf *= 2;
|
||||
}
|
||||
|
||||
const int64_t granularity_q = std::lcm(n_embd_q, blck_size);
|
||||
if (std::regex_match(tensor_name, pattern_attn_sinks)) {
|
||||
GGML_ASSERT(segments.size() == 1);
|
||||
return {std::lcm(n_embd_q, blck_size_perf)/n_embd_q * n_gqa};
|
||||
}
|
||||
|
||||
const int64_t granularity_q = std::lcm(n_embd_q, blck_size_perf);
|
||||
if (std::regex_match(tensor_name, pattern_q_weight) || std::regex_match(tensor_name, pattern_q_bias)) {
|
||||
GGML_ASSERT(segments.size() == 1);
|
||||
// some models have Q gate tensors, for those cases the granularity needs to be doubled:
|
||||
if (ud->model->arch == LLM_ARCH_QWEN3NEXT || ud->model->arch == LLM_ARCH_QWEN35 || ud->model->arch == LLM_ARCH_QWEN35MOE) {
|
||||
return {std::lcm(2*n_embd_q, blck_size)};
|
||||
return {std::lcm(2*n_embd_q, blck_size_perf)};
|
||||
}
|
||||
return {granularity_q};
|
||||
}
|
||||
@@ -613,8 +622,9 @@ struct ggml_backend_meta_split_state llama_meta_device_get_split_state(const str
|
||||
// FFN
|
||||
if (std::regex_match(tensor_name, pattern_ffn_up_gate_weight) || std::regex_match(tensor_name, pattern_ffn_up_gate_bias) ||
|
||||
std::regex_match(tensor_name, pattern_ffn_gate_up_weight) || std::regex_match(tensor_name, pattern_ffn_down_weight)) {
|
||||
const int64_t blck_size_perf = std::lcm(blck_size, 128);
|
||||
GGML_ASSERT(segments.size() == 1);
|
||||
return {blck_size};
|
||||
return {blck_size_perf};
|
||||
}
|
||||
|
||||
// everything else
|
||||
@@ -627,7 +637,6 @@ struct ggml_backend_meta_split_state llama_meta_device_get_split_state(const str
|
||||
tensor_config tc = get_tensor_config();
|
||||
split_state.axis = tc.axis;
|
||||
if (split_state.axis >= 0 && split_state.axis < GGML_MAX_DIMS) {
|
||||
const int64_t ne_full = tensor->ne[split_state.axis];
|
||||
const int64_t blck_size = ggml_blck_size(tc.tensor_axis_0->type);
|
||||
const float * tensor_split = ud->model->tensor_split();
|
||||
std::vector<float> tensor_split_scan;
|
||||
@@ -644,7 +653,6 @@ struct ggml_backend_meta_split_state llama_meta_device_get_split_state(const str
|
||||
const int64_t ne_s = segments[is].first;
|
||||
const uint32_t nr_s = segments[is].second;
|
||||
const int64_t g_s = granularity[is];
|
||||
GGML_ASSERT(ne_full % g_s == 0);
|
||||
int64_t low = 0;
|
||||
size_t j = 0;
|
||||
for (; j < ud->n_devices - 1; j++) {
|
||||
@@ -1092,6 +1100,9 @@ void llama_model_base::load_hparams(llama_model_loader & ml) {
|
||||
ml.get_key_or_arr(LLM_KV_FEED_FORWARD_LENGTH, hparams.n_ff_arr, hparams.n_layer(), false);
|
||||
ml.get_key_or_arr(LLM_KV_ATTENTION_HEAD_COUNT, hparams.n_head_arr, hparams.n_layer(), false);
|
||||
|
||||
// Populate deepstack_mapping_arr - initialized to -1 (no deepstack)
|
||||
std::fill(hparams.deepstack_mapping_arr.begin(), hparams.deepstack_mapping_arr.end(), -1);
|
||||
|
||||
// n_head_kv is optional, default to n_head
|
||||
hparams.n_head_kv_arr = hparams.n_head_arr;
|
||||
|
||||
@@ -1670,10 +1681,10 @@ uint64_t llama_model::n_elements() const {
|
||||
void llama_model::print_info() const {
|
||||
const std::string rope_scaling_type = llama_rope_scaling_type_name(hparams.rope_scaling_type_train);
|
||||
|
||||
auto print_f = [](const std::function<uint32_t(uint32_t)> & f, uint32_t n) {
|
||||
auto print_f = [](const std::function<int32_t(uint32_t)> & f, uint32_t n) {
|
||||
bool is_var = false;
|
||||
|
||||
std::vector<uint32_t> v;
|
||||
std::vector<int32_t> v;
|
||||
for (uint32_t i = 0; i < n; ++i) {
|
||||
v.push_back(f(i));
|
||||
if (v[i] != v[0]) {
|
||||
@@ -1747,6 +1758,14 @@ void llama_model::print_info() const {
|
||||
LLAMA_LOG_INFO("%s: n_ctx_orig_yarn = %u\n", __func__, hparams.n_ctx_orig_yarn);
|
||||
LLAMA_LOG_INFO("%s: rope_yarn_log_mul = %.4f\n", __func__, hparams.rope_yarn_log_mul);
|
||||
LLAMA_LOG_INFO("%s: rope_finetuned = %s\n", __func__, hparams.rope_finetuned ? "yes" : "unknown");
|
||||
if (arch == LLM_ARCH_GRANITE &&
|
||||
std::any_of(hparams.deepstack_mapping_arr.begin(),
|
||||
hparams.deepstack_mapping_arr.end(),
|
||||
[](const auto & entry) { return entry >= 0; })) {
|
||||
LLAMA_LOG_INFO("%s: deepstack_mapping_arr = %s\n", __func__,
|
||||
print_f([&](uint32_t il) { return hparams.deepstack_mapping_arr[il]; },
|
||||
hparams.n_layer()).c_str());
|
||||
}
|
||||
// MRoPE (Multi-axis Rotary Position Embedding) sections
|
||||
if (const auto & s = hparams.rope_sections; s[0] || s[1] || s[2] || s[3]) {
|
||||
LLAMA_LOG_INFO("%s: mrope sections = [%d, %d, %d, %d]\n", __func__, s[0], s[1], s[2], s[3]);
|
||||
|
||||
@@ -1,5 +1,7 @@
|
||||
#include "models.h"
|
||||
|
||||
#include <sstream>
|
||||
|
||||
void llama_model_granite::load_arch_hparams(llama_model_loader & ml) {
|
||||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
||||
ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale);
|
||||
@@ -7,6 +9,27 @@ void llama_model_granite::load_arch_hparams(llama_model_loader & ml) {
|
||||
ml.get_key(LLM_KV_EMBEDDING_SCALE, hparams.f_embedding_scale, false);
|
||||
ml.get_key(LLM_KV_ATTENTION_SCALE, hparams.f_attention_scale, false);
|
||||
|
||||
// Granite4 Vision uses array deepstack_mapping
|
||||
ml.get_arr(LLM_KV_DEEPSTACK_MAPPING, hparams.deepstack_mapping_arr, false);
|
||||
|
||||
// Count the unique deepstack input indices
|
||||
std::unordered_set<uint32_t> unique_deepstack_idxs;
|
||||
for (const auto val : hparams.deepstack_mapping_arr) {
|
||||
if (val >= 0) {
|
||||
unique_deepstack_idxs.insert(val);
|
||||
}
|
||||
}
|
||||
hparams.n_deepstack_layers = unique_deepstack_idxs.size();
|
||||
|
||||
// Ensure all values are valid (avoid overflow attacks)
|
||||
for (const auto val : unique_deepstack_idxs) {
|
||||
if (val > hparams.n_deepstack_layers) {
|
||||
std::stringstream ss;
|
||||
ss << "Invalid deepstack index: " << val << " > " << hparams.n_deepstack_layers;
|
||||
throw std::runtime_error(ss.str());
|
||||
}
|
||||
}
|
||||
|
||||
// Granite uses rope_finetuned as a switch for rope, so default to true
|
||||
bool rope_finetuned = true;
|
||||
ml.get_key(LLM_KV_ROPE_SCALING_FINETUNED, rope_finetuned, false);
|
||||
@@ -112,6 +135,20 @@ llama_model_granite::graph::graph(
|
||||
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
||||
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
|
||||
// Granite Vision 4.1 deepstack: inject the projector stream that
|
||||
// targets decoder layer `il` before the decoder runs.
|
||||
// NOTE: skip the first deepstack layer since that's inpL
|
||||
const auto & deepstack_emb_idx = hparams.deepstack_mapping_arr[il];
|
||||
if (il > 0 && deepstack_emb_idx >= 0) {
|
||||
ggml_tensor * ds = ggml_view_2d(ctx0,
|
||||
res->t_inp_embd, n_embd, n_tokens,
|
||||
res->t_inp_embd->nb[1],
|
||||
deepstack_emb_idx * n_embd * sizeof(float));
|
||||
inpL = ggml_add(ctx0, inpL, ds);
|
||||
cb(inpL, "deepstack_in", il);
|
||||
}
|
||||
|
||||
ggml_tensor * inpSA = inpL;
|
||||
|
||||
// norm
|
||||
|
||||
@@ -25,6 +25,7 @@ add_library(mtmd
|
||||
models/gemma4uv.cpp
|
||||
models/glm4v.cpp
|
||||
models/granite-speech.cpp
|
||||
models/granite4-vision.cpp
|
||||
models/hunyuanvl.cpp
|
||||
models/internvl.cpp
|
||||
models/kimivl.cpp
|
||||
|
||||
@@ -35,20 +35,22 @@
|
||||
#define KEY_LAYER_NORM_EPS "clip.%s.attention.layer_norm_epsilon"
|
||||
|
||||
// vision-specific
|
||||
#define KEY_VISION_PROJ_TYPE "clip.vision.projector_type" // for models with mixed modalities
|
||||
#define KEY_IMAGE_SIZE "clip.vision.image_size"
|
||||
#define KEY_IMAGE_MIN_PIXELS "clip.vision.image_min_pixels"
|
||||
#define KEY_IMAGE_MAX_PIXELS "clip.vision.image_max_pixels"
|
||||
#define KEY_PREPROC_MIN_TILES "clip.vision.preproc_min_tiles"
|
||||
#define KEY_PREPROC_MAX_TILES "clip.vision.preproc_max_tiles"
|
||||
#define KEY_PREPROC_IMAGE_SIZE "clip.vision.preproc_image_size"
|
||||
#define KEY_PATCH_SIZE "clip.vision.patch_size"
|
||||
#define KEY_IMAGE_MEAN "clip.vision.image_mean"
|
||||
#define KEY_IMAGE_STD "clip.vision.image_std"
|
||||
#define KEY_FEATURE_LAYER "clip.vision.feature_layer"
|
||||
#define KEY_PROJ_SCALE_FACTOR "clip.vision.projector.scale_factor"
|
||||
#define KEY_SPATIAL_MERGE_SIZE "clip.vision.spatial_merge_size"
|
||||
#define KEY_IS_DEEPSTACK_LAYERS "clip.vision.is_deepstack_layers"
|
||||
#define KEY_VISION_PROJ_TYPE "clip.vision.projector_type" // for models with mixed modalities
|
||||
#define KEY_IMAGE_SIZE "clip.vision.image_size"
|
||||
#define KEY_IMAGE_MIN_PIXELS "clip.vision.image_min_pixels"
|
||||
#define KEY_IMAGE_MAX_PIXELS "clip.vision.image_max_pixels"
|
||||
#define KEY_PREPROC_MIN_TILES "clip.vision.preproc_min_tiles"
|
||||
#define KEY_PREPROC_MAX_TILES "clip.vision.preproc_max_tiles"
|
||||
#define KEY_PREPROC_IMAGE_SIZE "clip.vision.preproc_image_size"
|
||||
#define KEY_PATCH_SIZE "clip.vision.patch_size"
|
||||
#define KEY_IMAGE_MEAN "clip.vision.image_mean"
|
||||
#define KEY_IMAGE_STD "clip.vision.image_std"
|
||||
#define KEY_FEATURE_LAYER "clip.vision.feature_layer"
|
||||
#define KEY_PROJ_SCALE_FACTOR "clip.vision.projector.scale_factor"
|
||||
#define KEY_PROJ_SAMPLE_QUERY_SIDE "clip.vision.projector.query_side"
|
||||
#define KEY_PROJ_SAMPLE_WINDOW_SIDE "clip.vision.projector.window_side"
|
||||
#define KEY_PROJ_SPATIAL_OFFSETS "clip.vision.projector.spatial_offsets"
|
||||
#define KEY_SPATIAL_MERGE_SIZE "clip.vision.spatial_merge_size"
|
||||
|
||||
#define KEY_MM_PATCH_MERGE_TYPE "clip.vision.mm_patch_merge_type"
|
||||
#define KEY_IMAGE_GRID_PINPOINTS "clip.vision.image_grid_pinpoints"
|
||||
@@ -72,7 +74,6 @@
|
||||
#define KEY_A_PROJ_DOWNSAMPLE_RATE "clip.audio.projector.downsample_rate"
|
||||
#define KEY_A_PROJ_HEAD_COUNT "clip.audio.projector.head_count"
|
||||
|
||||
|
||||
//
|
||||
// tensor name constants
|
||||
//
|
||||
@@ -210,22 +211,28 @@
|
||||
#define TN_CTC_OUT_MID "a.enc_ctc_out_mid.%s"
|
||||
#define TN_ATTN_REL_POS_EMB "%s.blk.%d.attn_rel_pos_emb"
|
||||
// qformer projector
|
||||
#define TN_QF_PROJ_QUERY "a.proj_query"
|
||||
#define TN_QF_PROJ_NORM "a.proj_norm.%s"
|
||||
#define TN_QF_PROJ_LINEAR "a.proj_linear.%s"
|
||||
#define TN_QF_SELF_ATTN_Q "a.proj_blk.%d.self_attn_q.%s"
|
||||
#define TN_QF_SELF_ATTN_K "a.proj_blk.%d.self_attn_k.%s"
|
||||
#define TN_QF_SELF_ATTN_V "a.proj_blk.%d.self_attn_v.%s"
|
||||
#define TN_QF_SELF_ATTN_O "a.proj_blk.%d.self_attn_out.%s"
|
||||
#define TN_QF_SELF_ATTN_N "a.proj_blk.%d.self_attn_norm.%s"
|
||||
#define TN_QF_CROSS_ATTN_Q "a.proj_blk.%d.cross_attn_q.%s"
|
||||
#define TN_QF_CROSS_ATTN_K "a.proj_blk.%d.cross_attn_k.%s"
|
||||
#define TN_QF_CROSS_ATTN_V "a.proj_blk.%d.cross_attn_v.%s"
|
||||
#define TN_QF_CROSS_ATTN_O "a.proj_blk.%d.cross_attn_out.%s"
|
||||
#define TN_QF_CROSS_ATTN_N "a.proj_blk.%d.cross_attn_norm.%s"
|
||||
#define TN_QF_FFN_UP "a.proj_blk.%d.ffn_up.%s"
|
||||
#define TN_QF_FFN_DOWN "a.proj_blk.%d.ffn_down.%s"
|
||||
#define TN_QF_FFN_NORM "a.proj_blk.%d.ffn_norm.%s"
|
||||
#define TN_QF_PROJ_QUERY "%s.proj_query"
|
||||
#define TN_QF_PROJ_NORM "%s.proj_norm.%s"
|
||||
#define TN_QF_PROJ_LINEAR "%s.proj_linear.%s"
|
||||
#define TN_QF_SELF_ATTN_Q "%s.proj_blk.%d.self_attn_q.%s"
|
||||
#define TN_QF_SELF_ATTN_K "%s.proj_blk.%d.self_attn_k.%s"
|
||||
#define TN_QF_SELF_ATTN_V "%s.proj_blk.%d.self_attn_v.%s"
|
||||
#define TN_QF_SELF_ATTN_O "%s.proj_blk.%d.self_attn_out.%s"
|
||||
#define TN_QF_SELF_ATTN_N "%s.proj_blk.%d.self_attn_norm.%s"
|
||||
#define TN_QF_CROSS_ATTN_Q "%s.proj_blk.%d.cross_attn_q.%s"
|
||||
#define TN_QF_CROSS_ATTN_K "%s.proj_blk.%d.cross_attn_k.%s"
|
||||
#define TN_QF_CROSS_ATTN_V "%s.proj_blk.%d.cross_attn_v.%s"
|
||||
#define TN_QF_CROSS_ATTN_O "%s.proj_blk.%d.cross_attn_out.%s"
|
||||
#define TN_QF_CROSS_ATTN_N "%s.proj_blk.%d.cross_attn_norm.%s"
|
||||
#define TN_QF_FFN_UP "%s.proj_blk.%d.ffn_up.%s"
|
||||
#define TN_QF_FFN_DOWN "%s.proj_blk.%d.ffn_down.%s"
|
||||
#define TN_QF_FFN_NORM "%s.proj_blk.%d.ffn_norm.%s"
|
||||
// multi-projector qformer (bid => projector ID)
|
||||
#define TN_MULTI_PROJ_IMG_POS "v.proj_blk.%d.img_pos"
|
||||
#define TN_MULTI_PROJ_QUERY "%s.proj_blk.%d.query"
|
||||
#define TN_MULTI_PROJ_LINEAR "%s.proj_blk.%d.linear.%s"
|
||||
#define TN_MULTI_PROJ_NORM "%s.proj_blk.%d.norm.%s"
|
||||
#define TN_MULTI_PROJ_POST_NORM "%s.proj_blk.%d.post_norm.%s"
|
||||
|
||||
// gemma4 audio conformer
|
||||
#define TN_A_MM_INP_PROJ "mm.a.input_projection.%s"
|
||||
@@ -354,6 +361,7 @@ enum projector_type {
|
||||
PROJECTOR_TYPE_MINICPMV4_6,
|
||||
PROJECTOR_TYPE_GRANITE_SPEECH,
|
||||
PROJECTOR_TYPE_MIMOVL,
|
||||
PROJECTOR_TYPE_GRANITE4_VISION,
|
||||
PROJECTOR_TYPE_UNKNOWN,
|
||||
};
|
||||
|
||||
@@ -407,6 +415,7 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
|
||||
{ PROJECTOR_TYPE_MINICPMV4_6, "minicpmv4_6"},
|
||||
{ PROJECTOR_TYPE_GRANITE_SPEECH, "granite_speech"},
|
||||
{ PROJECTOR_TYPE_MIMOVL, "mimovl"},
|
||||
{ PROJECTOR_TYPE_GRANITE4_VISION, "granite4_vision"},
|
||||
};
|
||||
|
||||
static projector_type clip_projector_type_from_string(const std::string & str) {
|
||||
@@ -438,6 +447,8 @@ struct clip_image_f32 {
|
||||
|
||||
// marks the global view in e.g., DeepSeek-OCR Models
|
||||
bool add_viewsep = false;
|
||||
// whether a learned newline token should be appended after the image (eg Granite4 Vision)
|
||||
bool add_newline = false;
|
||||
};
|
||||
|
||||
//
|
||||
|
||||
@@ -4,6 +4,7 @@
|
||||
#include "clip.h"
|
||||
#include "clip-impl.h"
|
||||
|
||||
#include <algorithm>
|
||||
#include <array>
|
||||
#include <vector>
|
||||
#include <unordered_set>
|
||||
@@ -90,7 +91,7 @@ struct clip_hparams {
|
||||
|
||||
float eps = 1e-6;
|
||||
float rope_theta = 0.0;
|
||||
std::unordered_set<int32_t> vision_feature_layer;
|
||||
std::vector<int32_t> vision_feature_layer;
|
||||
int32_t attn_window_size = 0;
|
||||
int32_t n_wa_pattern = 0;
|
||||
std::unordered_set<int32_t> wa_layer_indexes; // explicit layer indexes that use full attention (for irregular patterns like YoutuVL)
|
||||
@@ -101,6 +102,11 @@ struct clip_hparams {
|
||||
int32_t sam_n_head = 0;
|
||||
int32_t sam_n_embd = 0;
|
||||
|
||||
// Granite4 Vision
|
||||
std::vector<int32_t> proj_spatial_offsets;
|
||||
int32_t downsample_query_side;
|
||||
int32_t downsample_window_side;
|
||||
|
||||
// audio
|
||||
int32_t n_mel_bins = 0; // whisper preprocessor
|
||||
int32_t proj_stack_factor = 0; // ultravox
|
||||
@@ -158,6 +164,10 @@ struct clip_hparams {
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
bool is_vision_feature_layer(int32_t layer) const {
|
||||
return std::find(vision_feature_layer.begin(), vision_feature_layer.end(), layer) != vision_feature_layer.end();
|
||||
}
|
||||
};
|
||||
|
||||
struct clip_layer {
|
||||
@@ -325,6 +335,20 @@ struct yasa2_stage {
|
||||
std::vector<yasa2_block> blocks;
|
||||
};
|
||||
|
||||
// QFormer projector block for models with 1 (or more) QFormer projectors
|
||||
// Granite Speech, Granite4 Vision
|
||||
struct qf_block {
|
||||
ggml_tensor * qf_proj_query = nullptr;
|
||||
ggml_tensor * qf_proj_norm_w = nullptr;
|
||||
ggml_tensor * qf_proj_norm_b = nullptr;
|
||||
ggml_tensor * qf_proj_linear_w = nullptr;
|
||||
ggml_tensor * qf_proj_linear_b = nullptr;
|
||||
ggml_tensor * qf_proj_post_norm_w = nullptr;
|
||||
ggml_tensor * qf_proj_post_norm_b = nullptr;
|
||||
ggml_tensor * qf_proj_img_pos = nullptr; // Vision only
|
||||
std::vector<clip_layer> qf_proj_layers;
|
||||
};
|
||||
|
||||
struct clip_model {
|
||||
clip_modality modality = CLIP_MODALITY_VISION;
|
||||
projector_type proj_type = PROJECTOR_TYPE_MLP;
|
||||
@@ -589,13 +613,8 @@ struct clip_model {
|
||||
ggml_tensor * ctc_out_b = nullptr;
|
||||
ggml_tensor * ctc_out_mid_w = nullptr;
|
||||
ggml_tensor * ctc_out_mid_b = nullptr;
|
||||
// qformer projector
|
||||
ggml_tensor * qf_proj_query = nullptr;
|
||||
ggml_tensor * qf_proj_norm_w = nullptr;
|
||||
ggml_tensor * qf_proj_norm_b = nullptr;
|
||||
ggml_tensor * qf_proj_linear_w = nullptr;
|
||||
ggml_tensor * qf_proj_linear_b = nullptr;
|
||||
std::vector<clip_layer> qf_proj_layers;
|
||||
// qformer projector(s)
|
||||
std::vector<qf_block> qf_proj_blocks;
|
||||
|
||||
bool audio_has_avgpool() const {
|
||||
return proj_type == PROJECTOR_TYPE_QWEN2A
|
||||
|
||||
@@ -997,6 +997,10 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
|
||||
{
|
||||
builder = std::make_unique<clip_graph_yasa2>(ctx, img);
|
||||
} break;
|
||||
case PROJECTOR_TYPE_GRANITE4_VISION:
|
||||
{
|
||||
builder = std::make_unique<clip_graph_granite4_vision>(ctx, img);
|
||||
} break;
|
||||
default:
|
||||
GGML_ABORT("missing cgraph builder");
|
||||
}
|
||||
@@ -1234,12 +1238,7 @@ struct clip_model_loader {
|
||||
// to form the final visual features.
|
||||
// NOTE: gguf conversions should standardize the values of the vision feature layer to
|
||||
// be non-negative, since we use -1 to mark values as unset here.
|
||||
std::vector<int> vision_feature_layer;
|
||||
get_arr_int(KEY_FEATURE_LAYER, vision_feature_layer, false);
|
||||
// convert std::vector to std::unordered_set
|
||||
for (auto & layer : vision_feature_layer) {
|
||||
hparams.vision_feature_layer.insert(layer);
|
||||
}
|
||||
get_arr_int(KEY_FEATURE_LAYER, hparams.vision_feature_layer, false);
|
||||
|
||||
// model-specific params
|
||||
switch (model.proj_type) {
|
||||
@@ -1627,6 +1626,23 @@ struct clip_model_loader {
|
||||
hparams.image_pad_color = {127, 127, 127};
|
||||
hparams.image_resize_algo = RESIZE_ALGO_BILINEAR;
|
||||
} break;
|
||||
case PROJECTOR_TYPE_GRANITE4_VISION:
|
||||
{
|
||||
// SigLIP tower.
|
||||
hparams.image_resize_algo = RESIZE_ALGO_BICUBIC_PILLOW;
|
||||
hparams.image_resize_pad = PAD_CEIL;
|
||||
|
||||
get_arr_int(KEY_FEATURE_LAYER, hparams.vision_feature_layer);
|
||||
get_arr_int(KEY_PROJ_SPATIAL_OFFSETS, hparams.proj_spatial_offsets);
|
||||
if (hparams.vision_feature_layer.size() != hparams.proj_spatial_offsets.size()) {
|
||||
throw std::runtime_error(string_format("%s: vision_feature_layer.size() %d != proj_spatial_offsets.size() %d",
|
||||
hparams.vision_feature_layer.size(), hparams.proj_spatial_offsets.size()));
|
||||
}
|
||||
|
||||
get_u32(KEY_PROJ_SAMPLE_QUERY_SIDE, hparams.downsample_query_side);
|
||||
get_u32(KEY_PROJ_SAMPLE_WINDOW_SIDE, hparams.downsample_window_side);
|
||||
hparams.warmup_image_size = hparams.image_size;
|
||||
} break;
|
||||
default:
|
||||
throw std::runtime_error(string_format("%s: unknown vision projector type %s\n", __func__, proj_type.c_str()));
|
||||
}
|
||||
@@ -2628,47 +2644,106 @@ struct clip_model_loader {
|
||||
layer.conv_pw2_b = get_tensor(string_format(TN_CONV_PW2, prefix, il, "bias"));
|
||||
}
|
||||
|
||||
model.qf_proj_query = get_tensor(TN_QF_PROJ_QUERY);
|
||||
model.qf_proj_norm_w = get_tensor(string_format(TN_QF_PROJ_NORM, "weight"));
|
||||
model.qf_proj_norm_b = get_tensor(string_format(TN_QF_PROJ_NORM, "bias"));
|
||||
model.qf_proj_linear_w = get_tensor(string_format(TN_QF_PROJ_LINEAR, "weight"));
|
||||
model.qf_proj_linear_b = get_tensor(string_format(TN_QF_PROJ_LINEAR, "bias"));
|
||||
model.qf_proj_blocks.resize(1);
|
||||
auto & qf = model.qf_proj_blocks[0];
|
||||
qf.qf_proj_query = get_tensor(string_format(TN_QF_PROJ_QUERY, prefix));
|
||||
qf.qf_proj_norm_w = get_tensor(string_format(TN_QF_PROJ_NORM, prefix, "weight"));
|
||||
qf.qf_proj_norm_b = get_tensor(string_format(TN_QF_PROJ_NORM, prefix, "bias"));
|
||||
qf.qf_proj_linear_w = get_tensor(string_format(TN_QF_PROJ_LINEAR, prefix, "weight"));
|
||||
qf.qf_proj_linear_b = get_tensor(string_format(TN_QF_PROJ_LINEAR, prefix, "bias"));
|
||||
|
||||
const int n_proj_layers = 2;
|
||||
model.qf_proj_layers.resize(n_proj_layers);
|
||||
qf.qf_proj_layers.resize(n_proj_layers);
|
||||
for (int il = 0; il < n_proj_layers; ++il) {
|
||||
auto & pl = model.qf_proj_layers[il];
|
||||
auto & pl = qf.qf_proj_layers[il];
|
||||
|
||||
pl.q_w = get_tensor(string_format(TN_QF_SELF_ATTN_Q, il, "weight"));
|
||||
pl.q_b = get_tensor(string_format(TN_QF_SELF_ATTN_Q, il, "bias"));
|
||||
pl.k_w = get_tensor(string_format(TN_QF_SELF_ATTN_K, il, "weight"));
|
||||
pl.k_b = get_tensor(string_format(TN_QF_SELF_ATTN_K, il, "bias"));
|
||||
pl.v_w = get_tensor(string_format(TN_QF_SELF_ATTN_V, il, "weight"));
|
||||
pl.v_b = get_tensor(string_format(TN_QF_SELF_ATTN_V, il, "bias"));
|
||||
pl.o_w = get_tensor(string_format(TN_QF_SELF_ATTN_O, il, "weight"));
|
||||
pl.o_b = get_tensor(string_format(TN_QF_SELF_ATTN_O, il, "bias"));
|
||||
pl.ln_1_w = get_tensor(string_format(TN_QF_SELF_ATTN_N, il, "weight"));
|
||||
pl.ln_1_b = get_tensor(string_format(TN_QF_SELF_ATTN_N, il, "bias"));
|
||||
pl.q_w = get_tensor(string_format(TN_QF_SELF_ATTN_Q, prefix, il, "weight"));
|
||||
pl.q_b = get_tensor(string_format(TN_QF_SELF_ATTN_Q, prefix, il, "bias"));
|
||||
pl.k_w = get_tensor(string_format(TN_QF_SELF_ATTN_K, prefix, il, "weight"));
|
||||
pl.k_b = get_tensor(string_format(TN_QF_SELF_ATTN_K, prefix, il, "bias"));
|
||||
pl.v_w = get_tensor(string_format(TN_QF_SELF_ATTN_V, prefix, il, "weight"));
|
||||
pl.v_b = get_tensor(string_format(TN_QF_SELF_ATTN_V, prefix, il, "bias"));
|
||||
pl.o_w = get_tensor(string_format(TN_QF_SELF_ATTN_O, prefix, il, "weight"));
|
||||
pl.o_b = get_tensor(string_format(TN_QF_SELF_ATTN_O, prefix, il, "bias"));
|
||||
pl.ln_1_w = get_tensor(string_format(TN_QF_SELF_ATTN_N, prefix, il, "weight"));
|
||||
pl.ln_1_b = get_tensor(string_format(TN_QF_SELF_ATTN_N, prefix, il, "bias"));
|
||||
|
||||
pl.cross_attn_q_w = get_tensor(string_format(TN_QF_CROSS_ATTN_Q, il, "weight"));
|
||||
pl.cross_attn_q_b = get_tensor(string_format(TN_QF_CROSS_ATTN_Q, il, "bias"));
|
||||
pl.cross_attn_k_w = get_tensor(string_format(TN_QF_CROSS_ATTN_K, il, "weight"));
|
||||
pl.cross_attn_k_b = get_tensor(string_format(TN_QF_CROSS_ATTN_K, il, "bias"));
|
||||
pl.cross_attn_v_w = get_tensor(string_format(TN_QF_CROSS_ATTN_V, il, "weight"));
|
||||
pl.cross_attn_v_b = get_tensor(string_format(TN_QF_CROSS_ATTN_V, il, "bias"));
|
||||
pl.cross_attn_o_w = get_tensor(string_format(TN_QF_CROSS_ATTN_O, il, "weight"));
|
||||
pl.cross_attn_o_b = get_tensor(string_format(TN_QF_CROSS_ATTN_O, il, "bias"));
|
||||
pl.cross_attn_norm_w = get_tensor(string_format(TN_QF_CROSS_ATTN_N, il, "weight"));
|
||||
pl.cross_attn_norm_b = get_tensor(string_format(TN_QF_CROSS_ATTN_N, il, "bias"));
|
||||
pl.cross_attn_q_w = get_tensor(string_format(TN_QF_CROSS_ATTN_Q, prefix, il, "weight"));
|
||||
pl.cross_attn_q_b = get_tensor(string_format(TN_QF_CROSS_ATTN_Q, prefix, il, "bias"));
|
||||
pl.cross_attn_k_w = get_tensor(string_format(TN_QF_CROSS_ATTN_K, prefix, il, "weight"));
|
||||
pl.cross_attn_k_b = get_tensor(string_format(TN_QF_CROSS_ATTN_K, prefix, il, "bias"));
|
||||
pl.cross_attn_v_w = get_tensor(string_format(TN_QF_CROSS_ATTN_V, prefix, il, "weight"));
|
||||
pl.cross_attn_v_b = get_tensor(string_format(TN_QF_CROSS_ATTN_V, prefix, il, "bias"));
|
||||
pl.cross_attn_o_w = get_tensor(string_format(TN_QF_CROSS_ATTN_O, prefix, il, "weight"));
|
||||
pl.cross_attn_o_b = get_tensor(string_format(TN_QF_CROSS_ATTN_O, prefix, il, "bias"));
|
||||
pl.cross_attn_norm_w = get_tensor(string_format(TN_QF_CROSS_ATTN_N, prefix, il, "weight"));
|
||||
pl.cross_attn_norm_b = get_tensor(string_format(TN_QF_CROSS_ATTN_N, prefix, il, "bias"));
|
||||
|
||||
pl.ff_up_w = get_tensor(string_format(TN_QF_FFN_UP, il, "weight"));
|
||||
pl.ff_up_b = get_tensor(string_format(TN_QF_FFN_UP, il, "bias"));
|
||||
pl.ff_down_w = get_tensor(string_format(TN_QF_FFN_DOWN, il, "weight"));
|
||||
pl.ff_down_b = get_tensor(string_format(TN_QF_FFN_DOWN, il, "bias"));
|
||||
pl.ln_2_w = get_tensor(string_format(TN_QF_FFN_NORM, il, "weight"));
|
||||
pl.ln_2_b = get_tensor(string_format(TN_QF_FFN_NORM, il, "bias"));
|
||||
pl.ff_up_w = get_tensor(string_format(TN_QF_FFN_UP, prefix, il, "weight"));
|
||||
pl.ff_up_b = get_tensor(string_format(TN_QF_FFN_UP, prefix, il, "bias"));
|
||||
pl.ff_down_w = get_tensor(string_format(TN_QF_FFN_DOWN, prefix, il, "weight"));
|
||||
pl.ff_down_b = get_tensor(string_format(TN_QF_FFN_DOWN, prefix, il, "bias"));
|
||||
pl.ln_2_w = get_tensor(string_format(TN_QF_FFN_NORM, prefix, il, "weight"));
|
||||
pl.ln_2_b = get_tensor(string_format(TN_QF_FFN_NORM, prefix, il, "bias"));
|
||||
}
|
||||
} break;
|
||||
case PROJECTOR_TYPE_GRANITE4_VISION:
|
||||
{
|
||||
// image_newline lives at the top-level.
|
||||
model.image_newline = get_tensor(TN_IMAGE_NEWLINE);
|
||||
|
||||
// Load separate layerwise and spatial projector tensors
|
||||
const auto projector_count = hparams.vision_feature_layer.size();
|
||||
model.qf_proj_blocks.resize(projector_count);
|
||||
for (size_t bid = 0; bid < projector_count; ++bid) {
|
||||
auto & b = model.qf_proj_blocks[bid];
|
||||
|
||||
// non-layerwise tensors
|
||||
b.qf_proj_img_pos = get_tensor(string_format(TN_MULTI_PROJ_IMG_POS, bid));
|
||||
b.qf_proj_query = get_tensor(string_format(TN_MULTI_PROJ_QUERY, prefix, bid));
|
||||
b.qf_proj_linear_w = get_tensor(string_format(TN_MULTI_PROJ_LINEAR, prefix, bid, "weight"));
|
||||
b.qf_proj_linear_b = get_tensor(string_format(TN_MULTI_PROJ_LINEAR, prefix, bid, "bias"));
|
||||
b.qf_proj_norm_w = get_tensor(string_format(TN_MULTI_PROJ_NORM, prefix, bid, "weight"));
|
||||
b.qf_proj_norm_b = get_tensor(string_format(TN_MULTI_PROJ_NORM, prefix, bid, "bias"));
|
||||
b.qf_proj_post_norm_w = get_tensor(string_format(TN_MULTI_PROJ_POST_NORM, prefix, bid, "weight"));
|
||||
b.qf_proj_post_norm_b = get_tensor(string_format(TN_MULTI_PROJ_POST_NORM, prefix, bid, "bias"));
|
||||
|
||||
// laywerwise tensors
|
||||
// NOTE: If any model uses multi-layer qformers, this will need to change
|
||||
b.qf_proj_layers.resize(1);
|
||||
auto & pl = b.qf_proj_layers[0];
|
||||
|
||||
pl.q_w = get_tensor(string_format(TN_QF_SELF_ATTN_Q, prefix, bid, "weight"));
|
||||
pl.q_b = get_tensor(string_format(TN_QF_SELF_ATTN_Q, prefix, bid, "bias"));
|
||||
pl.k_w = get_tensor(string_format(TN_QF_SELF_ATTN_K, prefix, bid, "weight"));
|
||||
pl.k_b = get_tensor(string_format(TN_QF_SELF_ATTN_K, prefix, bid, "bias"));
|
||||
pl.v_w = get_tensor(string_format(TN_QF_SELF_ATTN_V, prefix, bid, "weight"));
|
||||
pl.v_b = get_tensor(string_format(TN_QF_SELF_ATTN_V, prefix, bid, "bias"));
|
||||
pl.o_w = get_tensor(string_format(TN_QF_SELF_ATTN_O, prefix, bid, "weight"));
|
||||
pl.o_b = get_tensor(string_format(TN_QF_SELF_ATTN_O, prefix, bid, "bias"));
|
||||
pl.ln_1_w = get_tensor(string_format(TN_QF_SELF_ATTN_N, prefix, bid, "weight"));
|
||||
pl.ln_1_b = get_tensor(string_format(TN_QF_SELF_ATTN_N, prefix, bid, "bias"));
|
||||
|
||||
pl.cross_attn_q_w = get_tensor(string_format(TN_QF_CROSS_ATTN_Q, prefix, bid, "weight"));
|
||||
pl.cross_attn_q_b = get_tensor(string_format(TN_QF_CROSS_ATTN_Q, prefix, bid, "bias"));
|
||||
pl.cross_attn_k_w = get_tensor(string_format(TN_QF_CROSS_ATTN_K, prefix, bid, "weight"));
|
||||
pl.cross_attn_k_b = get_tensor(string_format(TN_QF_CROSS_ATTN_K, prefix, bid, "bias"));
|
||||
pl.cross_attn_v_w = get_tensor(string_format(TN_QF_CROSS_ATTN_V, prefix, bid, "weight"));
|
||||
pl.cross_attn_v_b = get_tensor(string_format(TN_QF_CROSS_ATTN_V, prefix, bid, "bias"));
|
||||
pl.cross_attn_o_w = get_tensor(string_format(TN_QF_CROSS_ATTN_O, prefix, bid, "weight"));
|
||||
pl.cross_attn_o_b = get_tensor(string_format(TN_QF_CROSS_ATTN_O, prefix, bid, "bias"));
|
||||
pl.cross_attn_norm_w = get_tensor(string_format(TN_QF_CROSS_ATTN_N, prefix, bid, "weight"));
|
||||
pl.cross_attn_norm_b = get_tensor(string_format(TN_QF_CROSS_ATTN_N, prefix, bid, "bias"));
|
||||
|
||||
pl.ff_up_w = get_tensor(string_format(TN_QF_FFN_UP, prefix, bid, "weight"));
|
||||
pl.ff_up_b = get_tensor(string_format(TN_QF_FFN_UP, prefix, bid, "bias"));
|
||||
pl.ff_down_w = get_tensor(string_format(TN_QF_FFN_DOWN, prefix, bid, "weight"));
|
||||
pl.ff_down_b = get_tensor(string_format(TN_QF_FFN_DOWN, prefix, bid, "bias"));
|
||||
pl.ln_2_w = get_tensor(string_format(TN_QF_FFN_NORM, prefix, bid, "weight"));
|
||||
pl.ln_2_b = get_tensor(string_format(TN_QF_FFN_NORM, prefix, bid, "bias"));
|
||||
}
|
||||
|
||||
} break;
|
||||
default:
|
||||
GGML_ASSERT(false && "unknown projector type");
|
||||
}
|
||||
@@ -3085,10 +3160,6 @@ void clip_build_img_from_pixels(const unsigned char * rgb_pixels, int nx, int ny
|
||||
memcpy(img->buf.data(), rgb_pixels, img->buf.size());
|
||||
}
|
||||
|
||||
ggml_tensor * clip_get_newline_tensor(const struct clip_ctx * ctx) {
|
||||
return ctx->model.image_newline;
|
||||
}
|
||||
|
||||
void clip_free(clip_ctx * ctx) {
|
||||
if (ctx == nullptr) {
|
||||
return;
|
||||
@@ -3397,6 +3468,23 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
|
||||
const int ds = ctx->model.hparams.audio_proj_downsample_rate;
|
||||
n_patches = ((img->nx + ws - 1) / ws) * (ws / ds);
|
||||
} break;
|
||||
case PROJECTOR_TYPE_GRANITE4_VISION:
|
||||
{
|
||||
// Per-tile output token count: each projector block outputs
|
||||
// query_side^2 tokens per window × n^2 windows.
|
||||
// For 384×384 input: n = 24/8 = 3, query_side = 4 → 144.
|
||||
const int window_side = ctx->model.hparams.downsample_window_side;
|
||||
const int query_side = ctx->model.hparams.downsample_query_side;
|
||||
const int side = img->nx / params.patch_size;
|
||||
const int n = side / window_side;
|
||||
n_patches = (query_side * n) * (query_side * n);
|
||||
if (img->add_newline) {
|
||||
// For single-tile case: append 1 newline row.
|
||||
// For multi-tile rowwise: handled by caller, but here we
|
||||
// report the per-tile count including one trailing newline.
|
||||
n_patches += 1;
|
||||
}
|
||||
} break;
|
||||
default:
|
||||
GGML_ABORT("unsupported projector type");
|
||||
}
|
||||
@@ -4229,6 +4317,82 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
|
||||
set_input_f32("attn_mask", mask);
|
||||
}
|
||||
} break;
|
||||
case PROJECTOR_TYPE_GRANITE4_VISION:
|
||||
{
|
||||
// Granite Vision 4.1 uses precomputed permutation index
|
||||
// tensors to express the _win / _unwin / spatial sampling
|
||||
// reshapes as ggml_get_rows gathers. The names are set
|
||||
// by clip_graph_granite4_vision::gather() in models/granite4-vision.cpp.
|
||||
const int patch_size = model.hparams.patch_size;
|
||||
const int image_side = imgs.entries.front()->nx / patch_size;
|
||||
const int window_side = hparams.downsample_window_side;
|
||||
const int query_side = hparams.downsample_query_side;
|
||||
const int n = image_side / window_side;
|
||||
const int new_side = n * query_side;
|
||||
|
||||
// Builds the raster→window permutation indices for a
|
||||
// (side, side) grid split into (n × n) windows of (win × win)
|
||||
// tokens each. dst[w * win*win + p] = source raster index.
|
||||
auto make_win_idx = [](int side, int win) {
|
||||
const int nn = side / win;
|
||||
std::vector<int32_t> idx(static_cast<size_t>(side) * side);
|
||||
for (int wy = 0; wy < nn; ++wy) {
|
||||
for (int wx = 0; wx < nn; ++wx) {
|
||||
for (int iy = 0; iy < win; ++iy) {
|
||||
for (int ix = 0; ix < win; ++ix) {
|
||||
const int w = wy * nn + wx;
|
||||
const int p = iy * win + ix;
|
||||
const int y = wy * win + iy;
|
||||
const int x = wx * win + ix;
|
||||
idx[static_cast<size_t>(w) * (win*win) + p] = y * side + x;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return idx;
|
||||
};
|
||||
|
||||
auto make_unwin_idx = [&](int side, int win) {
|
||||
const std::vector<int32_t> fwd = make_win_idx(side, win);
|
||||
std::vector<int32_t> inv(fwd.size());
|
||||
for (size_t i = 0; i < fwd.size(); ++i) {
|
||||
inv[fwd[i]] = static_cast<int32_t>(i);
|
||||
}
|
||||
return inv;
|
||||
};
|
||||
|
||||
auto make_spatial_idx = [](int side, int offset) {
|
||||
const int off_y = (offset >> 1) & 1;
|
||||
const int off_x = offset & 1;
|
||||
const int new_s = side / 2;
|
||||
std::vector<int32_t> idx(static_cast<size_t>(new_s) * new_s);
|
||||
for (int y = 0; y < new_s; ++y) {
|
||||
for (int x = 0; x < new_s; ++x) {
|
||||
idx[y * new_s + x] = (y * 2 + off_y) * side + (x * 2 + off_x);
|
||||
}
|
||||
}
|
||||
return idx;
|
||||
};
|
||||
|
||||
auto upload = [&](const std::string & name, const std::vector<int32_t> & idx) {
|
||||
ggml_tensor * t = ggml_graph_get_tensor(gf, name.c_str());
|
||||
GGML_ASSERT(t);
|
||||
ggml_backend_tensor_set(t, idx.data(), 0, idx.size() * sizeof(int32_t));
|
||||
};
|
||||
|
||||
// Upload the permutation indices for every projector block; each
|
||||
// block reads its own "g4v_blk<bid>_*" index tensors from the graph.
|
||||
for (size_t bid = 0; bid < hparams.vision_feature_layer.size(); ++bid) {
|
||||
const std::string prefix = "g4v_blk" + std::to_string(bid) + "_";
|
||||
upload(prefix + "win_idx", make_win_idx(image_side, window_side));
|
||||
upload(prefix + "qwin_idx", make_win_idx(new_side, query_side));
|
||||
upload(prefix + "unwin_idx", make_unwin_idx(new_side, query_side));
|
||||
const auto spatial_offset = hparams.proj_spatial_offsets[bid];
|
||||
if (spatial_offset >= 0) {
|
||||
upload(prefix + "spatial_idx", make_spatial_idx(image_side,spatial_offset));
|
||||
}
|
||||
}
|
||||
} break;
|
||||
default:
|
||||
GGML_ABORT("Unknown projector type");
|
||||
}
|
||||
@@ -4384,7 +4548,9 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
|
||||
case PROJECTOR_TYPE_LFM2A:
|
||||
return ctx->model.position_embeddings->ne[0];
|
||||
case PROJECTOR_TYPE_GRANITE_SPEECH:
|
||||
return ctx->model.qf_proj_linear_w->ne[1];
|
||||
return ctx->model.qf_proj_blocks[0].qf_proj_linear_w->ne[1];
|
||||
case PROJECTOR_TYPE_GRANITE4_VISION:
|
||||
return ctx->model.qf_proj_blocks.size() * ctx->model.hparams.projection_dim;
|
||||
case PROJECTOR_TYPE_GLM4V:
|
||||
return ctx->model.mm_ffn_down_w->ne[1];
|
||||
default:
|
||||
|
||||
@@ -100,8 +100,6 @@ struct clip_image_f32 * clip_image_f32_get_img(const struct clip_image_f32_batch
|
||||
*/
|
||||
void clip_build_img_from_pixels(const unsigned char * rgb_pixels, int nx, int ny, struct clip_image_u8 * img);
|
||||
|
||||
struct ggml_tensor * clip_get_newline_tensor(const struct clip_ctx * ctx);
|
||||
|
||||
bool clip_image_encode (struct clip_ctx * ctx, int n_threads, struct clip_image_f32 * img, float * vec);
|
||||
bool clip_image_batch_encode(struct clip_ctx * ctx, int n_threads, const struct clip_image_f32_batch * imgs, float * vec);
|
||||
|
||||
|
||||
@@ -199,8 +199,8 @@ ggml_cgraph * clip_graph_granite_speech::build() {
|
||||
|
||||
ggml_tensor * enc_windows = ggml_reshape_3d(ctx0, cur, n_embd, window_size, nblocks_proj);
|
||||
|
||||
ggml_tensor * queries = build_norm(model.qf_proj_query,
|
||||
model.qf_proj_norm_w, model.qf_proj_norm_b,
|
||||
ggml_tensor * queries = build_norm(model.qf_proj_blocks[0].qf_proj_query,
|
||||
model.qf_proj_blocks[0].qf_proj_norm_w, model.qf_proj_blocks[0].qf_proj_norm_b,
|
||||
NORM_TYPE_NORMAL, proj_eps, -1);
|
||||
{
|
||||
ggml_tensor * q_3d = ggml_reshape_3d(ctx0, queries, n_embd, num_queries, 1);
|
||||
@@ -209,8 +209,8 @@ ggml_cgraph * clip_graph_granite_speech::build() {
|
||||
queries = ggml_repeat(ctx0, q_3d, q_shape);
|
||||
}
|
||||
|
||||
for (int il = 0; il < (int)model.qf_proj_layers.size(); il++) {
|
||||
const auto & pl = model.qf_proj_layers[il];
|
||||
for (int il = 0; il < (int)model.qf_proj_blocks[0].qf_proj_layers.size(); il++) {
|
||||
const auto & pl = model.qf_proj_blocks[0].qf_proj_layers[il];
|
||||
|
||||
// self-attention
|
||||
{
|
||||
@@ -265,7 +265,7 @@ ggml_cgraph * clip_graph_granite_speech::build() {
|
||||
}
|
||||
|
||||
cur = ggml_reshape_2d(ctx0, queries, n_embd, num_queries * nblocks_proj);
|
||||
cur = ggml_add(ctx0, build_mm(model.qf_proj_linear_w, cur), model.qf_proj_linear_b);
|
||||
cur = ggml_add(ctx0, build_mm(model.qf_proj_blocks[0].qf_proj_linear_w, cur), model.qf_proj_blocks[0].qf_proj_linear_b);
|
||||
cb(cur, "projector_out", -1);
|
||||
}
|
||||
|
||||
|
||||
339
tools/mtmd/models/granite4-vision.cpp
Normal file
339
tools/mtmd/models/granite4-vision.cpp
Normal file
@@ -0,0 +1,339 @@
|
||||
#include "models.h"
|
||||
#include "../clip-impl.h"
|
||||
#include "../clip-model.h"
|
||||
|
||||
#include <algorithm>
|
||||
#include <cmath>
|
||||
#include <cstring>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
/*
|
||||
* Granite Vision 4.1 clip graph
|
||||
*
|
||||
* Stage 1a: SigLIP vision tower (N layers, post-norm)
|
||||
* Stage 1b: WindowQFormer blocks (deepstack + spatial)
|
||||
* Stage 1c: Concatenate and pack outputs
|
||||
* Stage 1d: Append newline tokens if add_newline is set
|
||||
*/
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Member method implementations
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
// Gather rows of `src` through a named index tensor.
//
// A 1-D I32 tensor with `idx_len` elements is created in the graph context,
// given `name`, and flagged as a graph input. Its contents are not written
// here; they have to be supplied before the graph is computed. The returned
// node is ggml_get_rows(src, <index tensor>).
ggml_tensor * clip_graph_granite4_vision::gather(
        ggml_tensor *       src,
        const std::string & name,
        int                 idx_len) {
    ggml_tensor * row_ids = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, idx_len);
    ggml_set_input(row_ids);
    ggml_set_name(row_ids, name.c_str());

    ggml_tensor * picked = ggml_get_rows(ctx0, src, row_ids);
    return picked;
}
|
||||
|
||||
// Downsample a (side x side) token grid to (new_side x new_side) by average
// pooling with kernel == stride == side / new_side.
//
// `src` is viewed as (n_embd, side, side, 1), permuted so the two spatial axes
// come first (the layout ggml_pool_2d pools over), pooled, permuted back, and
// flattened to (n_embd, new_side * new_side).
//
// Aborts if `new_side` is not in (0, side]: that would either divide by zero
// or produce a zero-sized pooling kernel.
ggml_tensor * clip_graph_granite4_vision::interp_down(
        ggml_tensor * src,
        int           side,
        int           new_side) {
    GGML_ASSERT(new_side > 0 && new_side <= side);

    const int n_embd = src->ne[0];
    const int kernel = side / new_side;

    // (n_embd, side, side, 1) -> (side, side, n_embd, 1)
    ggml_tensor * t = ggml_reshape_4d(ctx0, src, n_embd, side, side, 1);
    t = ggml_cont(ctx0, ggml_permute(ctx0, t, 2, 0, 1, 3));

    t = ggml_pool_2d(ctx0, t, GGML_OP_POOL_AVG, kernel, kernel, kernel, kernel, 0, 0);

    // (new_side, new_side, n_embd, 1) -> (n_embd, new_side, new_side, 1)
    t = ggml_cont(ctx0, ggml_permute(ctx0, t, 1, 2, 0, 3));
    return ggml_reshape_2d(ctx0, t, n_embd, new_side * new_side);
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// build_block - WindowQFormer block implementation
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
// Build one WindowQFormer projector block.
//
// Parameters:
//   blk            - weights of this projector block (exactly one QFormer layer)
//   h              - vision hidden state, (n_embd, image_side * image_side)
//   bid            - block index; used for tensor names ("g4v_blk<bid>_*")
//   spatial_offset - >= 0: downsample by gathering through the
//                    "g4v_blk<bid>_spatial_idx" input tensor;
//                    <  0: downsample by average pooling (interp_down)
//   image_side     - side length of the input token grid
//   window_side    - side length of one encoder window
//   query_side     - side length of one query window
//   qformer_eps    - layer-norm epsilon used inside the QFormer
//
// Returns the block output after the final linear projection, one column per
// downsampled token in raster order.
//
// The *_idx input tensors created here are only declared; their contents are
// filled in at encode time.
ggml_tensor * clip_graph_granite4_vision::build_block(
        const qf_block & blk,
        ggml_tensor *    h,
        int              bid,
        int              spatial_offset,
        int              image_side,
        int              window_side,
        int              query_side,
        float            qformer_eps) {

    const int n_embd = h->ne[0];
    GGML_ASSERT(h->ne[1] == image_side * image_side);

    // The grid must split into whole windows, otherwise the window gathers
    // and the reshapes below do not cover the same number of tokens.
    GGML_ASSERT(window_side > 0 && query_side > 0);
    GGML_ASSERT(image_side >= window_side && image_side % window_side == 0);

    const int n         = image_side / window_side;
    const int new_side  = n * query_side;
    const int n_windows = n * n;
    const int enc_len   = window_side * window_side;
    const int query_len = query_side * query_side;

    // Attention geometry shared by self- and cross-attention.
    const int   d_h    = 64;
    GGML_ASSERT(n_embd % d_h == 0);
    const int   n_head = n_embd / d_h;
    const float scale  = 1.0f / std::sqrt((float) d_h);

    const std::string tag = "g4v_blk" + std::to_string(bid) + "_";

    auto cbx = [&](ggml_tensor * t, const char * step) {
        ggml_set_name(t, (tag + step).c_str());
    };

    // 1. Top-level LN
    cbx(h, "inp");
    ggml_tensor * x = build_norm(h, blk.qf_proj_norm_w, blk.qf_proj_norm_b, NORM_TYPE_NORMAL, eps, bid);
    cbx(x, "norm");

    // 2. enc = _win(x, image_side, window_side)
    ggml_tensor * enc;
    {
        ggml_tensor * enc_flat = gather(x, tag + "win_idx", image_side * image_side);
        enc = ggml_reshape_3d(ctx0, enc_flat, n_embd, enc_len, n_windows);
    }
    cbx(enc, "enc");

    // 3. downsampled = downsampler(x)
    ggml_tensor * d;
    if (spatial_offset >= 0) {
        // The spatial index vector uploaded at encode time holds
        // (image_side / 2)^2 entries, so the gather length must agree.
        GGML_ASSERT(new_side == image_side / 2);
        d = gather(x, tag + "spatial_idx", new_side * new_side);
    } else {
        d = interp_down(x, image_side, new_side);
    }
    cbx(d, "downsampled");

    // 4. query_embeds = query + _win(d, new_side, query_side)
    ggml_tensor * q_in;
    {
        ggml_tensor * dw_flat = gather(d, tag + "qwin_idx", new_side * new_side);
        ggml_tensor * dw      = ggml_reshape_3d(ctx0, dw_flat, n_embd, query_len, n_windows);
        q_in = ggml_add(ctx0, dw, blk.qf_proj_query);
    }
    cbx(q_in, "query_embeds");

    // 5. encoder_embeds = enc + image_positions -> (C, enc_len, n_windows)
    ggml_tensor * e_in = ggml_add(ctx0, enc, blk.qf_proj_img_pos);
    cbx(e_in, "encoder_embeds");

    // 6. Qformer forward.
    ggml_tensor * q = build_norm(q_in, blk.qf_proj_post_norm_w, blk.qf_proj_post_norm_b, NORM_TYPE_NORMAL, qformer_eps, bid);

    // Linear projection applied to all windows at once: the window axis is
    // folded into the token axis before the matmul.
    auto linear = [&](ggml_tensor * in, ggml_tensor * w, ggml_tensor * b) -> ggml_tensor * {
        ggml_tensor * t = ggml_reshape_2d(ctx0, in, in->ne[0], in->ne[1] * in->ne[2]);
        t = build_mm(w, t);
        if (b) {
            t = ggml_add(ctx0, t, b);
        }
        return t;
    };

    // Get the single QFormer layer
    GGML_ASSERT(blk.qf_proj_layers.size() == 1);
    const auto & pl = blk.qf_proj_layers[0];

    // 6a. Self-attention (queries attend to queries within each window)
    ggml_tensor * sa_out;
    {
        const int nq = q->ne[1];

        ggml_tensor * Q = linear(q, pl.q_w, pl.q_b);
        ggml_tensor * K = linear(q, pl.k_w, pl.k_b);
        ggml_tensor * V = linear(q, pl.v_w, pl.v_b);

        Q = ggml_reshape_4d(ctx0, Q, d_h, n_head, nq, n_windows);
        K = ggml_reshape_4d(ctx0, K, d_h, n_head, nq, n_windows);
        V = ggml_reshape_4d(ctx0, V, d_h, n_head, nq, n_windows);

        sa_out = build_attn(pl.o_w, pl.o_b, Q, K, V, nullptr, scale, bid);
        sa_out = ggml_reshape_3d(ctx0, sa_out, n_embd, nq, n_windows);

        // residual + post-norm
        sa_out = ggml_add(ctx0, sa_out, q);
        sa_out = build_norm(sa_out, pl.ln_1_w, pl.ln_1_b,
                            NORM_TYPE_NORMAL, qformer_eps, bid);
    }
    cbx(sa_out, "sa_out");

    // 6b. Cross-attention (queries attend to the window's encoder tokens)
    ggml_tensor * ca_out;
    {
        const int nq  = sa_out->ne[1];
        const int nkv = e_in->ne[1];

        ggml_tensor * Q = linear(sa_out, pl.cross_attn_q_w, pl.cross_attn_q_b);
        ggml_tensor * K = linear(e_in,   pl.cross_attn_k_w, pl.cross_attn_k_b);
        ggml_tensor * V = linear(e_in,   pl.cross_attn_v_w, pl.cross_attn_v_b);

        Q = ggml_reshape_4d(ctx0, Q, d_h, n_head, nq,  n_windows);
        K = ggml_reshape_4d(ctx0, K, d_h, n_head, nkv, n_windows);
        V = ggml_reshape_4d(ctx0, V, d_h, n_head, nkv, n_windows);

        ca_out = build_attn(pl.cross_attn_o_w, pl.cross_attn_o_b,
                            Q, K, V, nullptr, scale, bid);
        ca_out = ggml_reshape_3d(ctx0, ca_out, n_embd, nq, n_windows);

        // residual + post-norm
        ca_out = ggml_add(ctx0, ca_out, sa_out);
        ca_out = build_norm(ca_out, pl.cross_attn_norm_w, pl.cross_attn_norm_b,
                            NORM_TYPE_NORMAL, qformer_eps, bid);
    }
    cbx(ca_out, "ca_out");

    // 6c. FFN (up -> GELU(erf) -> down), residual + post-norm
    ggml_tensor * ffn;
    {
        ggml_tensor * t = ggml_reshape_2d(ctx0, ca_out, n_embd, query_len * n_windows);
        t = build_mm(pl.ff_up_w, t);
        if (pl.ff_up_b) {
            t = ggml_add(ctx0, t, pl.ff_up_b);
        }
        t = ggml_gelu_erf(ctx0, t);
        t = build_mm(pl.ff_down_w, t);
        if (pl.ff_down_b) {
            t = ggml_add(ctx0, t, pl.ff_down_b);
        }
        t = ggml_reshape_3d(ctx0, t, n_embd, query_len, n_windows);
        ffn = ggml_add(ctx0, t, ca_out);
        ffn = build_norm(ffn, pl.ln_2_w, pl.ln_2_b, NORM_TYPE_NORMAL, qformer_eps, bid);
    }
    cbx(ffn, "qformer_out");

    // 7. _unwin back to raster
    ggml_tensor * unwinned;
    {
        ggml_tensor * flat = ggml_reshape_2d(ctx0, ffn, n_embd, query_len * n_windows);
        unwinned = gather(flat, tag + "unwin_idx", new_side * new_side);
    }
    cbx(unwinned, "unwin");

    // 8. out_linear
    ggml_tensor * out = build_mm(blk.qf_proj_linear_w, unwinned);
    if (blk.qf_proj_linear_b) {
        out = ggml_add(ctx0, out, blk.qf_proj_linear_b);
    }
    cbx(out, "out");

    return out;
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// build() - top-level graph
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
// Build the K-tiled, base-scaled newline row tensor.
|
||||
// Shape: (n_mmproj_embd, 1)
|
||||
ggml_tensor * clip_graph_granite4_vision::build_newline_row(ggml_context * ctx0) {
|
||||
const int K = (int) model.qf_proj_blocks.size();
|
||||
GGML_ASSERT(K > 0);
|
||||
GGML_ASSERT(n_mmproj_embd % K == 0);
|
||||
const int projection_dim = n_mmproj_embd / K;
|
||||
GGML_ASSERT(model.image_newline != nullptr);
|
||||
GGML_ASSERT(ggml_nelements(model.image_newline) == projection_dim);
|
||||
|
||||
// Build newline_row[k*projection_dim + d] = nl[d] * (k == 0 ? base : 1.0)
|
||||
ggml_tensor * nl = model.image_newline; // (projection_dim,)
|
||||
ggml_tensor * nl_first_2d = ggml_reshape_2d(ctx0, nl, projection_dim, 1);
|
||||
ggml_tensor * nl_row_2d;
|
||||
if (K == 1) {
|
||||
nl_row_2d = nl_first_2d;
|
||||
} else {
|
||||
ggml_tensor * nl_2d = ggml_reshape_2d(ctx0, nl, projection_dim, 1);
|
||||
ggml_tensor * rest_template = ggml_new_tensor_2d(
|
||||
ctx0, GGML_TYPE_F32, projection_dim, K - 1);
|
||||
ggml_tensor * nl_rest = ggml_repeat(ctx0, nl_2d, rest_template);
|
||||
nl_row_2d = ggml_concat(ctx0, nl_first_2d, nl_rest, 1); // (projection_dim, K)
|
||||
}
|
||||
nl_row_2d = ggml_cont(ctx0, nl_row_2d);
|
||||
return ggml_reshape_2d(ctx0, nl_row_2d, n_mmproj_embd, 1);
|
||||
}
|
||||
|
||||
// Append a single newline row at the end of the tile output.
|
||||
ggml_tensor * clip_graph_granite4_vision::append_rowwise_newlines(ggml_context * ctx0, ggml_tensor * tile_output) {
|
||||
// For the single-tile case, append one newline row at the end.
|
||||
// For the multi-tile rowwise case, this will be called per-tile
|
||||
// (though currently only the single-tile path uses it).
|
||||
ggml_tensor * nl_row = build_newline_row(ctx0);
|
||||
return ggml_concat(ctx0, tile_output, nl_row, 1);
|
||||
}
|
||||
|
||||
// Build the full per-tile graph:
//   1a. vision encoder, keeping every layer's output
//   1b. one WindowQFormer block per entry of hparams.vision_feature_layer,
//       each fed from the selected encoder layer
//   1c. concatenate the block outputs along the embedding dimension
//   1d. append a newline row when the tile was flagged with add_newline
//
// Aborts if required tensors are missing, if a feature layer index is out of
// range, or if the number of feature layers does not match the number of
// loaded projector blocks.
ggml_cgraph * clip_graph_granite4_vision::build() {
    GGML_ASSERT(model.patch_embeddings_0 != nullptr);
    GGML_ASSERT(model.position_embeddings != nullptr);
    GGML_ASSERT(model.class_embedding == nullptr);
    GGML_ASSERT(!model.qf_proj_blocks.empty());

    // --- Stage 1a: SigLIP encoder producing intermediate hidden states ---
    ggml_tensor * inp = build_inp();
    inp = ggml_add(ctx0, inp, model.position_embeddings);
    cb(inp, "pos_embed", -1);

    ggml_tensor * inpL = inp;
    std::vector<ggml_tensor *> layer_outs(n_layer, nullptr);

    for (int il = 0; il < n_layer; ++il) {
        const auto & layer = model.layers[il];
        ggml_tensor * cur = inpL;

        cur = build_norm(cur, layer.ln_1_w, layer.ln_1_b, NORM_TYPE_NORMAL, eps, il);

        // Self-attention
        ggml_tensor * Qcur = build_mm(layer.q_w, cur);
        if (layer.q_b) {
            Qcur = ggml_add(ctx0, Qcur, layer.q_b);
        }
        ggml_tensor * Kcur = build_mm(layer.k_w, cur);
        if (layer.k_b) {
            Kcur = ggml_add(ctx0, Kcur, layer.k_b);
        }
        ggml_tensor * Vcur = build_mm(layer.v_w, cur);
        if (layer.v_b) {
            Vcur = ggml_add(ctx0, Vcur, layer.v_b);
        }

        Qcur = ggml_reshape_3d(ctx0, Qcur, d_head, n_head, n_patches);
        Kcur = ggml_reshape_3d(ctx0, Kcur, d_head, n_head, n_patches);
        Vcur = ggml_reshape_3d(ctx0, Vcur, d_head, n_head, n_patches);

        cur = build_attn(layer.o_w, layer.o_b,
                         Qcur, Kcur, Vcur, nullptr, kq_scale, il);

        // residual around attention
        cur = ggml_add(ctx0, cur, inpL);
        inpL = cur;

        cur = build_norm(cur, layer.ln_2_w, layer.ln_2_b, NORM_TYPE_NORMAL, eps, il);
        cur = build_ffn(cur,
                        layer.ff_up_w,   layer.ff_up_b,
                        layer.ff_gate_w, layer.ff_gate_b,
                        layer.ff_down_w, layer.ff_down_b,
                        hparams.ffn_op, il);

        // residual around FFN
        cur = ggml_add(ctx0, inpL, cur);
        cb(cur, "layer_out", il);
        layer_outs[il] = cur;
        inpL = cur;
    }

    // --- Stage 1b/1c: WindowQFormer blocks ---
    const int   projector_count = (int) hparams.vision_feature_layer.size();
    const float qformer_eps     = 1e-12f;

    // Each feature layer is paired with the projector block of the same
    // index, and the reported embedding width is derived from the number of
    // projector blocks, so the two counts have to match.
    GGML_ASSERT(projector_count > 0);
    GGML_ASSERT(projector_count == (int) model.qf_proj_blocks.size());
    // NOTE(review): hparams.proj_spatial_offsets is indexed with the same bid
    // below; presumably it has one entry per projector - confirm at load time.

    ggml_tensor * mmproj = nullptr;
    for (int bid = 0; bid < projector_count; ++bid) {
        const auto & blk = model.qf_proj_blocks[bid];

        const int vlayer = hparams.vision_feature_layer[bid];
        GGML_ASSERT(vlayer >= 0 && vlayer < n_layer);
        ggml_tensor * h = layer_outs[vlayer];

        ggml_tensor * stream = build_block(
            blk, h, bid,
            hparams.proj_spatial_offsets[bid],
            n_patches_x,
            hparams.downsample_window_side,
            hparams.downsample_query_side,
            qformer_eps);
        cb(stream, (std::string("proj_") + std::to_string(bid) + std::string("_v_out")).c_str(), vlayer);

        // stack the block outputs along the embedding dimension
        mmproj = mmproj ? ggml_concat(ctx0, mmproj, stream, 0) : stream;
    }

    // --- Stage 1d: Append newline tokens if add_newline is set ---
    if (add_newline) {
        mmproj = append_rowwise_newlines(ctx0, mmproj);
        ggml_set_name(mmproj, "g4v_mmproj_out_nl");
    } else {
        ggml_set_name(mmproj, "g4v_mmproj_out");
    }
    ggml_build_forward_expand(gf, mmproj);

    return gf;
}
|
||||
@@ -51,7 +51,6 @@ ggml_cgraph * clip_graph_llava::build() {
|
||||
}
|
||||
|
||||
std::vector<ggml_tensor *> embedding_stack;
|
||||
const auto & vision_feature_layer = hparams.vision_feature_layer;
|
||||
|
||||
// loop over layers
|
||||
for (int il = 0; il < max_feature_layer; il++) {
|
||||
@@ -60,7 +59,7 @@ ggml_cgraph * clip_graph_llava::build() {
|
||||
|
||||
// If this is an embedding feature layer, save the output.
|
||||
// NOTE: 0 index here refers to the input to the encoder.
|
||||
if (vision_feature_layer.find(il) != vision_feature_layer.end()) {
|
||||
if (hparams.is_vision_feature_layer(il)) {
|
||||
embedding_stack.push_back(cur);
|
||||
}
|
||||
|
||||
@@ -135,7 +134,7 @@ ggml_cgraph * clip_graph_llava::build() {
|
||||
// process vision feature layers (used by granite)
|
||||
{
|
||||
// final layer is a vision feature layer
|
||||
if (vision_feature_layer.find(max_feature_layer) != vision_feature_layer.end()) {
|
||||
if (hparams.is_vision_feature_layer(max_feature_layer)) {
|
||||
embedding_stack.push_back(inpL);
|
||||
}
|
||||
|
||||
|
||||
@@ -211,3 +211,26 @@ struct clip_graph_exaone4_5 : clip_graph {
|
||||
clip_graph_exaone4_5(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
|
||||
ggml_cgraph * build() override;
|
||||
};
|
||||
|
||||
struct clip_graph_granite4_vision : clip_graph {
|
||||
clip_graph_granite4_vision(clip_ctx * ctx, const clip_image_f32 & img)
|
||||
: clip_graph(ctx, img),
|
||||
add_newline(img.add_newline) {}
|
||||
|
||||
ggml_cgraph * build() override;
|
||||
|
||||
private:
|
||||
// The graph is per-tile since only batch-size 1 is supported in clip. As
|
||||
// such, this value is set at construct time based on the tile that will be
|
||||
// encoded, then used during build to determine how to handle newlines.
|
||||
const bool add_newline;
|
||||
|
||||
ggml_tensor * gather(ggml_tensor * src, const std::string & name, int idx_len);
|
||||
ggml_tensor * interp_down(ggml_tensor * src, int side, int new_side);
|
||||
ggml_tensor * build_block(const qf_block & blk, ggml_tensor * h, int bid,
|
||||
int spatial_offset, int image_side, int window_side,
|
||||
int query_side, float qformer_eps);
|
||||
|
||||
ggml_tensor * build_newline_row(ggml_context * ctx0);
|
||||
ggml_tensor * append_rowwise_newlines(ggml_context * ctx0, ggml_tensor * tile_output);
|
||||
};
|
||||
|
||||
@@ -513,6 +513,12 @@ struct mtmd_context {
|
||||
img_end = "</vision>";
|
||||
image_preproc = std::make_unique<mtmd_image_preprocessor_dyn_size>(ctx_v);
|
||||
} break;
|
||||
case PROJECTOR_TYPE_GRANITE4_VISION:
|
||||
{
|
||||
img_beg = "<image>";
|
||||
img_end = "";
|
||||
image_preproc = std::make_unique<mtmd_image_preprocessor_llava_uhd>(ctx_v);
|
||||
} break;
|
||||
default:
|
||||
throw std::runtime_error(string_format("%s: unexpected vision projector type %d\n", __func__, proj));
|
||||
}
|
||||
@@ -808,6 +814,21 @@ struct mtmd_tokenizer {
|
||||
return 2;
|
||||
}
|
||||
|
||||
// Annotate llava-next style tiles so clip_n_output_tokens accounts
|
||||
// for per-tile newline injection.
|
||||
if (ctx->proj_type_v() == PROJECTOR_TYPE_GRANITE4_VISION) {
|
||||
if (batch_f32.entries.size() == 1) {
|
||||
// Single-tile (overview only): append one newline row.
|
||||
batch_f32.entries[0]->add_newline = true;
|
||||
} else {
|
||||
// Multi-tile: overview gets no newline, grid tiles get one.
|
||||
batch_f32.entries[0]->add_newline = false;
|
||||
for (size_t i = 1; i < batch_f32.entries.size(); ++i) {
|
||||
batch_f32.entries[i]->add_newline = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// handle llava-uhd style preprocessing
|
||||
const bool has_tiling_grid = batch_f32.grid_x > 0 && batch_f32.grid_y > 0;
|
||||
if (
|
||||
@@ -872,9 +893,10 @@ struct mtmd_tokenizer {
|
||||
}
|
||||
|
||||
} else {
|
||||
|
||||
size_t n_tokens = 0;
|
||||
for (const auto & entry : batch_f32.entries) {
|
||||
n_tokens += clip_n_output_tokens(ctx->ctx_v, entry.get());
|
||||
for (const auto & e : batch_f32.entries) {
|
||||
n_tokens += clip_n_output_tokens(ctx->ctx_v, e.get());
|
||||
}
|
||||
|
||||
mtmd_image_tokens_ptr image_tokens(new mtmd_image_tokens);
|
||||
@@ -1111,7 +1133,8 @@ int32_t mtmd_encode(mtmd_context * ctx, const mtmd_image_tokens * image_tokens)
|
||||
|| proj_type == PROJECTOR_TYPE_MINICPMV
|
||||
|| proj_type == PROJECTOR_TYPE_GLM_EDGE
|
||||
|| proj_type == PROJECTOR_TYPE_INTERNVL
|
||||
|| proj_type == PROJECTOR_TYPE_DEEPSEEKOCR2) {
|
||||
|| proj_type == PROJECTOR_TYPE_DEEPSEEKOCR2
|
||||
|| proj_type == PROJECTOR_TYPE_GRANITE4_VISION) {
|
||||
// TODO @ngxson : llava does not support batched encoding ; this should be fixed inside clip_image_batch_encode()
|
||||
const auto & entries = image_tokens->batch_f32.entries;
|
||||
// entries may have different token counts
|
||||
|
||||
Reference in New Issue
Block a user