mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2026-04-30 23:04:19 +02:00
Compare commits
14 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
7474e00b34 | ||
|
|
7f323a589f | ||
|
|
3eac209319 | ||
|
|
a634d75d1b | ||
|
|
62d4250e52 | ||
|
|
0208355f42 | ||
|
|
d2a4ef05c6 | ||
|
|
15e6125a39 | ||
|
|
3b24d26c22 | ||
|
|
43dfd741a5 | ||
|
|
b064a51a4e | ||
|
|
053367d149 | ||
|
|
d8919424f1 | ||
|
|
7fef11766c |
3
.github/workflows/docker.yml
vendored
3
.github/workflows/docker.yml
vendored
@@ -42,8 +42,7 @@ jobs:
|
||||
- { tag: "cpu", dockerfile: ".devops/cpu.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: false }
|
||||
- { tag: "cuda", dockerfile: ".devops/cuda.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: false }
|
||||
- { tag: "musa", dockerfile: ".devops/musa.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: true }
|
||||
# Note: the intel images are failing due to an out of disk space error
|
||||
# - { tag: "intel", dockerfile: ".devops/intel.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: false }
|
||||
- { tag: "intel", dockerfile: ".devops/intel.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: true }
|
||||
- { tag: "vulkan", dockerfile: ".devops/vulkan.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: false }
|
||||
# Note: the rocm images are failing due to a compiler error and are disabled until this is fixed to allow the workflow to complete
|
||||
#- {tag: "rocm", dockerfile: ".devops/rocm.Dockerfile", platforms: "linux/amd64,linux/arm64", full: true, light: true, server: true, free_disk_space: true }
|
||||
|
||||
@@ -2204,32 +2204,33 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_CONT_BATCHING"));
|
||||
add_opt(common_arg(
|
||||
{"--mmproj"}, "FILE",
|
||||
"path to a multimodal projector file. see tools/mtmd/README.md",
|
||||
"path to a multimodal projector file. see tools/mtmd/README.md\n"
|
||||
"note: if -hf is used, this argument can be omitted",
|
||||
[](common_params & params, const std::string & value) {
|
||||
params.mmproj.path = value;
|
||||
}
|
||||
).set_examples(mmproj_examples));
|
||||
).set_examples(mmproj_examples).set_env("LLAMA_ARG_MMPROJ"));
|
||||
add_opt(common_arg(
|
||||
{"--mmproj-url"}, "URL",
|
||||
"URL to a multimodal projector file. see tools/mtmd/README.md",
|
||||
[](common_params & params, const std::string & value) {
|
||||
params.mmproj.url = value;
|
||||
}
|
||||
).set_examples(mmproj_examples));
|
||||
).set_examples(mmproj_examples).set_env("LLAMA_ARG_MMPROJ_URL"));
|
||||
add_opt(common_arg(
|
||||
{"--no-mmproj"},
|
||||
"explicitly disable multimodal projector, useful when using -hf",
|
||||
[](common_params & params) {
|
||||
params.no_mmproj = true;
|
||||
}
|
||||
).set_examples(mmproj_examples));
|
||||
).set_examples(mmproj_examples).set_env("LLAMA_ARG_NO_MMPROJ"));
|
||||
add_opt(common_arg(
|
||||
{"--no-mmproj-offload"},
|
||||
"do not offload multimodal projector to GPU",
|
||||
[](common_params & params) {
|
||||
params.mmproj_use_gpu = false;
|
||||
}
|
||||
).set_examples(mmproj_examples));
|
||||
).set_examples(mmproj_examples).set_env("LLAMA_ARG_NO_MMPROJ_OFFLOAD"));
|
||||
add_opt(common_arg(
|
||||
{"--image"}, "FILE",
|
||||
"path to an image file. use with multimodal models. Specify multiple times for batching",
|
||||
@@ -2436,6 +2437,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
}
|
||||
}
|
||||
));
|
||||
add_opt(common_arg(
|
||||
{"--no-op-offload"},
|
||||
string_format("disable offloading host tensor operations to device (default: %s)", params.no_op_offload ? "true" : "false"),
|
||||
[](common_params & params) {
|
||||
params.no_op_offload = true;
|
||||
}
|
||||
));
|
||||
add_opt(common_arg(
|
||||
{"--lora"}, "FNAME",
|
||||
"path to LoRA adapter (can be repeated to use multiple adapters)",
|
||||
|
||||
@@ -1113,6 +1113,7 @@ struct llama_context_params common_context_params_to_llama(const common_params &
|
||||
cparams.offload_kqv = !params.no_kv_offload;
|
||||
cparams.flash_attn = params.flash_attn;
|
||||
cparams.no_perf = params.no_perf;
|
||||
cparams.op_offload = !params.no_op_offload;
|
||||
|
||||
if (params.reranking) {
|
||||
cparams.embeddings = true;
|
||||
|
||||
@@ -332,6 +332,7 @@ struct common_params {
|
||||
bool no_kv_offload = false; // disable KV offloading
|
||||
bool warmup = true; // warmup run
|
||||
bool check_tensors = false; // validate tensor data
|
||||
bool no_op_offload = false; // globally disable offload host tensor operations to device
|
||||
|
||||
bool single_turn = false; // single turn chat conversation
|
||||
|
||||
|
||||
@@ -189,6 +189,7 @@ static LlgTokenizer * llama_sampler_llg_new_tokenizer(const llama_vocab * vocab)
|
||||
/* .tokenize_fn = */ llama_sampler_llg_tokenize_fn,
|
||||
/* .use_approximate_greedy_tokenize_fn = */ false,
|
||||
/* .tokenize_user_data = */ vocab,
|
||||
/* .slices = */ nullptr,
|
||||
};
|
||||
|
||||
char error_buffer[1024];
|
||||
|
||||
@@ -426,7 +426,11 @@ class ModelBase:
|
||||
logger.warning(f"Failed to load model config from {dir_model}: {e}")
|
||||
logger.warning("Trying to load config.json instead")
|
||||
with open(dir_model / "config.json", "r", encoding="utf-8") as f:
|
||||
return json.load(f)
|
||||
config = json.load(f)
|
||||
if "llm_config" in config:
|
||||
# rename for InternVL
|
||||
config["text_config"] = config["llm_config"]
|
||||
return config
|
||||
|
||||
@classmethod
|
||||
def register(cls, *names: str) -> Callable[[AnyModel], AnyModel]:
|
||||
@@ -794,6 +798,9 @@ class TextModel(ModelBase):
|
||||
if chkhsh == "0e9433cbbb161f89e264eb32e8e64bfe69e834973ffca5d41d3948a604a3e2a3":
|
||||
# ref: https://huggingface.co/mistral-community/pixtral-12b
|
||||
res = "pixtral"
|
||||
if chkhsh == "d5f1dd6f980fec569fb218a81a7658ac45fc56b38c5a0adeb1c232fbe04ef5ec":
|
||||
# ref: https://huggingface.co/ByteDance-Seed/Seed-Coder-8B-Base
|
||||
res = "seed-coder"
|
||||
|
||||
if res is None:
|
||||
logger.warning("\n")
|
||||
@@ -2606,6 +2613,11 @@ class Qwen2Model(TextModel):
|
||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||
if self.hf_arch == "Qwen2Model":
|
||||
name = f"model.{name}" # map to Qwen2ForCausalLM tensors
|
||||
if "language_model." in name:
|
||||
name = name.replace("language_model.", "") # for InternVL
|
||||
if name.startswith("mlp") or name.startswith("vision_model"):
|
||||
# skip visual tensors
|
||||
return []
|
||||
yield from super().modify_tensors(data_torch, name, bid)
|
||||
|
||||
|
||||
@@ -2709,6 +2721,62 @@ class Qwen2VLVisionModel(VisionModel):
|
||||
return [] # skip other tensors
|
||||
|
||||
|
||||
@ModelBase.register("InternVisionModel")
|
||||
class InternVisionModel(VisionModel):
|
||||
def set_gguf_parameters(self):
|
||||
super().set_gguf_parameters()
|
||||
hparams = self.hparams
|
||||
self.gguf_writer.add_vision_projector_type(gguf.VisionProjectorType.INTERNVL)
|
||||
self.gguf_writer.add_vision_attention_layernorm_eps(hparams["layer_norm_eps"])
|
||||
# hidden_act
|
||||
if hparams["hidden_act"] == "silu":
|
||||
self.gguf_writer.add_vision_use_silu(True)
|
||||
elif hparams["hidden_act"] == "gelu":
|
||||
self.gguf_writer.add_vision_use_gelu(True)
|
||||
else:
|
||||
raise ValueError(f"Unsupported hidden_act: {hparams['hidden_act']}")
|
||||
# downsample_ratio
|
||||
downsample_ratio = self.global_config.get("downsample_ratio")
|
||||
assert downsample_ratio is not None
|
||||
self.gguf_writer.add_vision_projector_scale_factor(int(1.0 / downsample_ratio))
|
||||
|
||||
def tensor_force_quant(self, name, new_name, bid, n_dims):
|
||||
del bid, name, n_dims # unused
|
||||
if ".patch_embd." in new_name:
|
||||
return gguf.GGMLQuantizationType.F16
|
||||
if ".position_embd." in new_name:
|
||||
return gguf.GGMLQuantizationType.F32
|
||||
return False
|
||||
|
||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||
del bid # unused
|
||||
if name.startswith("vision_model") or name.startswith("mlp"):
|
||||
# process visual tensors
|
||||
# correct name
|
||||
if name.startswith("vision_model"):
|
||||
name = "vision_tower." + name
|
||||
if (".ls" in name or "position_embedding" in name) and not name.endswith(".weight"):
|
||||
name += ".weight"
|
||||
# split QKV tensors if needed
|
||||
if ".qkv." in name:
|
||||
if data_torch.ndim == 2: # weight
|
||||
c3, _ = data_torch.shape
|
||||
else: # bias
|
||||
c3 = data_torch.shape[0]
|
||||
assert c3 % 3 == 0
|
||||
c = c3 // 3
|
||||
wq = data_torch[:c]
|
||||
wk = data_torch[c: c * 2]
|
||||
wv = data_torch[c * 2:]
|
||||
return [
|
||||
(self.map_tensor_name(name.replace("attn.qkv", "self_attn.q_proj")), wq),
|
||||
(self.map_tensor_name(name.replace("attn.qkv", "self_attn.k_proj")), wk),
|
||||
(self.map_tensor_name(name.replace("attn.qkv", "self_attn.v_proj")), wv),
|
||||
]
|
||||
return [(self.map_tensor_name(name), data_torch)]
|
||||
return [] # skip other tensors
|
||||
|
||||
|
||||
@ModelBase.register("WavTokenizerDec")
|
||||
class WavTokenizerDecModel(TextModel):
|
||||
model_arch = gguf.MODEL_ARCH.WAVTOKENIZER_DEC
|
||||
@@ -3360,6 +3428,11 @@ class InternLM2Model(TextModel):
|
||||
head_dim = n_embd // num_heads
|
||||
num_groups = num_heads // q_per_kv
|
||||
|
||||
name = name.replace("language_model.", "") # InternVL
|
||||
if name.startswith("mlp") or name.startswith("vision_model"):
|
||||
# skip visual tensors
|
||||
return []
|
||||
|
||||
if bid is not None and f"model.layers.{bid}.attention.wqkv" in name:
|
||||
qkv = data_torch
|
||||
|
||||
@@ -3433,6 +3506,10 @@ class InternLM3Model(TextModel):
|
||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||
n_head = self.hparams["num_attention_heads"]
|
||||
n_kv_head = self.hparams.get("num_key_value_heads")
|
||||
name = name.replace("language_model.", "") # InternVL
|
||||
if name.startswith("mlp") or name.startswith("vision_model"):
|
||||
# skip visual tensors
|
||||
return []
|
||||
if name.endswith(("q_proj.weight", "q_proj.bias")):
|
||||
data_torch = LlamaModel.permute(data_torch, n_head, n_head)
|
||||
if name.endswith(("k_proj.weight", "k_proj.bias")):
|
||||
|
||||
@@ -116,6 +116,7 @@ models = [
|
||||
{"name": "llama4", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E-Instruct", },
|
||||
{"name": "glm4", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/THUDM/glm-4-9b-hf", },
|
||||
{"name": "pixtral", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mistral-community/pixtral-12b", },
|
||||
{"name": "seed-coder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/ByteDance-Seed/Seed-Coder-8B-Base", },
|
||||
]
|
||||
|
||||
|
||||
|
||||
@@ -6,7 +6,7 @@ llama.cpp supports multimodal input via `libmtmd`. Currently, there are 2 tools
|
||||
|
||||
To enable it, can use use one of the 2 methods below:
|
||||
|
||||
- Use `-hf` option with a [supported model](../../docs/multimodal.md)
|
||||
- Use `-hf` option with a supported model (see a list of pre-quantized model below)
|
||||
- To load a model using `-hf` while disabling multimodal, use `--no-mmproj`
|
||||
- To load a model using `-hf` while using a custom mmproj file, use `--mmproj local_file.gguf`
|
||||
- Use `-m model.gguf` option with `--mmproj file.gguf` to specify text and multimodal projector respectively
|
||||
@@ -66,4 +66,12 @@ NOTE: some models may require large context window, for example: `-c 8192`
|
||||
|
||||
# Mistral Small 3.1 24B (IQ2_M quantization)
|
||||
(tool_name) -hf ggml-org/Mistral-Small-3.1-24B-Instruct-2503-GGUF
|
||||
|
||||
# InternVL 2.5 and 3
|
||||
(tool_name) -hf ggml-org/InternVL2_5-1B-GGUF
|
||||
(tool_name) -hf ggml-org/InternVL2_5-4B-GGUF
|
||||
(tool_name) -hf ggml-org/InternVL3-1B-Instruct-GGUF
|
||||
(tool_name) -hf ggml-org/InternVL3-2B-Instruct-GGUF
|
||||
(tool_name) -hf ggml-org/InternVL3-8B-Instruct-GGUF
|
||||
(tool_name) -hf ggml-org/InternVL3-14B-Instruct-GGUF
|
||||
```
|
||||
|
||||
@@ -248,7 +248,7 @@ extern "C" {
|
||||
// preferrably to run on the same backend as the buffer
|
||||
ggml_backend_buffer_set_usage(buf_weights, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
|
||||
|
||||
sched = ggml_backend_sched_new({backend_gpu, backend_gpu2, backend_cpu}, NULL, num_backends, GGML_DEFAULT_GRAPH_SIZE, false);
|
||||
sched = ggml_backend_sched_new({backend_gpu, backend_gpu2, backend_cpu}, NULL, num_backends, GGML_DEFAULT_GRAPH_SIZE, false, true);
|
||||
|
||||
// initialize buffers from a max size graph (optional)
|
||||
reserve_graph = build_graph(sched, max_batch_size);
|
||||
@@ -289,7 +289,7 @@ extern "C" {
|
||||
typedef bool (*ggml_backend_sched_eval_callback)(struct ggml_tensor * t, bool ask, void * user_data);
|
||||
|
||||
// Initialize a backend scheduler, backends with low index are given priority over backends with high index
|
||||
GGML_API ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size, bool parallel);
|
||||
GGML_API ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size, bool parallel, bool op_offload);
|
||||
GGML_API void ggml_backend_sched_free(ggml_backend_sched_t sched);
|
||||
|
||||
// Initialize backend buffers from a measure graph
|
||||
|
||||
@@ -674,6 +674,8 @@ struct ggml_backend_sched {
|
||||
char * context_buffer;
|
||||
size_t context_buffer_size;
|
||||
|
||||
bool op_offload;
|
||||
|
||||
int debug;
|
||||
};
|
||||
|
||||
@@ -766,7 +768,7 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st
|
||||
if (tensor->op != GGML_OP_ROPE && src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
|
||||
int src_backend_id = ggml_backend_sched_backend_from_buffer(sched, src, tensor);
|
||||
// check if a backend with higher prio wants to offload the op
|
||||
if (src_backend_id == sched->n_backends - 1 && ggml_backend_buffer_is_host(src->buffer)) {
|
||||
if (sched->op_offload && src_backend_id == sched->n_backends - 1 && ggml_backend_buffer_is_host(src->buffer)) {
|
||||
for (int b = 0; b < src_backend_id; b++) {
|
||||
if (ggml_backend_supports_op(sched->backends[b], tensor) && ggml_backend_offload_op(sched->backends[b], tensor)) {
|
||||
SET_CAUSE(tensor, "1.off");
|
||||
@@ -1452,7 +1454,8 @@ ggml_backend_sched_t ggml_backend_sched_new(
|
||||
ggml_backend_buffer_type_t * bufts,
|
||||
int n_backends,
|
||||
size_t graph_size,
|
||||
bool parallel) {
|
||||
bool parallel,
|
||||
bool op_offload) {
|
||||
GGML_ASSERT(n_backends > 0);
|
||||
GGML_ASSERT(n_backends <= GGML_SCHED_MAX_BACKENDS);
|
||||
GGML_ASSERT(ggml_backend_dev_type(ggml_backend_get_device(backends[n_backends - 1])) == GGML_BACKEND_DEVICE_TYPE_CPU);
|
||||
@@ -1497,6 +1500,7 @@ ggml_backend_sched_t ggml_backend_sched_new(
|
||||
}
|
||||
|
||||
sched->galloc = ggml_gallocr_new_n(sched->bufts, n_backends);
|
||||
sched->op_offload = op_offload;
|
||||
|
||||
ggml_backend_sched_reset(sched);
|
||||
|
||||
|
||||
@@ -546,7 +546,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter(
|
||||
const int i0_stop = i0_start + 2*c::nbatch_V2 < DV ? i0_start + 2*c::nbatch_V2 : DV;
|
||||
const int i0_diff = i0_stop - i0_start;
|
||||
|
||||
if (nstages == 1) {
|
||||
if (nstages <= 1) {
|
||||
constexpr bool use_cp_async = nstages == 1;
|
||||
flash_attn_ext_f16_load_tile<stride_tile_V, nwarps, c::nbatch_fa, use_cp_async>
|
||||
(V_h2 + k_VKQ_0*stride_V + i0_start/2, tile_V, i0_diff/2, stride_V);
|
||||
@@ -874,6 +874,8 @@ static __device__ __forceinline__ void flash_attn_ext_f16_process_tile(
|
||||
}
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
// Write back combined meta data:
|
||||
#pragma unroll
|
||||
for (int imeta = 0; imeta < nmeta; ++imeta) {
|
||||
|
||||
@@ -168,6 +168,7 @@ static __global__ void flash_attn_vec_ext_f16(
|
||||
for (int j = 0; j < ncols; ++j) {
|
||||
KQ[j*D + tid] = -HALF_MAX_HALF;
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
half2 VKQ[ncols] = {{0.0f, 0.0f}};
|
||||
|
||||
|
||||
@@ -1909,13 +1909,19 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co
|
||||
static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
||||
const bool split = ggml_backend_buft_is_cuda_split(src0->buffer->buft);
|
||||
|
||||
// If src0 is a temporary compute buffer it may have some padding that needs to be cleared for mul_mat_vec_q or mul_mat_q.
|
||||
// But if src0 is also a view of another tensor then this cannot be done safely because it may overwrite valid tensor data.
|
||||
// Therefore, in such cases use cuBLAS.
|
||||
const bool bad_padding_clear = ggml_backend_buffer_get_usage(src0->buffer) == GGML_BACKEND_BUFFER_USAGE_COMPUTE
|
||||
&& ggml_nbytes(src0) != ggml_backend_buffer_get_alloc_size(src0->buffer, src0) && src0->view_src;
|
||||
|
||||
bool use_mul_mat_vec = (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || src0->type == GGML_TYPE_BF16)
|
||||
&& src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
|
||||
&& src0->ne[0] % 2 == 0 && src1->ne[1] == 1;
|
||||
bool use_mul_mat_vec_q = ggml_is_quantized(src0->type)
|
||||
bool use_mul_mat_vec_q = ggml_is_quantized(src0->type) && !bad_padding_clear
|
||||
&& src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
|
||||
&& src1->ne[1] <= MMVQ_MAX_BATCH_SIZE;
|
||||
bool use_mul_mat_q = ggml_is_quantized(src0->type)
|
||||
bool use_mul_mat_q = ggml_is_quantized(src0->type) && !bad_padding_clear
|
||||
&& src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32;
|
||||
|
||||
bool any_gpus_with_slow_fp16 = false;
|
||||
|
||||
@@ -91,11 +91,11 @@ void ggml_cuda_mul_mat_q(
|
||||
|
||||
// If src0 is a temporary compute buffer, clear any potential padding.
|
||||
if (ggml_backend_buffer_get_usage(src0->buffer) == GGML_BACKEND_BUFFER_USAGE_COMPUTE) {
|
||||
GGML_ASSERT(ggml_is_contiguously_allocated(src0));
|
||||
GGML_ASSERT(!src0->view_src);
|
||||
const size_t size_data = ggml_nbytes(src0);
|
||||
const size_t size_alloc = ggml_backend_buffer_get_alloc_size(src0->buffer, src0);
|
||||
if (size_alloc > size_data) {
|
||||
GGML_ASSERT(ggml_is_contiguously_allocated(src0));
|
||||
GGML_ASSERT(!src0->view_src);
|
||||
CUDA_CHECK(cudaMemsetAsync((char *) src0->data + size_data, 0, size_alloc - size_data, stream));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -515,11 +515,11 @@ void ggml_cuda_mul_mat_vec_q(
|
||||
|
||||
// If src0 is a temporary compute buffer, clear any potential padding.
|
||||
if (ggml_backend_buffer_get_usage(src0->buffer) == GGML_BACKEND_BUFFER_USAGE_COMPUTE) {
|
||||
GGML_ASSERT(ggml_is_contiguously_allocated(src0));
|
||||
GGML_ASSERT(!src0->view_src);
|
||||
const size_t size_data = ggml_nbytes(src0);
|
||||
const size_t size_alloc = ggml_backend_buffer_get_alloc_size(src0->buffer, src0);
|
||||
if (size_alloc > size_data) {
|
||||
GGML_ASSERT(ggml_is_contiguously_allocated(src0));
|
||||
GGML_ASSERT(!src0->view_src);
|
||||
CUDA_CHECK(cudaMemsetAsync((char *) src0->data + size_data, 0, size_alloc - size_data, stream));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -483,7 +483,9 @@ class MODEL_TENSOR(IntEnum):
|
||||
V_ENC_EMBD_PATCH = auto()
|
||||
V_ENC_EMBD_POS = auto()
|
||||
V_ENC_ATTN_Q = auto()
|
||||
V_ENC_ATTN_Q_NORM = auto()
|
||||
V_ENC_ATTN_K = auto()
|
||||
V_ENC_ATTN_K_NORM = auto()
|
||||
V_ENC_ATTN_V = auto()
|
||||
V_ENC_INPUT_NORM = auto()
|
||||
V_ENC_OUTPUT = auto()
|
||||
@@ -491,6 +493,8 @@ class MODEL_TENSOR(IntEnum):
|
||||
V_ENC_FFN_UP = auto()
|
||||
V_ENC_FFN_GATE = auto()
|
||||
V_ENC_FFN_DOWN = auto()
|
||||
V_LAYER_SCALE_1 = auto()
|
||||
V_LAYER_SCALE_2 = auto()
|
||||
V_PRE_NORM = auto()
|
||||
V_POST_NORM = auto()
|
||||
V_MM_INP_NORM = auto()
|
||||
@@ -740,7 +744,9 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
|
||||
MODEL_TENSOR.V_ENC_EMBD_PATCH: "v.patch_embd",
|
||||
MODEL_TENSOR.V_ENC_EMBD_POS: "v.position_embd",
|
||||
MODEL_TENSOR.V_ENC_ATTN_Q: "v.blk.{bid}.attn_q",
|
||||
MODEL_TENSOR.V_ENC_ATTN_Q_NORM: "v.blk.{bid}.attn_q_norm",
|
||||
MODEL_TENSOR.V_ENC_ATTN_K: "v.blk.{bid}.attn_k",
|
||||
MODEL_TENSOR.V_ENC_ATTN_K_NORM: "v.blk.{bid}.attn_k_norm",
|
||||
MODEL_TENSOR.V_ENC_ATTN_V: "v.blk.{bid}.attn_v",
|
||||
MODEL_TENSOR.V_ENC_INPUT_NORM: "v.blk.{bid}.ln1",
|
||||
MODEL_TENSOR.V_ENC_OUTPUT: "v.blk.{bid}.attn_out",
|
||||
@@ -748,6 +754,8 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
|
||||
MODEL_TENSOR.V_ENC_FFN_UP: "v.blk.{bid}.ffn_up",
|
||||
MODEL_TENSOR.V_ENC_FFN_GATE: "v.blk.{bid}.ffn_gate",
|
||||
MODEL_TENSOR.V_ENC_FFN_DOWN: "v.blk.{bid}.ffn_down",
|
||||
MODEL_TENSOR.V_LAYER_SCALE_1: "v.blk.{bid}.ls1",
|
||||
MODEL_TENSOR.V_LAYER_SCALE_2: "v.blk.{bid}.ls2",
|
||||
MODEL_TENSOR.V_PRE_NORM: "v.pre_ln",
|
||||
MODEL_TENSOR.V_POST_NORM: "v.post_ln",
|
||||
MODEL_TENSOR.V_MM_INP_PROJ: "mm.input_projection",
|
||||
@@ -778,7 +786,9 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
|
||||
MODEL_TENSOR.V_ENC_EMBD_PATCH,
|
||||
MODEL_TENSOR.V_ENC_EMBD_POS,
|
||||
MODEL_TENSOR.V_ENC_ATTN_Q,
|
||||
MODEL_TENSOR.V_ENC_ATTN_Q_NORM,
|
||||
MODEL_TENSOR.V_ENC_ATTN_K,
|
||||
MODEL_TENSOR.V_ENC_ATTN_K_NORM,
|
||||
MODEL_TENSOR.V_ENC_ATTN_V,
|
||||
MODEL_TENSOR.V_ENC_INPUT_NORM,
|
||||
MODEL_TENSOR.V_ENC_OUTPUT,
|
||||
@@ -786,6 +796,8 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
|
||||
MODEL_TENSOR.V_ENC_FFN_UP,
|
||||
MODEL_TENSOR.V_ENC_FFN_GATE,
|
||||
MODEL_TENSOR.V_ENC_FFN_DOWN,
|
||||
MODEL_TENSOR.V_LAYER_SCALE_1,
|
||||
MODEL_TENSOR.V_LAYER_SCALE_2,
|
||||
MODEL_TENSOR.V_PRE_NORM,
|
||||
MODEL_TENSOR.V_POST_NORM,
|
||||
MODEL_TENSOR.V_MM_INP_PROJ,
|
||||
@@ -2167,6 +2179,7 @@ class VisionProjectorType:
|
||||
PIXTRAL = "pixtral"
|
||||
QWEN2VL = "qwen2vl_merger"
|
||||
QWEN25VL = "qwen2.5vl_merger"
|
||||
INTERNVL = "internvl"
|
||||
|
||||
|
||||
# Items here are (block size, type size)
|
||||
|
||||
@@ -905,6 +905,7 @@ class TensorNameMap:
|
||||
|
||||
MODEL_TENSOR.V_MMPROJ_MLP: (
|
||||
"model.mm_projector.mlp.mlp.{bid}",
|
||||
"mlp1.{bid}", # InternVL
|
||||
),
|
||||
|
||||
MODEL_TENSOR.V_MMPROJ_PEG: (
|
||||
@@ -937,6 +938,10 @@ class TensorNameMap:
|
||||
"visual.blocks.{bid}.attn.q", # qwen2vl, generated
|
||||
),
|
||||
|
||||
MODEL_TENSOR.V_ENC_ATTN_Q_NORM: (
|
||||
"vision_tower.vision_model.encoder.layers.{bid}.attn.q_norm", # InternVL
|
||||
),
|
||||
|
||||
MODEL_TENSOR.V_ENC_ATTN_K: (
|
||||
"vision_tower.vision_model.encoder.layers.{bid}.self_attn.k_proj",
|
||||
"vpm.encoder.layers.{bid}.self_attn.k_proj",
|
||||
@@ -945,6 +950,10 @@ class TensorNameMap:
|
||||
"visual.blocks.{bid}.attn.k", # qwen2vl, generated
|
||||
),
|
||||
|
||||
MODEL_TENSOR.V_ENC_ATTN_K_NORM: (
|
||||
"vision_tower.vision_model.encoder.layers.{bid}.attn.k_norm", # InternVL
|
||||
),
|
||||
|
||||
MODEL_TENSOR.V_ENC_ATTN_V: (
|
||||
"vision_tower.vision_model.encoder.layers.{bid}.self_attn.v_proj",
|
||||
"vpm.encoder.layers.{bid}.self_attn.v_proj",
|
||||
@@ -955,6 +964,7 @@ class TensorNameMap:
|
||||
|
||||
MODEL_TENSOR.V_ENC_INPUT_NORM: (
|
||||
"vision_tower.vision_model.encoder.layers.{bid}.layer_norm1",
|
||||
"vision_tower.vision_model.encoder.layers.{bid}.norm1", # InternVL
|
||||
"vpm.encoder.layers.{bid}.layer_norm1",
|
||||
"model.vision_model.encoder.layers.{bid}.layer_norm1", # SmolVLM
|
||||
"vision_tower.transformer.layers.{bid}.attention_norm", # pixtral
|
||||
@@ -963,6 +973,7 @@ class TensorNameMap:
|
||||
|
||||
MODEL_TENSOR.V_ENC_OUTPUT: (
|
||||
"vision_tower.vision_model.encoder.layers.{bid}.self_attn.out_proj",
|
||||
"vision_tower.vision_model.encoder.layers.{bid}.attn.proj", # InternVL
|
||||
"vpm.encoder.layers.{bid}.self_attn.out_proj",
|
||||
"model.vision_model.encoder.layers.{bid}.self_attn.out_proj", # SmolVLM
|
||||
"vision_tower.transformer.layers.{bid}.attention.o_proj", # pixtral
|
||||
@@ -971,6 +982,7 @@ class TensorNameMap:
|
||||
|
||||
MODEL_TENSOR.V_ENC_OUTPUT_NORM: (
|
||||
"vision_tower.vision_model.encoder.layers.{bid}.layer_norm2",
|
||||
"vision_tower.vision_model.encoder.layers.{bid}.norm2", # InternVL
|
||||
"vpm.encoder.layers.{bid}.layer_norm2",
|
||||
"model.vision_model.encoder.layers.{bid}.layer_norm2", # SmolVLM
|
||||
"vision_tower.transformer.layers.{bid}.ffn_norm", # pixtral
|
||||
@@ -1000,6 +1012,14 @@ class TensorNameMap:
|
||||
"visual.blocks.{bid}.mlp.down_proj", # qwen2.5vl
|
||||
),
|
||||
|
||||
MODEL_TENSOR.V_LAYER_SCALE_1: (
|
||||
"vision_tower.vision_model.encoder.layers.{bid}.ls1", # InternVL
|
||||
),
|
||||
|
||||
MODEL_TENSOR.V_LAYER_SCALE_2: (
|
||||
"vision_tower.vision_model.encoder.layers.{bid}.ls2", # InternVL
|
||||
),
|
||||
|
||||
MODEL_TENSOR.V_PRE_NORM: (
|
||||
"vision_tower.vision_model.pre_layrnorm",
|
||||
"vision_tower.ln_pre", # pixtral
|
||||
|
||||
@@ -112,6 +112,7 @@ extern "C" {
|
||||
LLAMA_VOCAB_PRE_TYPE_BAILINGMOE = 32,
|
||||
LLAMA_VOCAB_PRE_TYPE_LLAMA4 = 33,
|
||||
LLAMA_VOCAB_PRE_TYPE_PIXTRAL = 34,
|
||||
LLAMA_VOCAB_PRE_TYPE_SEED_CODER = 35,
|
||||
};
|
||||
|
||||
enum llama_rope_type {
|
||||
@@ -362,6 +363,7 @@ extern "C" {
|
||||
bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
|
||||
bool flash_attn; // whether to use flash attention [EXPERIMENTAL]
|
||||
bool no_perf; // whether to measure performance timings
|
||||
bool op_offload; // whether to offload host tensor operations to device
|
||||
};
|
||||
|
||||
// model quantization parameters
|
||||
|
||||
@@ -93,6 +93,7 @@ llama_context::llama_context(
|
||||
}
|
||||
|
||||
cparams.n_ubatch = std::min(cparams.n_batch, params.n_ubatch == 0 ? params.n_batch : params.n_ubatch);
|
||||
cparams.op_offload = params.op_offload;
|
||||
|
||||
const uint32_t n_ctx_per_seq = cparams.n_ctx / cparams.n_seq_max;
|
||||
|
||||
@@ -243,7 +244,7 @@ llama_context::llama_context(
|
||||
}
|
||||
}
|
||||
|
||||
sched.reset(ggml_backend_sched_new(backend_ptrs.data(), backend_buft.data(), backend_ptrs.size(), max_nodes, pipeline_parallel));
|
||||
sched.reset(ggml_backend_sched_new(backend_ptrs.data(), backend_buft.data(), backend_ptrs.size(), max_nodes, pipeline_parallel, cparams.op_offload));
|
||||
|
||||
if (pipeline_parallel) {
|
||||
LLAMA_LOG_INFO("%s: pipeline parallelism enabled (n_copies=%d)\n", __func__, ggml_backend_sched_get_n_copies(sched.get()));
|
||||
@@ -1871,6 +1872,7 @@ llama_context_params llama_context_default_params() {
|
||||
/*.offload_kqv =*/ true,
|
||||
/*.flash_attn =*/ false,
|
||||
/*.no_perf =*/ true,
|
||||
/*.op_offload =*/ true,
|
||||
};
|
||||
|
||||
return result;
|
||||
|
||||
@@ -30,6 +30,7 @@ struct llama_cparams {
|
||||
bool flash_attn;
|
||||
bool no_perf;
|
||||
bool warmup;
|
||||
bool op_offload;
|
||||
|
||||
enum llama_pooling_type pooling_type;
|
||||
|
||||
|
||||
@@ -415,6 +415,13 @@ struct llm_tokenizer_bpe : llm_tokenizer {
|
||||
"'(?:[sSdDmMtT]|[lL][lL]|[vV][eE]|[rR][eE])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]|\\s+(?!\\S)|\\s+",
|
||||
};
|
||||
break;
|
||||
case LLAMA_VOCAB_PRE_TYPE_SEED_CODER:
|
||||
regex_exprs = {
|
||||
// original regex from tokenizer.json
|
||||
// "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1}| ?[^\\s\\p{L}\\p{N}\r\n]+|\\s*[\r\n]+|\\s+(?!\\S)|\\s+"
|
||||
"(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1}| ?[^\\s\\p{L}\\p{N}\\r\\n]+|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
|
||||
};
|
||||
break;
|
||||
default:
|
||||
// default regex for BPE tokenization pre-processing
|
||||
regex_exprs = {
|
||||
@@ -1634,6 +1641,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|
||||
tokenizer_pre == "bailingmoe") {
|
||||
pre_type = LLAMA_VOCAB_PRE_TYPE_BAILINGMOE;
|
||||
clean_spaces = false;
|
||||
} else if (
|
||||
tokenizer_pre == "seed-coder") {
|
||||
pre_type = LLAMA_VOCAB_PRE_TYPE_SEED_CODER;
|
||||
clean_spaces = false;
|
||||
} else {
|
||||
throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
|
||||
}
|
||||
|
||||
@@ -853,7 +853,7 @@ int main(void) {
|
||||
backends_modded.insert(backends_modded.end(), backends.begin(), backends.end());
|
||||
|
||||
ggml_backend_sched_t backend_sched = ggml_backend_sched_new(
|
||||
backends_modded.data(), nullptr, backends_modded.size(), GGML_DEFAULT_GRAPH_SIZE, false);
|
||||
backends_modded.data(), nullptr, backends_modded.size(), GGML_DEFAULT_GRAPH_SIZE, false, true);
|
||||
|
||||
printf("Backend %zu/%zu: %s\n", i + 1, dev_count, ggml_backend_dev_name(devs[i]));
|
||||
printf(" Device description: %s\n", ggml_backend_dev_description(devs[i]));
|
||||
|
||||
@@ -219,6 +219,7 @@ struct cmd_params {
|
||||
std::vector<std::vector<llama_model_tensor_buft_override>> tensor_buft_overrides;
|
||||
std::vector<bool> use_mmap;
|
||||
std::vector<bool> embeddings;
|
||||
std::vector<bool> no_op_offload;
|
||||
ggml_numa_strategy numa;
|
||||
int reps;
|
||||
ggml_sched_priority prio;
|
||||
@@ -253,6 +254,7 @@ static const cmd_params cmd_params_defaults = {
|
||||
/* tensor_buft_overrides*/ { std::vector<llama_model_tensor_buft_override>{{nullptr,nullptr}} },
|
||||
/* use_mmap */ { true },
|
||||
/* embeddings */ { false },
|
||||
/* no_op_offload */ { false },
|
||||
/* numa */ GGML_NUMA_STRATEGY_DISABLED,
|
||||
/* reps */ 5,
|
||||
/* prio */ GGML_SCHED_PRIO_NORMAL,
|
||||
@@ -311,6 +313,7 @@ static void print_usage(int /* argc */, char ** argv) {
|
||||
join(cmd_params_defaults.embeddings, ",").c_str());
|
||||
printf(" -ts, --tensor-split <ts0/ts1/..> (default: 0)\n");
|
||||
printf(" -ot --override-tensors <tensor name pattern>=<buffer type>;... (default: disabled)\n");
|
||||
printf(" -nopo, --no-op-offload <i> (default: 0)\n");
|
||||
printf(" -r, --repetitions <n> (default: %d)\n", cmd_params_defaults.reps);
|
||||
printf(" --prio <0|1|2|3> (default: %d)\n", cmd_params_defaults.prio);
|
||||
printf(" --delay <0...N> (seconds) (default: %d)\n", cmd_params_defaults.delay);
|
||||
@@ -588,6 +591,13 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
|
||||
}
|
||||
auto p = string_split<bool>(argv[i], split_delim);
|
||||
params.embeddings.insert(params.embeddings.end(), p.begin(), p.end());
|
||||
} else if (arg == "-nopo" || arg == "--no-op-offload") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
auto p = string_split<bool>(argv[i], split_delim);
|
||||
params.no_op_offload.insert(params.no_op_offload.end(), p.begin(), p.end());
|
||||
} else if (arg == "-ts" || arg == "--tensor-split") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
@@ -794,6 +804,9 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
|
||||
if (params.embeddings.empty()) {
|
||||
params.embeddings = cmd_params_defaults.embeddings;
|
||||
}
|
||||
if (params.no_op_offload.empty()) {
|
||||
params.no_op_offload = cmd_params_defaults.no_op_offload;
|
||||
}
|
||||
if (params.n_threads.empty()) {
|
||||
params.n_threads = cmd_params_defaults.n_threads;
|
||||
}
|
||||
@@ -833,6 +846,7 @@ struct cmd_params_instance {
|
||||
std::vector<llama_model_tensor_buft_override> tensor_buft_overrides;
|
||||
bool use_mmap;
|
||||
bool embeddings;
|
||||
bool no_op_offload;
|
||||
|
||||
llama_model_params to_llama_mparams() const {
|
||||
llama_model_params mparams = llama_model_default_params();
|
||||
@@ -902,6 +916,7 @@ struct cmd_params_instance {
|
||||
cparams.offload_kqv = !no_kv_offload;
|
||||
cparams.flash_attn = flash_attn;
|
||||
cparams.embeddings = embeddings;
|
||||
cparams.op_offload = !no_op_offload;
|
||||
|
||||
return cparams;
|
||||
}
|
||||
@@ -921,6 +936,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
|
||||
for (const auto & ot : params.tensor_buft_overrides)
|
||||
for (const auto & mmp : params.use_mmap)
|
||||
for (const auto & embd : params.embeddings)
|
||||
for (const auto & nopo : params.no_op_offload)
|
||||
for (const auto & nb : params.n_batch)
|
||||
for (const auto & nub : params.n_ubatch)
|
||||
for (const auto & tk : params.type_k)
|
||||
@@ -959,6 +975,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
|
||||
/* .tensor_buft_overrides = */ ot,
|
||||
/* .use_mmap = */ mmp,
|
||||
/* .embeddings = */ embd,
|
||||
/* .no_op_offload= */ nopo,
|
||||
};
|
||||
instances.push_back(instance);
|
||||
}
|
||||
@@ -990,6 +1007,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
|
||||
/* .tensor_buft_overrides = */ ot,
|
||||
/* .use_mmap = */ mmp,
|
||||
/* .embeddings = */ embd,
|
||||
/* .no_op_offload= */ nopo,
|
||||
};
|
||||
instances.push_back(instance);
|
||||
}
|
||||
@@ -1021,6 +1039,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
|
||||
/* .tensor_buft_overrides = */ ot,
|
||||
/* .use_mmap = */ mmp,
|
||||
/* .embeddings = */ embd,
|
||||
/* .no_op_offload= */ nopo,
|
||||
};
|
||||
instances.push_back(instance);
|
||||
}
|
||||
@@ -1056,6 +1075,7 @@ struct test {
|
||||
std::vector<llama_model_tensor_buft_override> tensor_buft_overrides;
|
||||
bool use_mmap;
|
||||
bool embeddings;
|
||||
bool no_op_offload;
|
||||
int n_prompt;
|
||||
int n_gen;
|
||||
int n_depth;
|
||||
@@ -1089,6 +1109,7 @@ struct test {
|
||||
tensor_buft_overrides = inst.tensor_buft_overrides;
|
||||
use_mmap = inst.use_mmap;
|
||||
embeddings = inst.embeddings;
|
||||
no_op_offload = inst.no_op_offload;
|
||||
n_prompt = inst.n_prompt;
|
||||
n_gen = inst.n_gen;
|
||||
n_depth = inst.n_depth;
|
||||
@@ -1134,7 +1155,7 @@ struct test {
|
||||
"model_type", "model_size", "model_n_params", "n_batch", "n_ubatch", "n_threads",
|
||||
"cpu_mask", "cpu_strict", "poll", "type_k", "type_v", "n_gpu_layers",
|
||||
"split_mode", "main_gpu", "no_kv_offload", "flash_attn", "tensor_split", "tensor_buft_overrides",
|
||||
"use_mmap", "embeddings", "n_prompt", "n_gen", "n_depth", "test_time",
|
||||
"use_mmap", "embeddings", "no_op_offload", "n_prompt", "n_gen", "n_depth", "test_time",
|
||||
"avg_ns", "stddev_ns", "avg_ts", "stddev_ts",
|
||||
};
|
||||
return fields;
|
||||
@@ -1146,7 +1167,7 @@ struct test {
|
||||
if (field == "build_number" || field == "n_batch" || field == "n_ubatch" || field == "n_threads" ||
|
||||
field == "poll" || field == "model_size" || field == "model_n_params" || field == "n_gpu_layers" ||
|
||||
field == "main_gpu" || field == "n_prompt" || field == "n_gen" || field == "n_depth" ||
|
||||
field == "avg_ns" || field == "stddev_ns") {
|
||||
field == "avg_ns" || field == "stddev_ns" || field == "no_op_offload") {
|
||||
return INT;
|
||||
}
|
||||
if (field == "f16_kv" || field == "no_kv_offload" || field == "cpu_strict" || field == "flash_attn" ||
|
||||
@@ -1222,6 +1243,7 @@ struct test {
|
||||
tensor_buft_overrides_str,
|
||||
std::to_string(use_mmap),
|
||||
std::to_string(embeddings),
|
||||
std::to_string(no_op_offload),
|
||||
std::to_string(n_prompt),
|
||||
std::to_string(n_gen),
|
||||
std::to_string(n_depth),
|
||||
@@ -1404,6 +1426,9 @@ struct markdown_printer : public printer {
|
||||
if (field == "test") {
|
||||
return 15;
|
||||
}
|
||||
if (field == "no_op_offload") {
|
||||
return 4;
|
||||
}
|
||||
|
||||
int width = std::max((int) field.length(), 10);
|
||||
|
||||
@@ -1435,6 +1460,9 @@ struct markdown_printer : public printer {
|
||||
if (field == "embeddings") {
|
||||
return "embd";
|
||||
}
|
||||
if (field == "no_op_offload") {
|
||||
return "nopo";
|
||||
}
|
||||
if (field == "tensor_split") {
|
||||
return "ts";
|
||||
}
|
||||
@@ -1503,6 +1531,9 @@ struct markdown_printer : public printer {
|
||||
if (params.embeddings.size() > 1 || params.embeddings != cmd_params_defaults.embeddings) {
|
||||
fields.emplace_back("embeddings");
|
||||
}
|
||||
if (params.no_op_offload.size() > 1 || params.no_op_offload != cmd_params_defaults.no_op_offload) {
|
||||
fields.emplace_back("no_op_offload");
|
||||
}
|
||||
fields.emplace_back("test");
|
||||
fields.emplace_back("t/s");
|
||||
|
||||
|
||||
@@ -28,6 +28,7 @@ endif()
|
||||
|
||||
add_library(mtmd OBJECT
|
||||
mtmd.cpp
|
||||
mtmd-helper.cpp
|
||||
mtmd.h
|
||||
clip.cpp
|
||||
clip.h
|
||||
|
||||
@@ -48,6 +48,7 @@ For the following models, you can use `convert_hf_to_gguf.py`with `--mmproj` fla
|
||||
- [Pixtral 12B](https://huggingface.co/mistral-community/pixtral-12b) - only works with `transformers`-compatible checkpoint
|
||||
- Qwen 2 VL and Qwen 2.5 VL (from [Qwen](https://huggingface.co/Qwen))
|
||||
- [Mistral Small 3.1 24B](https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503)
|
||||
- InternVL 2.5 and InternVL 3 from [OpenGVLab](https://huggingface.co/OpenGVLab) (note: we don't support conversion of `InternVL3-*-hf` model, only non-HF version is supported ; `InternLM2Model` **text** model is not supported)
|
||||
|
||||
For older models, please refer to the relevant guide for instructions on how to obtain or create them:
|
||||
|
||||
|
||||
@@ -33,9 +33,6 @@
|
||||
#define KEY_PROJ_TYPE "clip.projector_type"
|
||||
#define KEY_SPATIAL_MERGE_SIZE "clip.vision.spatial_merge_size"
|
||||
|
||||
#define KEY_USE_GLU_MLP "clip.use_glu_mlp" // for qwen2.5vl
|
||||
#define KEY_USE_RMS_NORM "clip.use_rms_norm" // for qwen2.5vl
|
||||
|
||||
#define KEY_MM_PATCH_MERGE_TYPE "clip.vision.mm_patch_merge_type"
|
||||
#define KEY_IMAGE_GRID_PINPOINTS "clip.vision.image_grid_pinpoints"
|
||||
#define KEY_IMAGE_CROP_RESOLUTION "clip.vision.image_crop_resolution"
|
||||
@@ -56,12 +53,16 @@
|
||||
#define TN_ATTN_Q "%s.blk.%d.attn_q.%s"
|
||||
#define TN_ATTN_V "%s.blk.%d.attn_v.%s"
|
||||
#define TN_ATTN_OUTPUT "%s.blk.%d.attn_out.%s"
|
||||
#define TN_ATTN_K_NORM "%s.blk.%d.attn_k_norm.%s"
|
||||
#define TN_ATTN_Q_NORM "%s.blk.%d.attn_q_norm.%s"
|
||||
#define TN_FFN_DOWN "%s.blk.%d.ffn_down.%s"
|
||||
#define TN_FFN_GATE "%s.blk.%d.ffn_gate.%s"
|
||||
#define TN_FFN_UP "%s.blk.%d.ffn_up.%s"
|
||||
#define TN_FFN_GATE "%s.blk.%d.ffn_gate.%s"
|
||||
#define TN_LN_1 "%s.blk.%d.ln1.%s"
|
||||
#define TN_LN_2 "%s.blk.%d.ln2.%s"
|
||||
#define TN_LN_1 "%s.blk.%d.ln1.%s" // layer norm
|
||||
#define TN_LN_2 "%s.blk.%d.ln2.%s" // layer norm
|
||||
#define TN_LS_1 "%s.blk.%d.ls1.%s" // layer scale
|
||||
#define TN_LS_2 "%s.blk.%d.ls2.%s" // layer scale
|
||||
#define TN_LN_PRE "%s.pre_ln.%s"
|
||||
#define TN_LN_POST "%s.post_ln.%s"
|
||||
#define TN_LLAVA_PROJ "mm.%d.%s"
|
||||
@@ -93,6 +94,9 @@
|
||||
#define TN_GLM_ADAPTER_GATE "adapter.linear.gate.%s"
|
||||
#define TN_GLM_ADAPTER_D_4H_2_H "adapter.linear.dense_4h_to_h.%s"
|
||||
|
||||
// align x to upper multiple of n
|
||||
#define CLIP_ALIGN(x, n) ((((x) + (n) - 1) / (n)) * (n))
|
||||
|
||||
enum projector_type {
|
||||
PROJECTOR_TYPE_MLP,
|
||||
PROJECTOR_TYPE_MLP_NORM,
|
||||
@@ -105,6 +109,7 @@ enum projector_type {
|
||||
PROJECTOR_TYPE_IDEFICS3,
|
||||
PROJECTOR_TYPE_PIXTRAL,
|
||||
PROJECTOR_TYPE_QWEN25VL,
|
||||
PROJECTOR_TYPE_INTERNVL,
|
||||
PROJECTOR_TYPE_UNKNOWN,
|
||||
};
|
||||
|
||||
@@ -119,6 +124,7 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
|
||||
{ PROJECTOR_TYPE_GEMMA3, "gemma3"},
|
||||
{ PROJECTOR_TYPE_IDEFICS3, "idefics3"},
|
||||
{ PROJECTOR_TYPE_PIXTRAL, "pixtral"},
|
||||
{ PROJECTOR_TYPE_INTERNVL, "internvl"},
|
||||
};
|
||||
|
||||
static projector_type clip_projector_type_from_string(const std::string & str) {
|
||||
|
||||
@@ -174,6 +174,10 @@ struct clip_hparams {
|
||||
int32_t n_layer;
|
||||
int32_t proj_scale_factor = 0; // idefics3
|
||||
|
||||
// for models using dynamic image size, we need to have a smaller image size to warmup
|
||||
// otherwise, user will get OOM everytime they load the model
|
||||
int32_t warmup_image_size = 0;
|
||||
|
||||
ffn_op_type ffn_op = FFN_GELU;
|
||||
|
||||
patch_merge_type mm_patch_merge_type = PATCH_MERGE_FLAT;
|
||||
@@ -201,6 +205,9 @@ struct clip_layer {
|
||||
ggml_tensor * o_w = nullptr;
|
||||
ggml_tensor * o_b = nullptr;
|
||||
|
||||
ggml_tensor * k_norm = nullptr;
|
||||
ggml_tensor * q_norm = nullptr;
|
||||
|
||||
// layernorm 1
|
||||
ggml_tensor * ln_1_w = nullptr;
|
||||
ggml_tensor * ln_1_b = nullptr;
|
||||
@@ -215,6 +222,10 @@ struct clip_layer {
|
||||
// layernorm 2
|
||||
ggml_tensor * ln_2_w = nullptr;
|
||||
ggml_tensor * ln_2_b = nullptr;
|
||||
|
||||
// layer scale (no bias)
|
||||
ggml_tensor * ls_1_w = nullptr;
|
||||
ggml_tensor * ls_2_w = nullptr;
|
||||
};
|
||||
|
||||
struct clip_vision_model {
|
||||
@@ -372,7 +383,7 @@ struct clip_ctx {
|
||||
backend_buft.push_back(ggml_backend_get_default_buffer_type(backend_cpu));
|
||||
|
||||
sched.reset(
|
||||
ggml_backend_sched_new(backend_ptrs.data(), backend_buft.data(), backend_ptrs.size(), 8192, false)
|
||||
ggml_backend_sched_new(backend_ptrs.data(), backend_buft.data(), backend_ptrs.size(), 8192, false, true)
|
||||
);
|
||||
}
|
||||
|
||||
@@ -589,6 +600,9 @@ struct clip_graph {
|
||||
|
||||
// Qwen2VL and Qwen2.5VL use M-RoPE
|
||||
ggml_cgraph * build_qwen2vl() {
|
||||
GGML_ASSERT(model.patch_bias == nullptr);
|
||||
GGML_ASSERT(model.class_embedding == nullptr);
|
||||
|
||||
const int batch_size = 1;
|
||||
const bool use_window_attn = hparams.n_wa_pattern > 0;
|
||||
const int n_wa_pattern = hparams.n_wa_pattern;
|
||||
@@ -625,10 +639,6 @@ struct clip_graph {
|
||||
n_embd, n_patches_x * n_patches_y, batch_size);
|
||||
}
|
||||
|
||||
if (model.patch_bias) {
|
||||
inp = ggml_add(ctx0, inp, model.patch_bias);
|
||||
}
|
||||
|
||||
ggml_tensor * inpL = inp;
|
||||
ggml_tensor * window_mask = nullptr;
|
||||
ggml_tensor * window_idx = nullptr;
|
||||
@@ -859,6 +869,67 @@ struct clip_graph {
|
||||
return gf;
|
||||
}
|
||||
|
||||
ggml_cgraph * build_internvl() {
|
||||
GGML_ASSERT(model.class_embedding != nullptr);
|
||||
GGML_ASSERT(model.position_embeddings != nullptr);
|
||||
|
||||
const int n_pos = n_patches + 1;
|
||||
ggml_tensor * inp = build_inp();
|
||||
|
||||
// add CLS token
|
||||
inp = ggml_concat(ctx0, inp, model.class_embedding, 1);
|
||||
|
||||
ggml_tensor * cur = build_vit(
|
||||
inp, n_pos,
|
||||
NORM_TYPE_NORMAL,
|
||||
hparams.ffn_op,
|
||||
model.position_embeddings,
|
||||
nullptr);
|
||||
|
||||
// remove CLS token
|
||||
cur = ggml_view_2d(ctx0, cur,
|
||||
n_embd, n_patches,
|
||||
ggml_row_size(cur->type, n_embd), 0);
|
||||
|
||||
// pixel shuffle
|
||||
{
|
||||
const int scale_factor = model.hparams.proj_scale_factor;
|
||||
const int bsz = 1; // batch size, always 1 for now since we don't support batching
|
||||
const int height = n_patches_y;
|
||||
const int width = n_patches_x;
|
||||
GGML_ASSERT(scale_factor > 0);
|
||||
cur = ggml_reshape_4d(ctx0, cur, n_embd * scale_factor, height / scale_factor, width, bsz);
|
||||
cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);
|
||||
cur = ggml_reshape_4d(ctx0, ggml_cont(ctx0, cur),
|
||||
n_embd * scale_factor * scale_factor,
|
||||
height / scale_factor,
|
||||
width / scale_factor,
|
||||
bsz);
|
||||
cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);
|
||||
// flatten to 2D
|
||||
cur = ggml_reshape_2d(ctx0, ggml_cont(ctx0, cur),
|
||||
n_embd * scale_factor * scale_factor,
|
||||
cur->ne[1] * cur->ne[2]);
|
||||
}
|
||||
|
||||
// projector (always using GELU activation)
|
||||
{
|
||||
// projector LayerNorm uses pytorch's default eps = 1e-5
|
||||
// ref: https://huggingface.co/OpenGVLab/InternVL3-8B-Instruct/blob/a34d3e4e129a5856abfd6aa6de79776484caa14e/modeling_internvl_chat.py#L79
|
||||
cur = build_norm(cur, model.mm_0_w, model.mm_0_b, NORM_TYPE_NORMAL, 1e-5, -1);
|
||||
cur = ggml_mul_mat(ctx0, model.mm_1_w, cur);
|
||||
cur = ggml_add(ctx0, cur, model.mm_1_b);
|
||||
cur = ggml_gelu(ctx0, cur);
|
||||
cur = ggml_mul_mat(ctx0, model.mm_3_w, cur);
|
||||
cur = ggml_add(ctx0, cur, model.mm_3_b);
|
||||
}
|
||||
|
||||
// build the graph
|
||||
ggml_build_forward_expand(gf, cur);
|
||||
|
||||
return gf;
|
||||
}
|
||||
|
||||
// this graph is used by llava, granite and glm
|
||||
// due to having embedding_stack (used by granite), we cannot reuse build_vit
|
||||
ggml_cgraph * build_llava() {
|
||||
@@ -890,10 +961,6 @@ struct clip_graph {
|
||||
|
||||
ggml_tensor * inp = build_inp();
|
||||
|
||||
if (model.patch_bias) {
|
||||
inp = ggml_add(ctx0, inp, model.patch_bias);
|
||||
}
|
||||
|
||||
// concat class_embeddings and patch_embeddings
|
||||
if (model.class_embedding) {
|
||||
inp = ggml_concat(ctx0, inp, model.class_embedding, 1);
|
||||
@@ -1260,11 +1327,6 @@ private:
|
||||
ggml_tensor * learned_pos_embd,
|
||||
std::function<ggml_tensor *(ggml_tensor *, const clip_layer &)> add_pos
|
||||
) {
|
||||
if (model.patch_bias) {
|
||||
inp = ggml_add(ctx0, inp, model.patch_bias);
|
||||
cb(inp, "patch_bias", -1);
|
||||
}
|
||||
|
||||
if (learned_pos_embd) {
|
||||
inp = ggml_add(ctx0, inp, learned_pos_embd);
|
||||
cb(inp, "pos_embed", -1);
|
||||
@@ -1304,6 +1366,16 @@ private:
|
||||
Vcur = ggml_add(ctx0, Vcur, layer.v_b);
|
||||
}
|
||||
|
||||
if (layer.q_norm) {
|
||||
Qcur = build_norm(Qcur, layer.q_norm, NULL, norm_t, eps, il);
|
||||
cb(Qcur, "Qcur_norm", il);
|
||||
}
|
||||
|
||||
if (layer.k_norm) {
|
||||
Kcur = build_norm(Kcur, layer.k_norm, NULL, norm_t, eps, il);
|
||||
cb(Kcur, "Kcur_norm", il);
|
||||
}
|
||||
|
||||
Qcur = ggml_reshape_3d(ctx0, Qcur, d_head, n_head, n_pos);
|
||||
Kcur = ggml_reshape_3d(ctx0, Kcur, d_head, n_head, n_pos);
|
||||
Vcur = ggml_reshape_3d(ctx0, Vcur, d_head, n_head, n_pos);
|
||||
@@ -1324,6 +1396,11 @@ private:
|
||||
cb(cur, "attn_out", il);
|
||||
}
|
||||
|
||||
if (layer.ls_1_w) {
|
||||
cur = ggml_mul(ctx0, cur, layer.ls_1_w);
|
||||
cb(cur, "attn_out_scaled", il);
|
||||
}
|
||||
|
||||
// re-add the layer input, e.g., residual
|
||||
cur = ggml_add(ctx0, cur, inpL);
|
||||
|
||||
@@ -1344,6 +1421,11 @@ private:
|
||||
|
||||
cb(cur, "ffn_out", il);
|
||||
|
||||
if (layer.ls_2_w) {
|
||||
cur = ggml_mul(ctx0, cur, layer.ls_2_w);
|
||||
cb(cur, "ffn_out_scaled", il);
|
||||
}
|
||||
|
||||
// residual 2
|
||||
cur = ggml_add(ctx0, inpL, cur);
|
||||
cb(cur, "layer_out", il);
|
||||
@@ -1365,6 +1447,10 @@ private:
|
||||
ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
|
||||
inp = ggml_reshape_2d(ctx0, inp, n_patches, n_embd);
|
||||
inp = ggml_cont(ctx0, ggml_transpose(ctx0, inp));
|
||||
if (model.patch_bias) {
|
||||
inp = ggml_add(ctx0, inp, model.patch_bias);
|
||||
cb(inp, "patch_bias", -1);
|
||||
}
|
||||
return inp;
|
||||
}
|
||||
|
||||
@@ -1627,6 +1713,10 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
|
||||
{
|
||||
res = graph.build_minicpmv();
|
||||
} break;
|
||||
case PROJECTOR_TYPE_INTERNVL:
|
||||
{
|
||||
res = graph.build_internvl();
|
||||
} break;
|
||||
default:
|
||||
{
|
||||
res = graph.build_llava();
|
||||
@@ -1723,6 +1813,9 @@ struct clip_model_loader {
|
||||
get_u32(KEY_IMAGE_CROP_RESOLUTION, hparams.image_crop_resolution, false);
|
||||
get_arr_int(KEY_IMAGE_GRID_PINPOINTS, hparams.image_grid_pinpoints, false);
|
||||
|
||||
// default warmup value
|
||||
hparams.warmup_image_size = hparams.image_size;
|
||||
|
||||
ctx_clip.has_llava_projector = ctx_clip.proj_type == PROJECTOR_TYPE_MLP
|
||||
|| ctx_clip.proj_type == PROJECTOR_TYPE_MLP_NORM
|
||||
|| ctx_clip.proj_type == PROJECTOR_TYPE_LDP
|
||||
@@ -1790,12 +1883,14 @@ struct clip_model_loader {
|
||||
}
|
||||
} break;
|
||||
case PROJECTOR_TYPE_IDEFICS3:
|
||||
case PROJECTOR_TYPE_INTERNVL:
|
||||
{
|
||||
get_u32(KEY_PROJ_SCALE_FACTOR, hparams.proj_scale_factor, false);
|
||||
} break;
|
||||
case PROJECTOR_TYPE_PIXTRAL:
|
||||
{
|
||||
hparams.rope_theta = 10000.0f;
|
||||
hparams.warmup_image_size = hparams.patch_size * 8;
|
||||
get_u32(KEY_SPATIAL_MERGE_SIZE, hparams.spatial_merge_size, false);
|
||||
} break;
|
||||
case PROJECTOR_TYPE_GEMMA3:
|
||||
@@ -1806,8 +1901,19 @@ struct clip_model_loader {
|
||||
// test model (tinygemma3) has a different value, we optionally read it
|
||||
get_u32(KEY_PROJ_SCALE_FACTOR, hparams.proj_scale_factor, false);
|
||||
} break;
|
||||
case PROJECTOR_TYPE_QWEN2VL:
|
||||
{
|
||||
// max image size = sqrt(max_pixels)
|
||||
// https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct/blob/main/preprocessor_config.json
|
||||
hparams.image_size = 3584;
|
||||
hparams.warmup_image_size = hparams.patch_size * 8;
|
||||
} break;
|
||||
case PROJECTOR_TYPE_QWEN25VL:
|
||||
{
|
||||
// max image size = sqrt(max_pixels)
|
||||
// https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct/blob/main/preprocessor_config.json
|
||||
hparams.image_size = 3584;
|
||||
hparams.warmup_image_size = hparams.patch_size * 8;
|
||||
get_u32(KEY_WIN_ATTN_PATTERN, hparams.n_wa_pattern);
|
||||
} break;
|
||||
default:
|
||||
@@ -1895,8 +2001,13 @@ struct clip_model_loader {
|
||||
layer.q_w = get_tensor(string_format(TN_ATTN_Q, "v", il, "weight"));
|
||||
layer.v_w = get_tensor(string_format(TN_ATTN_V, "v", il, "weight"));
|
||||
layer.o_w = get_tensor(string_format(TN_ATTN_OUTPUT, "v", il, "weight"));
|
||||
layer.k_norm = get_tensor(string_format(TN_ATTN_K_NORM, "v", il, "weight"), false);
|
||||
layer.q_norm = get_tensor(string_format(TN_ATTN_Q_NORM, "v", il, "weight"), false);
|
||||
layer.ln_1_w = get_tensor(string_format(TN_LN_1, "v", il, "weight"), false);
|
||||
layer.ln_2_w = get_tensor(string_format(TN_LN_2, "v", il, "weight"), false);
|
||||
layer.ls_1_w = get_tensor(string_format(TN_LS_1, "v", il, "weight"), false); // no bias
|
||||
layer.ls_2_w = get_tensor(string_format(TN_LS_2, "v", il, "weight"), false); // no bias
|
||||
|
||||
layer.k_b = get_tensor(string_format(TN_ATTN_K, "v", il, "bias"), false);
|
||||
layer.q_b = get_tensor(string_format(TN_ATTN_Q, "v", il, "bias"), false);
|
||||
layer.v_b = get_tensor(string_format(TN_ATTN_V, "v", il, "bias"), false);
|
||||
@@ -1904,7 +2015,7 @@ struct clip_model_loader {
|
||||
layer.ln_1_b = get_tensor(string_format(TN_LN_1, "v", il, "bias"), false);
|
||||
layer.ln_2_b = get_tensor(string_format(TN_LN_2, "v", il, "bias"), false);
|
||||
|
||||
// new naming
|
||||
// ffn
|
||||
layer.ff_up_w = get_tensor(string_format(TN_FFN_UP, "v", il, "weight"));
|
||||
layer.ff_up_b = get_tensor(string_format(TN_FFN_UP, "v", il, "bias"), false);
|
||||
layer.ff_gate_w = get_tensor(string_format(TN_FFN_GATE, "v", il, "weight"), false);
|
||||
@@ -2052,6 +2163,15 @@ struct clip_model_loader {
|
||||
vision_model.mm_input_norm_w = get_tensor(TN_MM_INP_NORM, false);
|
||||
vision_model.mm_patch_merger_w = get_tensor(TN_MM_PATCH_MERGER, false);
|
||||
} break;
|
||||
case PROJECTOR_TYPE_INTERNVL:
|
||||
{
|
||||
vision_model.mm_0_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 0, "weight"));
|
||||
vision_model.mm_0_b = get_tensor(string_format(TN_MVLM_PROJ_MLP, 0, "bias"));
|
||||
vision_model.mm_1_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 1, "weight"));
|
||||
vision_model.mm_1_b = get_tensor(string_format(TN_MVLM_PROJ_MLP, 1, "bias"));
|
||||
vision_model.mm_3_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 3, "weight"));
|
||||
vision_model.mm_3_b = get_tensor(string_format(TN_MVLM_PROJ_MLP, 3, "bias"));
|
||||
} break;
|
||||
default:
|
||||
GGML_ASSERT(false && "unknown projector type");
|
||||
}
|
||||
@@ -2099,13 +2219,14 @@ struct clip_model_loader {
|
||||
// create a fake batch
|
||||
clip_image_f32_batch batch;
|
||||
clip_image_f32_ptr img(clip_image_f32_init());
|
||||
img->nx = ctx_clip.vision_model.hparams.image_size;
|
||||
img->ny = ctx_clip.vision_model.hparams.image_size;
|
||||
img->nx = ctx_clip.vision_model.hparams.warmup_image_size;
|
||||
img->ny = ctx_clip.vision_model.hparams.warmup_image_size;
|
||||
img->buf.resize(img->nx * img->ny * 3);
|
||||
batch.entries.push_back(std::move(img));
|
||||
|
||||
ggml_cgraph * gf = clip_image_build_graph(&ctx_clip, batch);
|
||||
ggml_backend_sched_reserve(ctx_clip.sched.get(), gf);
|
||||
|
||||
for (size_t i = 0; i < ctx_clip.backend_ptrs.size(); ++i) {
|
||||
ggml_backend_t backend = ctx_clip.backend_ptrs[i];
|
||||
ggml_backend_buffer_type_t buft = ctx_clip.backend_buft[i];
|
||||
@@ -2504,8 +2625,8 @@ struct image_manipulation {
|
||||
float target_width_f = static_cast<float>(inp_size.width) * scale;
|
||||
float target_height_f = static_cast<float>(inp_size.height) * scale;
|
||||
|
||||
int aligned_width = GGML_PAD((int)target_width_f, align_size);
|
||||
int aligned_height = GGML_PAD((int)target_height_f, align_size);
|
||||
int aligned_width = CLIP_ALIGN((int)target_width_f, align_size);
|
||||
int aligned_height = CLIP_ALIGN((int)target_height_f, align_size);
|
||||
|
||||
return {aligned_width, aligned_height};
|
||||
}
|
||||
@@ -2824,10 +2945,9 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
|
||||
}
|
||||
else if (ctx->proj_type == PROJECTOR_TYPE_QWEN2VL || ctx->proj_type == PROJECTOR_TYPE_QWEN25VL) {
|
||||
clip_image_u8 resized;
|
||||
auto patch_size = clip_get_patch_size(ctx) * 2;
|
||||
int nx = ceil((float)img->nx / patch_size) * patch_size;
|
||||
int ny = ceil((float)img->ny / patch_size) * patch_size;
|
||||
image_manipulation::bicubic_resize(*img, resized, nx, ny);
|
||||
auto patch_size = params.patch_size * 2;
|
||||
auto new_size = image_manipulation::calc_size_preserved_ratio(original_size, patch_size, params.image_size);
|
||||
image_manipulation::bicubic_resize(*img, resized, new_size.width, new_size.height);
|
||||
|
||||
clip_image_f32_ptr img_f32(clip_image_f32_init());
|
||||
// clip_image_f32_ptr res(clip_image_f32_init());
|
||||
@@ -2838,7 +2958,9 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
|
||||
}
|
||||
else if (ctx->proj_type == PROJECTOR_TYPE_GLM_EDGE
|
||||
|| ctx->proj_type == PROJECTOR_TYPE_GEMMA3
|
||||
|| ctx->proj_type == PROJECTOR_TYPE_IDEFICS3) {
|
||||
|| ctx->proj_type == PROJECTOR_TYPE_IDEFICS3
|
||||
|| ctx->proj_type == PROJECTOR_TYPE_INTERNVL // TODO @ngxson : support dynamic resolution
|
||||
) {
|
||||
clip_image_u8 resized_image;
|
||||
int sz = params.image_size;
|
||||
image_manipulation::resize_and_pad_image(*img, resized_image, {sz, sz});
|
||||
@@ -2988,9 +3110,13 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
|
||||
|
||||
int n_patches = (params.image_size / params.patch_size) * (params.image_size / params.patch_size);
|
||||
|
||||
if (ctx->proj_type == PROJECTOR_TYPE_LDP || ctx->proj_type == PROJECTOR_TYPE_LDPV2 || ctx->proj_type == PROJECTOR_TYPE_GLM_EDGE) {
|
||||
if (ctx->proj_type == PROJECTOR_TYPE_LDP
|
||||
|| ctx->proj_type == PROJECTOR_TYPE_LDPV2
|
||||
|| ctx->proj_type == PROJECTOR_TYPE_GLM_EDGE) {
|
||||
n_patches /= 4;
|
||||
n_patches += 2; // for BOI and EOI token embeddings
|
||||
if (ctx->vision_model.mm_glm_tok_boi) {
|
||||
n_patches += 2; // for BOI and EOI token embeddings
|
||||
}
|
||||
} else if (ctx->proj_type == PROJECTOR_TYPE_MINICPMV) {
|
||||
if (ctx->minicpmv_version == 2) {
|
||||
n_patches = 96;
|
||||
@@ -3013,7 +3139,8 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
|
||||
int n_per_side = params.image_size / params.patch_size;
|
||||
int n_per_side_2d_pool = n_per_side / params.proj_scale_factor;
|
||||
n_patches = n_per_side_2d_pool * n_per_side_2d_pool;
|
||||
} else if (ctx->proj_type == PROJECTOR_TYPE_IDEFICS3) {
|
||||
} else if (ctx->proj_type == PROJECTOR_TYPE_IDEFICS3 || ctx->proj_type == PROJECTOR_TYPE_INTERNVL) {
|
||||
// both W and H are divided by proj_scale_factor
|
||||
n_patches /= (params.proj_scale_factor * params.proj_scale_factor);
|
||||
} else if (ctx->proj_type == PROJECTOR_TYPE_PIXTRAL) {
|
||||
int n_merge = params.spatial_merge_size;
|
||||
@@ -3408,6 +3535,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
|
||||
} break;
|
||||
case PROJECTOR_TYPE_GEMMA3:
|
||||
case PROJECTOR_TYPE_IDEFICS3:
|
||||
case PROJECTOR_TYPE_INTERNVL:
|
||||
{
|
||||
// do nothing
|
||||
} break;
|
||||
@@ -3434,6 +3562,14 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
|
||||
// the last node is the embedding tensor
|
||||
ggml_tensor * embeddings = ggml_graph_node(gf, -1);
|
||||
|
||||
// sanity check (only support batch size of 1 for now)
|
||||
const int n_tokens_out = embeddings->ne[1];
|
||||
const int expected_n_tokens_out = clip_n_output_tokens(ctx, imgs.entries[0].get());
|
||||
if (n_tokens_out != expected_n_tokens_out) {
|
||||
LOG_ERR("%s: expected %d tokens, got %d\n", __func__, expected_n_tokens_out, n_tokens_out);
|
||||
GGML_ABORT("Invalid number of output tokens");
|
||||
}
|
||||
|
||||
// copy the embeddings to the location passed by the user
|
||||
ggml_backend_tensor_get(embeddings, vec, 0, ggml_nbytes(embeddings));
|
||||
|
||||
@@ -3604,6 +3740,8 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
|
||||
return ctx->vision_model.mm_input_proj_w->ne[0];
|
||||
case PROJECTOR_TYPE_IDEFICS3:
|
||||
return ctx->vision_model.projection->ne[1];
|
||||
case PROJECTOR_TYPE_INTERNVL:
|
||||
return ctx->vision_model.mm_3_w->ne[1];
|
||||
default:
|
||||
GGML_ABORT("Unknown projector type");
|
||||
}
|
||||
|
||||
310
tools/mtmd/mtmd-helper.cpp
Normal file
310
tools/mtmd/mtmd-helper.cpp
Normal file
@@ -0,0 +1,310 @@
|
||||
#include "mtmd.h"
|
||||
#include "llama.h"
|
||||
|
||||
#include <algorithm>
|
||||
#include <cinttypes>
|
||||
#include <vector>
|
||||
|
||||
#define LOG_INF(...) fprintf(stdout, __VA_ARGS__)
|
||||
#define LOG_ERR(...) fprintf(stderr, __VA_ARGS__)
|
||||
|
||||
size_t mtmd_helper_get_n_tokens(const mtmd_input_chunks * chunks) {
|
||||
size_t n_tokens = 0;
|
||||
for (size_t i = 0; i < mtmd_input_chunks_size(chunks); i++) {
|
||||
auto chunk = mtmd_input_chunks_get(chunks, i);
|
||||
auto chunk_type = mtmd_input_chunk_get_type(chunk);
|
||||
if (chunk_type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
|
||||
size_t n_tokens_text;
|
||||
mtmd_input_chunk_get_tokens_text(chunk, &n_tokens_text);
|
||||
n_tokens += n_tokens_text;
|
||||
} else if (chunk_type == MTMD_INPUT_CHUNK_TYPE_IMAGE) {
|
||||
auto tokens_image = mtmd_input_chunk_get_tokens_image(chunk);
|
||||
n_tokens += mtmd_image_tokens_get_n_tokens(tokens_image);
|
||||
} else {
|
||||
GGML_ASSERT(false && "chunk type not supported");
|
||||
}
|
||||
}
|
||||
return n_tokens;
|
||||
}
|
||||
|
||||
llama_pos mtmd_helper_get_n_pos(const mtmd_input_chunks * chunks) {
|
||||
llama_pos n_pos = 0;
|
||||
for (size_t i = 0; i < mtmd_input_chunks_size(chunks); i++) {
|
||||
auto chunk = mtmd_input_chunks_get(chunks, i);
|
||||
auto chunk_type = mtmd_input_chunk_get_type(chunk);
|
||||
if (chunk_type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
|
||||
size_t n_tokens_text;
|
||||
mtmd_input_chunk_get_tokens_text(chunk, &n_tokens_text);
|
||||
n_pos += n_tokens_text;
|
||||
} else if (chunk_type == MTMD_INPUT_CHUNK_TYPE_IMAGE) {
|
||||
auto tokens_image = mtmd_input_chunk_get_tokens_image(chunk);
|
||||
n_pos += mtmd_image_tokens_get_n_pos(tokens_image);
|
||||
} else {
|
||||
GGML_ASSERT(false && "chunk type not supported");
|
||||
}
|
||||
}
|
||||
return n_pos;
|
||||
}
|
||||
|
||||
// helper struct to make working with embd batch easier
|
||||
// note: this will be removed after llama_batch_ext refactoring
|
||||
struct decode_embd_batch {
|
||||
int n_pos_per_embd;
|
||||
int n_mmproj_embd;
|
||||
std::vector<llama_pos> pos;
|
||||
std::vector<llama_pos> pos_view; // used by mrope
|
||||
std::vector<int32_t> n_seq_id;
|
||||
std::vector<llama_seq_id> seq_id_0;
|
||||
std::vector<llama_seq_id *> seq_ids;
|
||||
std::vector<int8_t> logits;
|
||||
llama_batch batch;
|
||||
decode_embd_batch(float * embd, int32_t n_tokens, int n_pos_per_embd, int n_mmproj_embd) : n_pos_per_embd(n_pos_per_embd), n_mmproj_embd(n_mmproj_embd) {
|
||||
pos .resize(n_tokens * n_pos_per_embd);
|
||||
n_seq_id.resize(n_tokens);
|
||||
seq_ids .resize(n_tokens + 1);
|
||||
logits .resize(n_tokens);
|
||||
seq_id_0.resize(1);
|
||||
seq_ids [n_tokens] = nullptr;
|
||||
batch = {
|
||||
/*n_tokens =*/ n_tokens,
|
||||
/*tokens =*/ nullptr,
|
||||
/*embd =*/ embd,
|
||||
/*pos =*/ pos.data(),
|
||||
/*n_seq_id =*/ n_seq_id.data(),
|
||||
/*seq_id =*/ seq_ids.data(),
|
||||
/*logits =*/ logits.data(),
|
||||
};
|
||||
}
|
||||
|
||||
void set_position_normal(llama_pos pos_0, llama_seq_id seq_id) {
|
||||
seq_id_0[0] = seq_id;
|
||||
for (int i = 0; i < batch.n_tokens; i++) {
|
||||
batch.pos [i] = pos_0 + i;
|
||||
batch.n_seq_id[i] = 1;
|
||||
batch.seq_id [i] = seq_id_0.data();
|
||||
batch.logits [i] = false;
|
||||
}
|
||||
}
|
||||
|
||||
void set_position_mrope(llama_pos pos_0, int nx, int ny, llama_seq_id seq_id) {
|
||||
GGML_ASSERT(n_pos_per_embd == 4);
|
||||
seq_id_0[0] = seq_id;
|
||||
for (int y = 0; y < ny; y++) {
|
||||
for (int x = 0; x < nx; x++) {
|
||||
int i = y * nx + x;
|
||||
pos[i ] = pos_0;
|
||||
pos[i + batch.n_tokens ] = pos_0 + y;
|
||||
pos[i + batch.n_tokens * 2] = pos_0 + x;
|
||||
pos[i + batch.n_tokens * 3] = 0; // last pos dim is unused
|
||||
}
|
||||
}
|
||||
for (int i = 0; i < batch.n_tokens; i++) {
|
||||
batch.n_seq_id[i] = 1;
|
||||
batch.seq_id [i] = seq_id_0.data();
|
||||
batch.logits [i] = false;
|
||||
}
|
||||
}
|
||||
|
||||
llama_batch get_view(int offset, int n_tokens) {
|
||||
llama_pos * pos_ptr;
|
||||
pos_view.clear();
|
||||
pos_view.reserve(n_tokens * n_pos_per_embd);
|
||||
if (n_pos_per_embd > 1) {
|
||||
// mrope
|
||||
// for example, with layout of src: 1234...1234...1234...1234...
|
||||
// offset 2 will give us dst: 34...34...34...34...
|
||||
for (int i = 0; i < n_pos_per_embd; i++) {
|
||||
// assume n_tokens is less than or equal to batch.n_tokens
|
||||
// batch.n_tokens is number of **total** tokens
|
||||
// n_tokens is number of viewed token
|
||||
size_t src_idx = i * batch.n_tokens + offset;
|
||||
pos_view.insert(pos_view.end(),
|
||||
pos.data() + src_idx,
|
||||
pos.data() + src_idx + n_tokens);
|
||||
}
|
||||
pos_ptr = pos_view.data();
|
||||
} else {
|
||||
// normal
|
||||
pos_ptr = pos.data() + offset;
|
||||
}
|
||||
return {
|
||||
/*n_tokens =*/ n_tokens,
|
||||
/*tokens =*/ nullptr,
|
||||
/*embd =*/ batch.embd + offset * n_mmproj_embd,
|
||||
/*pos =*/ pos_ptr,
|
||||
/*n_seq_id =*/ batch.n_seq_id + offset,
|
||||
/*seq_id =*/ batch.seq_id + offset,
|
||||
/*logits =*/ batch.logits + offset,
|
||||
};
|
||||
}
|
||||
};
|
||||
|
||||
// Helper function for decoding an image whose embeddings have already been calculated
|
||||
int32_t mtmd_helper_decode_image_chunk(
|
||||
mtmd_context * ctx,
|
||||
struct llama_context * lctx,
|
||||
const mtmd_input_chunk * chunk,
|
||||
float * encoded_embd,
|
||||
llama_pos n_past,
|
||||
llama_seq_id seq_id,
|
||||
int32_t n_batch,
|
||||
llama_pos * new_n_past) {
|
||||
if (mtmd_input_chunk_get_type(chunk) != MTMD_INPUT_CHUNK_TYPE_IMAGE) {
|
||||
LOG_ERR("failed to decode image chunk: input chunk not of image type\n");
|
||||
return -1;
|
||||
}
|
||||
const auto image_tokens = mtmd_input_chunk_get_tokens_image(chunk);
|
||||
if (!image_tokens) {
|
||||
LOG_ERR("failed to decode image chunk: image tokens are null\n");
|
||||
return -1;
|
||||
}
|
||||
|
||||
const llama_model * model = llama_get_model(lctx);
|
||||
int n_mmproj_embd = llama_model_n_embd(model);
|
||||
int n_pos_per_embd = mtmd_decode_use_mrope(ctx) ? 4 : 1;
|
||||
|
||||
int32_t n_tokens = mtmd_image_tokens_get_n_tokens(image_tokens);
|
||||
int32_t i_batch = 0;
|
||||
int32_t n_img_batches = GGML_PAD(n_tokens, n_batch) / n_batch;
|
||||
decode_embd_batch batch_embd(encoded_embd, n_tokens, n_pos_per_embd, n_mmproj_embd);
|
||||
|
||||
const int nx = mtmd_image_tokens_get_nx(image_tokens);
|
||||
const int ny = mtmd_image_tokens_get_ny(image_tokens);
|
||||
|
||||
if (mtmd_decode_use_mrope(ctx)) {
|
||||
batch_embd.set_position_mrope(n_past, nx, ny, seq_id);
|
||||
} else {
|
||||
batch_embd.set_position_normal(n_past, seq_id);
|
||||
}
|
||||
|
||||
if (mtmd_decode_use_non_causal(ctx)) {
|
||||
llama_set_causal_attn(lctx, false);
|
||||
// TODO @ngxson : need to make sure only one image is processed at a time, and n_ubatch must be enough to hold the image
|
||||
}
|
||||
|
||||
while (i_batch < n_img_batches) { // split into batches
|
||||
int pos_offset = i_batch*n_batch;
|
||||
int n_tokens_batch = std::min(n_batch, n_tokens - pos_offset);
|
||||
llama_batch batch_embd_view = batch_embd.get_view(pos_offset, n_tokens_batch);
|
||||
|
||||
LOG_INF("decoding image batch %d/%d, n_tokens_batch = %d\n", i_batch+1, n_img_batches, n_tokens_batch);
|
||||
|
||||
int64_t t1 = ggml_time_ms();
|
||||
int32_t ret = llama_decode(lctx, batch_embd_view);
|
||||
if (ret != 0) {
|
||||
LOG_ERR("failed to decode image\n");
|
||||
llama_set_causal_attn(lctx, true); // restore causal attn
|
||||
return ret;
|
||||
}
|
||||
|
||||
LOG_INF("image decoded (batch %d/%d) in %" PRId64 " ms\n", i_batch+1, n_img_batches, ggml_time_ms() - t1);
|
||||
|
||||
i_batch++;
|
||||
}
|
||||
|
||||
n_past += mtmd_image_tokens_get_n_pos(image_tokens);
|
||||
*new_n_past = n_past;
|
||||
|
||||
if (mtmd_decode_use_non_causal(ctx)) {
|
||||
llama_set_causal_attn(lctx, true);
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
int32_t mtmd_helper_eval_chunk_single(mtmd_context * ctx,
|
||||
struct llama_context * lctx,
|
||||
const mtmd_input_chunk * chunk,
|
||||
llama_pos n_past,
|
||||
llama_seq_id seq_id,
|
||||
int32_t n_batch,
|
||||
bool logits_last,
|
||||
llama_pos * new_n_past) {
|
||||
int32_t ret;
|
||||
llama_batch text_batch = llama_batch_init(n_batch, 0, 1);
|
||||
auto chunk_type = mtmd_input_chunk_get_type(chunk);
|
||||
|
||||
if (chunk_type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
|
||||
size_t n_tokens;
|
||||
const auto tokens = mtmd_input_chunk_get_tokens_text(chunk, &n_tokens);
|
||||
// LOG_INF("decoding text chunk, n_tokens = %zu\n", n_tokens);
|
||||
size_t i = 0;
|
||||
while (i < n_tokens) { // split into batches
|
||||
text_batch.n_tokens = 0; // clear the batch
|
||||
for (; i < n_tokens && text_batch.n_tokens < n_batch; i++) {
|
||||
text_batch.n_tokens++;
|
||||
text_batch.token [i] = tokens[i];
|
||||
text_batch.pos [i] = n_past++;
|
||||
text_batch.n_seq_id[i] = 1;
|
||||
text_batch.seq_id [i][0] = seq_id;
|
||||
text_batch.logits [i] = false;
|
||||
}
|
||||
bool is_last_token = (i == n_tokens);
|
||||
if (logits_last && is_last_token) {
|
||||
text_batch.logits[text_batch.n_tokens - 1] = true;
|
||||
}
|
||||
ret = llama_decode(lctx, text_batch);
|
||||
if (ret != 0) {
|
||||
LOG_ERR("failed to decode text\n");
|
||||
llama_batch_free(text_batch);
|
||||
return ret;
|
||||
}
|
||||
*new_n_past += text_batch.n_tokens;
|
||||
}
|
||||
|
||||
} else if (chunk_type == MTMD_INPUT_CHUNK_TYPE_IMAGE) {
|
||||
const auto image_tokens = mtmd_input_chunk_get_tokens_image(chunk);
|
||||
int64_t t0 = ggml_time_ms();
|
||||
|
||||
LOG_INF("encoding image or slice...\n");
|
||||
|
||||
ret = mtmd_encode(ctx, image_tokens);
|
||||
if (ret != 0) {
|
||||
LOG_ERR("failed to encode image\n");
|
||||
llama_batch_free(text_batch);
|
||||
return ret;
|
||||
}
|
||||
|
||||
LOG_INF("image/slice encoded in %" PRId64 " ms\n", ggml_time_ms() - t0);
|
||||
|
||||
float * embd = mtmd_get_output_embd(ctx);
|
||||
ret = mtmd_helper_decode_image_chunk(ctx, lctx, chunk, embd, n_past, seq_id, n_batch, new_n_past);
|
||||
if (ret != 0) {
|
||||
LOG_ERR("failed to decode image\n");
|
||||
llama_batch_free(text_batch);
|
||||
return ret;
|
||||
}
|
||||
} else {
|
||||
GGML_ABORT("chunk type not supported");
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int32_t mtmd_helper_eval_chunks(mtmd_context * ctx,
|
||||
struct llama_context * lctx,
|
||||
const mtmd_input_chunks * chunks,
|
||||
llama_pos n_past,
|
||||
llama_seq_id seq_id,
|
||||
int32_t n_batch,
|
||||
bool logits_last,
|
||||
llama_pos * new_n_past) {
|
||||
size_t n_chunks = mtmd_input_chunks_size(chunks);
|
||||
if (n_chunks == 0) {
|
||||
LOG_ERR("no chunks to eval\n");
|
||||
return 0;
|
||||
}
|
||||
|
||||
for (size_t i = 0; i < n_chunks; i++) {
|
||||
bool chunk_logits_last = (i == n_chunks - 1) && logits_last;
|
||||
auto chunk = mtmd_input_chunks_get(chunks, i);
|
||||
|
||||
int32_t res = mtmd_helper_eval_chunk_single(ctx, lctx, chunk, n_past, seq_id, n_batch, chunk_logits_last, &n_past);
|
||||
if (res != 0) {
|
||||
LOG_ERR("failed to eval chunk %zu\n", i);
|
||||
return res;
|
||||
}
|
||||
*new_n_past = n_past;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
@@ -252,6 +252,13 @@ int32_t mtmd_tokenize(mtmd_context * ctx,
|
||||
|
||||
}
|
||||
|
||||
else if (proj_type == PROJECTOR_TYPE_INTERNVL) {
|
||||
// <img> ... (image embeddings) ... </img>
|
||||
marker_modified = "<img>" + ctx->image_marker + "</img>";
|
||||
string_replace_all(prompt_modified, ctx->image_marker, marker_modified);
|
||||
|
||||
}
|
||||
|
||||
// llava-1.5, llava-1.6, Yi-VL, Yi-34B, granite: don't need to add prefix and suffix
|
||||
// for glm-edge, BOI and EOI token's embeddings are not present in the text model
|
||||
|
||||
@@ -454,307 +461,26 @@ float * mtmd_get_output_embd(mtmd_context * ctx) {
|
||||
return ctx->image_embd_v.data();
|
||||
}
|
||||
|
||||
size_t mtmd_helper_get_n_tokens(const mtmd_input_chunks * chunks) {
|
||||
size_t n_tokens = 0;
|
||||
for (size_t i = 0; i < mtmd_input_chunks_size(chunks); i++) {
|
||||
auto chunk = mtmd_input_chunks_get(chunks, i);
|
||||
auto chunk_type = mtmd_input_chunk_get_type(chunk);
|
||||
if (chunk_type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
|
||||
size_t n_tokens_text;
|
||||
mtmd_input_chunk_get_tokens_text(chunk, &n_tokens_text);
|
||||
n_tokens += n_tokens_text;
|
||||
} else if (chunk_type == MTMD_INPUT_CHUNK_TYPE_IMAGE) {
|
||||
auto tokens_image = mtmd_input_chunk_get_tokens_image(chunk);
|
||||
n_tokens += mtmd_image_tokens_get_n_tokens(tokens_image);
|
||||
} else {
|
||||
GGML_ASSERT(false && "chunk type not supported");
|
||||
}
|
||||
bool mtmd_decode_use_non_causal(mtmd_context * ctx) {
|
||||
projector_type proj_type = clip_get_projector_type(ctx->ctx_clip);
|
||||
if (proj_type == PROJECTOR_TYPE_GEMMA3) {
|
||||
return true;
|
||||
}
|
||||
return n_tokens;
|
||||
return false;
|
||||
}
|
||||
|
||||
llama_pos mtmd_helper_get_n_pos(const mtmd_input_chunks * chunks) {
|
||||
llama_pos n_pos = 0;
|
||||
for (size_t i = 0; i < mtmd_input_chunks_size(chunks); i++) {
|
||||
auto chunk = mtmd_input_chunks_get(chunks, i);
|
||||
auto chunk_type = mtmd_input_chunk_get_type(chunk);
|
||||
if (chunk_type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
|
||||
size_t n_tokens_text;
|
||||
mtmd_input_chunk_get_tokens_text(chunk, &n_tokens_text);
|
||||
n_pos += n_tokens_text;
|
||||
} else if (chunk_type == MTMD_INPUT_CHUNK_TYPE_IMAGE) {
|
||||
auto tokens_image = mtmd_input_chunk_get_tokens_image(chunk);
|
||||
n_pos += mtmd_image_tokens_get_n_pos(tokens_image);
|
||||
} else {
|
||||
GGML_ASSERT(false && "chunk type not supported");
|
||||
}
|
||||
}
|
||||
return n_pos;
|
||||
bool mtmd_decode_use_mrope(mtmd_context * ctx) {
|
||||
return ctx->use_mrope;
|
||||
}
|
||||
|
||||
// helper struct to make working with embd batch easier
|
||||
// note: this will be removed after llama_batch_ext refactoring
|
||||
struct decode_embd_batch {
|
||||
int n_pos_per_embd;
|
||||
int n_mmproj_embd;
|
||||
std::vector<llama_pos> pos;
|
||||
std::vector<llama_pos> pos_view; // used by mrope
|
||||
std::vector<int32_t> n_seq_id;
|
||||
std::vector<llama_seq_id> seq_id_0;
|
||||
std::vector<llama_seq_id *> seq_ids;
|
||||
std::vector<int8_t> logits;
|
||||
llama_batch batch;
|
||||
decode_embd_batch(float * embd, int32_t n_tokens, int n_pos_per_embd, int n_mmproj_embd) : n_pos_per_embd(n_pos_per_embd), n_mmproj_embd(n_mmproj_embd) {
|
||||
pos .resize(n_tokens * n_pos_per_embd);
|
||||
n_seq_id.resize(n_tokens);
|
||||
seq_ids .resize(n_tokens + 1);
|
||||
logits .resize(n_tokens);
|
||||
seq_id_0.resize(1);
|
||||
seq_ids [n_tokens] = nullptr;
|
||||
batch = {
|
||||
/*n_tokens =*/ n_tokens,
|
||||
/*tokens =*/ nullptr,
|
||||
/*embd =*/ embd,
|
||||
/*pos =*/ pos.data(),
|
||||
/*n_seq_id =*/ n_seq_id.data(),
|
||||
/*seq_id =*/ seq_ids.data(),
|
||||
/*logits =*/ logits.data(),
|
||||
};
|
||||
}
|
||||
|
||||
void set_position_normal(llama_pos pos_0, llama_seq_id seq_id) {
|
||||
seq_id_0[0] = seq_id;
|
||||
for (int i = 0; i < batch.n_tokens; i++) {
|
||||
batch.pos [i] = pos_0 + i;
|
||||
batch.n_seq_id[i] = 1;
|
||||
batch.seq_id [i] = seq_id_0.data();
|
||||
batch.logits [i] = false;
|
||||
}
|
||||
}
|
||||
|
||||
void set_position_mrope(llama_pos pos_0, int nx, int ny, llama_seq_id seq_id) {
|
||||
GGML_ASSERT(n_pos_per_embd == 4);
|
||||
seq_id_0[0] = seq_id;
|
||||
for (int y = 0; y < ny; y++) {
|
||||
for (int x = 0; x < nx; x++) {
|
||||
int i = y * nx + x;
|
||||
pos[i ] = pos_0;
|
||||
pos[i + batch.n_tokens ] = pos_0 + y;
|
||||
pos[i + batch.n_tokens * 2] = pos_0 + x;
|
||||
pos[i + batch.n_tokens * 3] = 0; // last pos dim is unused
|
||||
}
|
||||
}
|
||||
for (int i = 0; i < batch.n_tokens; i++) {
|
||||
batch.n_seq_id[i] = 1;
|
||||
batch.seq_id [i] = seq_id_0.data();
|
||||
batch.logits [i] = false;
|
||||
}
|
||||
}
|
||||
|
||||
llama_batch get_view(int offset, int n_tokens) {
|
||||
llama_pos * pos_ptr;
|
||||
pos_view.clear();
|
||||
pos_view.reserve(n_tokens * n_pos_per_embd);
|
||||
if (n_pos_per_embd > 1) {
|
||||
// mrope
|
||||
// for example, with layout of src: 1234...1234...1234...1234...
|
||||
// offset 2 will give us dst: 34...34...34...34...
|
||||
for (int i = 0; i < n_pos_per_embd; i++) {
|
||||
// assume n_tokens is less than or equal to batch.n_tokens
|
||||
// batch.n_tokens is number of **total** tokens
|
||||
// n_tokens is number of viewed token
|
||||
size_t src_idx = i * batch.n_tokens + offset;
|
||||
pos_view.insert(pos_view.end(),
|
||||
pos.data() + src_idx,
|
||||
pos.data() + src_idx + n_tokens);
|
||||
}
|
||||
pos_ptr = pos_view.data();
|
||||
} else {
|
||||
// normal
|
||||
pos_ptr = pos.data() + offset;
|
||||
}
|
||||
return {
|
||||
/*n_tokens =*/ n_tokens,
|
||||
/*tokens =*/ nullptr,
|
||||
/*embd =*/ batch.embd + offset * n_mmproj_embd,
|
||||
/*pos =*/ pos_ptr,
|
||||
/*n_seq_id =*/ batch.n_seq_id + offset,
|
||||
/*seq_id =*/ batch.seq_id + offset,
|
||||
/*logits =*/ batch.logits + offset,
|
||||
};
|
||||
}
|
||||
};
|
||||
|
||||
// Helper function for decoding an image whose embeddings have already been calculated
|
||||
int32_t mtmd_helper_decode_image_chunk(
|
||||
mtmd_context * ctx,
|
||||
struct llama_context * lctx,
|
||||
const mtmd_input_chunk * chunk,
|
||||
float * encoded_embd,
|
||||
llama_pos n_past,
|
||||
llama_seq_id seq_id,
|
||||
int32_t n_batch,
|
||||
llama_pos * new_n_past) {
|
||||
if (mtmd_input_chunk_get_type(chunk) != MTMD_INPUT_CHUNK_TYPE_IMAGE) {
|
||||
LOG_ERR("failed to decode image chunk: input chunk not of image type\n");
|
||||
return -1;
|
||||
}
|
||||
const auto image_tokens = mtmd_input_chunk_get_tokens_image(chunk);
|
||||
if (!image_tokens) {
|
||||
LOG_ERR("failed to decode image chunk: image tokens are null\n");
|
||||
return -1;
|
||||
}
|
||||
|
||||
int n_mmproj_embd = clip_n_mmproj_embd(ctx->ctx_clip);
|
||||
int n_pos_per_embd = mtmd_decode_use_mrope(ctx) ? 4 : 1;
|
||||
|
||||
int32_t n_tokens = mtmd_image_tokens_get_n_tokens(image_tokens);
|
||||
int32_t i_batch = 0;
|
||||
int32_t n_img_batches = GGML_PAD(n_tokens, n_batch) / n_batch;
|
||||
decode_embd_batch batch_embd(encoded_embd, n_tokens, n_pos_per_embd, n_mmproj_embd);
|
||||
|
||||
const int nx = mtmd_image_tokens_get_nx(image_tokens);
|
||||
const int ny = mtmd_image_tokens_get_ny(image_tokens);
|
||||
|
||||
if (mtmd_decode_use_mrope(ctx)) {
|
||||
batch_embd.set_position_mrope(n_past, nx, ny, seq_id);
|
||||
} else {
|
||||
batch_embd.set_position_normal(n_past, seq_id);
|
||||
}
|
||||
|
||||
if (mtmd_decode_use_non_causal(ctx)) {
|
||||
llama_set_causal_attn(lctx, false);
|
||||
// TODO @ngxson : need to make sure only one image is processed at a time, and n_ubatch must be enough to hold the image
|
||||
}
|
||||
|
||||
while (i_batch < n_img_batches) { // split into batches
|
||||
int pos_offset = i_batch*n_batch;
|
||||
int n_tokens_batch = std::min(n_batch, n_tokens - pos_offset);
|
||||
llama_batch batch_embd_view = batch_embd.get_view(pos_offset, n_tokens_batch);
|
||||
|
||||
LOG_INF("decoding image batch %d/%d, n_tokens_batch = %d\n", i_batch+1, n_img_batches, n_tokens_batch);
|
||||
|
||||
int64_t t1 = ggml_time_ms();
|
||||
int32_t ret = llama_decode(lctx, batch_embd_view);
|
||||
if (ret != 0) {
|
||||
LOG_ERR("failed to decode image\n");
|
||||
llama_set_causal_attn(lctx, true); // restore causal attn
|
||||
return ret;
|
||||
}
|
||||
|
||||
if (ctx->print_timings) {
|
||||
LOG_INF("image decoded (batch %d/%d) in %" PRId64 " ms\n", i_batch+1, n_img_batches, ggml_time_ms() - t1);
|
||||
}
|
||||
|
||||
i_batch++;
|
||||
}
|
||||
|
||||
n_past += mtmd_image_tokens_get_n_pos(image_tokens);
|
||||
*new_n_past = n_past;
|
||||
|
||||
if (mtmd_decode_use_non_causal(ctx)) {
|
||||
llama_set_causal_attn(lctx, true);
|
||||
}
|
||||
return 0;
|
||||
void mtmd_image_tokens_deleter::operator()(mtmd_image_tokens * val) {
|
||||
mtmd_image_tokens_free(val);
|
||||
}
|
||||
|
||||
int32_t mtmd_helper_eval_chunk_single(mtmd_context * ctx,
|
||||
struct llama_context * lctx,
|
||||
const mtmd_input_chunk * chunk,
|
||||
llama_pos n_past,
|
||||
llama_seq_id seq_id,
|
||||
int32_t n_batch,
|
||||
bool logits_last,
|
||||
llama_pos * new_n_past) {
|
||||
int32_t ret;
|
||||
llama_batch text_batch = llama_batch_init(n_batch, 0, 1);
|
||||
auto chunk_type = mtmd_input_chunk_get_type(chunk);
|
||||
|
||||
if (chunk_type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
|
||||
size_t n_tokens;
|
||||
const auto tokens = mtmd_input_chunk_get_tokens_text(chunk, &n_tokens);
|
||||
LOG_DBG("decoding text chunk, n_tokens = %zu\n", n_tokens);
|
||||
size_t i = 0;
|
||||
while (i < n_tokens) { // split into batches
|
||||
text_batch.n_tokens = 0; // clear the batch
|
||||
for (; i < n_tokens && text_batch.n_tokens < n_batch; i++) {
|
||||
text_batch.n_tokens++;
|
||||
text_batch.token [i] = tokens[i];
|
||||
text_batch.pos [i] = n_past++;
|
||||
text_batch.n_seq_id[i] = 1;
|
||||
text_batch.seq_id [i][0] = seq_id;
|
||||
text_batch.logits [i] = false;
|
||||
}
|
||||
bool is_last_token = (i == n_tokens);
|
||||
if (logits_last && is_last_token) {
|
||||
text_batch.logits[text_batch.n_tokens - 1] = true;
|
||||
}
|
||||
ret = llama_decode(lctx, text_batch);
|
||||
if (ret != 0) {
|
||||
LOG_ERR("failed to decode text\n");
|
||||
llama_batch_free(text_batch);
|
||||
return ret;
|
||||
}
|
||||
*new_n_past += text_batch.n_tokens;
|
||||
}
|
||||
|
||||
} else if (chunk_type == MTMD_INPUT_CHUNK_TYPE_IMAGE) {
|
||||
const auto image_tokens = mtmd_input_chunk_get_tokens_image(chunk);
|
||||
int64_t t0 = ggml_time_ms();
|
||||
if (ctx->print_timings) {
|
||||
LOG_INF("encoding image or slice...\n");
|
||||
}
|
||||
ret = mtmd_encode(ctx, image_tokens);
|
||||
if (ret != 0) {
|
||||
LOG_ERR("failed to encode image\n");
|
||||
llama_batch_free(text_batch);
|
||||
return ret;
|
||||
}
|
||||
if (ctx->print_timings) {
|
||||
LOG_INF("image/slice encoded in %" PRId64 " ms\n", ggml_time_ms() - t0);
|
||||
}
|
||||
float * embd = mtmd_get_output_embd(ctx);
|
||||
ret = mtmd_helper_decode_image_chunk(ctx, lctx, chunk, embd, n_past, seq_id, n_batch, new_n_past);
|
||||
if (ret != 0) {
|
||||
LOG_ERR("failed to decode image\n");
|
||||
llama_batch_free(text_batch);
|
||||
return ret;
|
||||
}
|
||||
} else {
|
||||
GGML_ABORT("chunk type not supported");
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int32_t mtmd_helper_eval_chunks(mtmd_context * ctx,
|
||||
struct llama_context * lctx,
|
||||
const mtmd_input_chunks * chunks,
|
||||
llama_pos n_past,
|
||||
llama_seq_id seq_id,
|
||||
int32_t n_batch,
|
||||
bool logits_last,
|
||||
llama_pos * new_n_past) {
|
||||
size_t n_chunks = mtmd_input_chunks_size(chunks);
|
||||
if (n_chunks == 0) {
|
||||
LOG_WRN("no chunks to eval\n");
|
||||
return 0;
|
||||
}
|
||||
|
||||
for (size_t i = 0; i < n_chunks; i++) {
|
||||
bool chunk_logits_last = (i == n_chunks - 1) && logits_last;
|
||||
auto chunk = mtmd_input_chunks_get(chunks, i);
|
||||
|
||||
int32_t res = mtmd_helper_eval_chunk_single(ctx, lctx, chunk, n_past, seq_id, n_batch, chunk_logits_last, &n_past);
|
||||
if (res != 0) {
|
||||
LOG_ERR("failed to eval chunk %zu\n", i);
|
||||
return res;
|
||||
}
|
||||
*new_n_past = n_past;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
// these 2 helpers below use internal clip_image_u8_ptr,
|
||||
// so unfortunately they cannot moved to mtmd-helper.h
|
||||
// however, in theory, user can decode image file to bitmap using
|
||||
// whichever library they want, and then use mtmd_bitmap_init() to create bitmap
|
||||
|
||||
mtmd_bitmap * mtmd_helper_bitmap_init_from_buf(const unsigned char * buf, size_t len) {
|
||||
clip_image_u8_ptr img_u8(clip_image_u8_init());
|
||||
@@ -780,23 +506,6 @@ mtmd_bitmap * mtmd_helper_bitmap_init_from_file(const char * fname) {
|
||||
return mtmd_bitmap_init(nx, ny, data);
|
||||
}
|
||||
|
||||
bool mtmd_decode_use_non_causal(mtmd_context * ctx) {
|
||||
projector_type proj_type = clip_get_projector_type(ctx->ctx_clip);
|
||||
if (proj_type == PROJECTOR_TYPE_GEMMA3) {
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
bool mtmd_decode_use_mrope(mtmd_context * ctx) {
|
||||
return ctx->use_mrope;
|
||||
}
|
||||
|
||||
void mtmd_image_tokens_deleter::operator()(mtmd_image_tokens * val) {
|
||||
mtmd_image_tokens_free(val);
|
||||
}
|
||||
|
||||
|
||||
//
|
||||
// public API functions
|
||||
//
|
||||
|
||||
@@ -10,6 +10,7 @@
|
||||
#include <stdbool.h>
|
||||
|
||||
#ifdef __cplusplus
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <cinttypes>
|
||||
#include <memory>
|
||||
|
||||
@@ -40,7 +40,6 @@ add_test "llama-mtmd-cli" "ggml-org/SmolVLM-500M-Instruct-GGUF:Q8_0"
|
||||
add_test "llama-mtmd-cli" "ggml-org/SmolVLM2-2.2B-Instruct-GGUF:Q4_K_M"
|
||||
add_test "llama-mtmd-cli" "ggml-org/SmolVLM2-500M-Video-Instruct-GGUF:Q8_0"
|
||||
add_test "llama-mtmd-cli" "ggml-org/gemma-3-4b-it-GGUF:Q4_K_M"
|
||||
add_test "llama-mtmd-cli" "guinmoon/MobileVLM-3B-GGUF:Q4_K_M" "deepseek"
|
||||
add_test "llama-mtmd-cli" "THUDM/glm-edge-v-5b-gguf:Q4_K_M"
|
||||
add_test "llama-mtmd-cli" "second-state/Llava-v1.5-7B-GGUF:Q2_K" "vicuna"
|
||||
add_test "llama-mtmd-cli" "cjpais/llava-1.6-mistral-7b-gguf:Q3_K" "vicuna"
|
||||
@@ -50,6 +49,8 @@ add_test "llama-mtmd-cli" "openbmb/MiniCPM-V-2_6-gguf:Q2_K"
|
||||
add_test "llama-mtmd-cli" "openbmb/MiniCPM-o-2_6-gguf:Q4_0"
|
||||
add_test "llama-mtmd-cli" "bartowski/Qwen2-VL-2B-Instruct-GGUF:Q4_K_M"
|
||||
add_test "llama-mtmd-cli" "ggml-org/Qwen2.5-VL-3B-Instruct-GGUF:Q4_K_M"
|
||||
add_test "llama-mtmd-cli" "ggml-org/InternVL2_5-1B-GGUF:Q8_0"
|
||||
add_test "llama-mtmd-cli" "ggml-org/InternVL3-1B-Instruct-GGUF:Q8_0"
|
||||
|
||||
# to test the big models, run: ./tests.sh big
|
||||
if [ "$RUN_BIG_TESTS" = true ]; then
|
||||
@@ -59,6 +60,8 @@ if [ "$RUN_BIG_TESTS" = true ]; then
|
||||
add_test "llama-mtmd-cli" "ggml-org/Qwen2-VL-7B-Instruct-GGUF:Q4_K_M"
|
||||
add_test "llama-mtmd-cli" "ggml-org/Qwen2.5-VL-3B-Instruct-GGUF:Q4_K_M"
|
||||
add_test "llama-mtmd-cli" "ggml-org/Qwen2.5-VL-7B-Instruct-GGUF:Q4_K_M"
|
||||
add_test "llama-mtmd-cli" "ggml-org/InternVL3-8B-Instruct-GGUF:Q4_K_M"
|
||||
add_test "llama-mtmd-cli" "ggml-org/InternVL3-14B-Instruct-GGUF:Q4_K_M"
|
||||
# add_test "llama-mtmd-cli" "ggml-org/Qwen2.5-VL-32B-Instruct-GGUF:Q4_K_M" # does not work on my mac M3 Ultra
|
||||
# add_test "llama-mtmd-cli" "ggml-org/Qwen2.5-VL-72B-Instruct-GGUF:Q4_K_M" # too big
|
||||
fi
|
||||
@@ -70,6 +73,7 @@ fi
|
||||
|
||||
# this model has broken chat template, not usable
|
||||
# add_test "llama-mtmd-cli" "cmp-nct/Yi-VL-6B-GGUF:Q5_K"
|
||||
# add_test "llama-mtmd-cli" "guinmoon/MobileVLM-3B-GGUF:Q4_K_M" "deepseek"
|
||||
|
||||
###############
|
||||
|
||||
|
||||
@@ -7,13 +7,15 @@ Set of LLM REST APIs and a simple web front end to interact with llama.cpp.
|
||||
**Features:**
|
||||
* LLM inference of F16 and quantized models on GPU and CPU
|
||||
* [OpenAI API](https://github.com/openai/openai-openapi) compatible chat completions and embeddings routes
|
||||
* Reranking endoint (WIP: https://github.com/ggml-org/llama.cpp/pull/9510)
|
||||
* Reranking endoint (https://github.com/ggml-org/llama.cpp/pull/9510)
|
||||
* Parallel decoding with multi-user support
|
||||
* Continuous batching
|
||||
* Multimodal (wip)
|
||||
* Multimodal ([documentation](../../docs/multimodal.md)) / with OpenAI-compatible API support
|
||||
* Monitoring endpoints
|
||||
* Schema-constrained JSON response format
|
||||
* [Function calling](../../docs/function-calling.md) / tool use for ~any model
|
||||
* Speculative decoding
|
||||
* Easy-to-use web UI
|
||||
|
||||
The project is under active development, and we are [looking for feedback and contributors](https://github.com/ggml-org/llama.cpp/issues/4216).
|
||||
|
||||
@@ -27,6 +29,7 @@ The project is under active development, and we are [looking for feedback and co
|
||||
| -------- | ----------- |
|
||||
| `-h, --help, --usage` | print usage and exit |
|
||||
| `--version` | show version and build info |
|
||||
| `--completion-bash` | print source-able bash completion script for llama.cpp |
|
||||
| `--verbose-prompt` | print a verbose prompt before generation (default: false) |
|
||||
| `-t, --threads N` | number of threads to use during generation (default: -1)<br/>(env: LLAMA_ARG_THREADS) |
|
||||
| `-tb, --threads-batch N` | number of threads to use during batch and prompt processing (default: same as --threads) |
|
||||
@@ -41,7 +44,7 @@ The project is under active development, and we are [looking for feedback and co
|
||||
| `--prio-batch N` | set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: 0)<br/> |
|
||||
| `--poll-batch <0\|1>` | use polling to wait for work (default: same as --poll) |
|
||||
| `-c, --ctx-size N` | size of the prompt context (default: 4096, 0 = loaded from model)<br/>(env: LLAMA_ARG_CTX_SIZE) |
|
||||
| `-n, --predict, --n-predict N` | number of tokens to predict (default: -1, -1 = infinity, -2 = until context filled)<br/>(env: LLAMA_ARG_N_PREDICT) |
|
||||
| `-n, --predict, --n-predict N` | number of tokens to predict (default: -1, -1 = infinity)<br/>(env: LLAMA_ARG_N_PREDICT) |
|
||||
| `-b, --batch-size N` | logical maximum batch size (default: 2048)<br/>(env: LLAMA_ARG_BATCH) |
|
||||
| `-ub, --ubatch-size N` | physical maximum batch size (default: 512)<br/>(env: LLAMA_ARG_UBATCH) |
|
||||
| `--keep N` | number of tokens to keep from the initial prompt (default: 0, -1 = all) |
|
||||
@@ -69,6 +72,7 @@ The project is under active development, and we are [looking for feedback and co
|
||||
| `--numa TYPE` | attempt optimizations that help on some NUMA systems<br/>- distribute: spread execution evenly over all nodes<br/>- isolate: only spawn threads on CPUs on the node that execution started on<br/>- numactl: use the CPU map provided by numactl<br/>if run without this previously, it is recommended to drop the system page cache before using this<br/>see https://github.com/ggml-org/llama.cpp/issues/1437<br/>(env: LLAMA_ARG_NUMA) |
|
||||
| `-dev, --device <dev1,dev2,..>` | comma-separated list of devices to use for offloading (none = don't offload)<br/>use --list-devices to see a list of available devices<br/>(env: LLAMA_ARG_DEVICE) |
|
||||
| `--list-devices` | print list of available devices and exit |
|
||||
| `--override-tensor, -ot <tensor name pattern>=<buffer type>,...` | override tensor buffer type |
|
||||
| `-ngl, --gpu-layers, --n-gpu-layers N` | number of layers to store in VRAM<br/>(env: LLAMA_ARG_N_GPU_LAYERS) |
|
||||
| `-sm, --split-mode {none,layer,row}` | how to split the model across multiple GPUs, one of:<br/>- none: use one GPU only<br/>- layer (default): split layers and KV across GPUs<br/>- row: split rows across GPUs<br/>(env: LLAMA_ARG_SPLIT_MODE) |
|
||||
| `-ts, --tensor-split N0,N1,N2,...` | fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1<br/>(env: LLAMA_ARG_TENSOR_SPLIT) |
|
||||
@@ -82,15 +86,18 @@ The project is under active development, and we are [looking for feedback and co
|
||||
| `--control-vector-layer-range START END` | layer range to apply the control vector(s) to, start and end inclusive |
|
||||
| `-m, --model FNAME` | model path (default: `models/$filename` with filename from `--hf-file` or `--model-url` if set, otherwise models/7B/ggml-model-f16.gguf)<br/>(env: LLAMA_ARG_MODEL) |
|
||||
| `-mu, --model-url MODEL_URL` | model download url (default: unused)<br/>(env: LLAMA_ARG_MODEL_URL) |
|
||||
| `-hfr, --hf-repo REPO` | Hugging Face model repository (default: unused)<br/>(env: LLAMA_ARG_HF_REPO) |
|
||||
| `-hff, --hf-file FILE` | Hugging Face model file (default: unused)<br/>(env: LLAMA_ARG_HF_FILE) |
|
||||
| `-hf, -hfr, --hf-repo <user>/<model>[:quant]` | Hugging Face model repository; quant is optional, case-insensitive, default to Q4_K_M, or falls back to the first file in the repo if Q4_K_M doesn't exist.<br/>mmproj is also downloaded automatically if available. to disable, add --no-mmproj<br/>example: unsloth/phi-4-GGUF:q4_k_m<br/>(default: unused)<br/>(env: LLAMA_ARG_HF_REPO) |
|
||||
| `-hfd, -hfrd, --hf-repo-draft <user>/<model>[:quant]` | Same as --hf-repo, but for the draft model (default: unused)<br/>(env: LLAMA_ARG_HFD_REPO) |
|
||||
| `-hff, --hf-file FILE` | Hugging Face model file. If specified, it will override the quant in --hf-repo (default: unused)<br/>(env: LLAMA_ARG_HF_FILE) |
|
||||
| `-hfv, -hfrv, --hf-repo-v <user>/<model>[:quant]` | Hugging Face model repository for the vocoder model (default: unused)<br/>(env: LLAMA_ARG_HF_REPO_V) |
|
||||
| `-hffv, --hf-file-v FILE` | Hugging Face model file for the vocoder model (default: unused)<br/>(env: LLAMA_ARG_HF_FILE_V) |
|
||||
| `-hft, --hf-token TOKEN` | Hugging Face access token (default: value from HF_TOKEN environment variable)<br/>(env: HF_TOKEN) |
|
||||
| `--log-disable` | Log disable |
|
||||
| `--log-file FNAME` | Log to file |
|
||||
| `--log-colors` | Enable colored logging<br/>(env: LLAMA_LOG_COLORS) |
|
||||
| `-v, --verbose, --log-verbose` | Set verbosity level to infinity (i.e. log all messages, useful for debugging) |
|
||||
| `-lv, --verbosity, --log-verbosity N` | Set the verbosity threshold. Messages with a higher verbosity will be ignored.<br/>(env: LLAMA_LOG_VERBOSITY) |
|
||||
| `--log-prefix` | Enable prefx in log messages<br/>(env: LLAMA_LOG_PREFIX) |
|
||||
| `--log-prefix` | Enable prefix in log messages<br/>(env: LLAMA_LOG_PREFIX) |
|
||||
| `--log-timestamps` | Enable timestamps in log messages<br/>(env: LLAMA_LOG_TIMESTAMPS) |
|
||||
|
||||
|
||||
@@ -98,9 +105,9 @@ The project is under active development, and we are [looking for feedback and co
|
||||
|
||||
| Argument | Explanation |
|
||||
| -------- | ----------- |
|
||||
| `--samplers SAMPLERS` | samplers that will be used for generation in the order, separated by ';'<br/>(default: dry;top_k;typ_p;top_p;min_p;xtc;temperature) |
|
||||
| `--samplers SAMPLERS` | samplers that will be used for generation in the order, separated by ';'<br/>(default: penalties;dry;top_n_sigma;top_k;typ_p;top_p;min_p;xtc;temperature) |
|
||||
| `-s, --seed SEED` | RNG seed (default: -1, use random seed for -1) |
|
||||
| `--sampling-seq SEQUENCE` | simplified sequence for samplers that will be used (default: dkypmxt) |
|
||||
| `--sampling-seq, --sampler-seq SEQUENCE` | simplified sequence for samplers that will be used (default: edskypmxt) |
|
||||
| `--ignore-eos` | ignore end of stream token and continue generating (implies --logit-bias EOS-inf) |
|
||||
| `--temp N` | temperature (default: 0.8) |
|
||||
| `--top-k N` | top-k sampling (default: 40, 0 = disabled) |
|
||||
@@ -127,22 +134,26 @@ The project is under active development, and we are [looking for feedback and co
|
||||
| `--grammar GRAMMAR` | BNF-like grammar to constrain generations (see samples in grammars/ dir) (default: '') |
|
||||
| `--grammar-file FNAME` | file to read grammar from |
|
||||
| `-j, --json-schema SCHEMA` | JSON schema to constrain generations (https://json-schema.org/), e.g. `{}` for any JSON object<br/>For schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead |
|
||||
| `--jinja` | Enable experimental Jinja templating engine (required for tool use) |
|
||||
| `--reasoning-format FORMAT` | Controls extraction of model thinking traces and the format / field in which they are returned (default: `deepseek`; allowed values: `deepseek`, `none`; requires `--jinja`). `none` will leave thinking traces inline in `message.content` in a model-specific format, while `deepseek` will return them separately under `message.reasoning_content` |
|
||||
| `-jf, --json-schema-file FILE` | File containing a JSON schema to constrain generations (https://json-schema.org/), e.g. `{}` for any JSON object<br/>For schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead |
|
||||
|
||||
|
||||
**Example-specific params**
|
||||
|
||||
| Argument | Explanation |
|
||||
| -------- | ----------- |
|
||||
| `--no-context-shift` | disables context shift on inifinite text generation (default: disabled)<br/>(env: LLAMA_ARG_NO_CONTEXT_SHIFT) |
|
||||
| `--no-context-shift` | disables context shift on infinite text generation (default: disabled)<br/>(env: LLAMA_ARG_NO_CONTEXT_SHIFT) |
|
||||
| `-sp, --special` | special tokens output enabled (default: false) |
|
||||
| `--no-warmup` | skip warming up the model with an empty run |
|
||||
| `--spm-infill` | use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this. (default: disabled) |
|
||||
| `--pooling {none,mean,cls,last,rank}` | pooling type for embeddings, use model default if unspecified<br/>(env: LLAMA_ARG_POOLING) |
|
||||
| `-cb, --cont-batching` | enable continuous batching (a.k.a dynamic batching) (default: enabled)<br/>(env: LLAMA_ARG_CONT_BATCHING) |
|
||||
| `-nocb, --no-cont-batching` | disable continuous batching<br/>(env: LLAMA_ARG_NO_CONT_BATCHING) |
|
||||
| `--mmproj FILE` | path to a multimodal projector file. see tools/mtmd/README.md<br/>note: if -hf is used, this argument can be omitted<br/>(env: LLAMA_ARG_MMPROJ) |
|
||||
| `--mmproj-url URL` | URL to a multimodal projector file. see tools/mtmd/README.md<br/>(env: LLAMA_ARG_MMPROJ_URL) |
|
||||
| `--no-mmproj` | explicitly disable multimodal projector, useful when using -hf<br/>(env: LLAMA_ARG_NO_MMPROJ) |
|
||||
| `--no-mmproj-offload` | do not offload multimodal projector to GPU<br/>(env: LLAMA_ARG_NO_MMPROJ_OFFLOAD) |
|
||||
| `-a, --alias STRING` | set alias for model name (to be used by REST API)<br/>(env: LLAMA_ARG_ALIAS) |
|
||||
| `--host HOST` | ip address to listen (default: 127.0.0.1)<br/>(env: LLAMA_ARG_HOST) |
|
||||
| `--host HOST` | ip address to listen, or bind to an UNIX socket if the address ends with .sock (default: 127.0.0.1)<br/>(env: LLAMA_ARG_HOST) |
|
||||
| `--port PORT` | port to listen (default: 8080)<br/>(env: LLAMA_ARG_PORT) |
|
||||
| `--path PATH` | path to serve static files from (default: )<br/>(env: LLAMA_ARG_STATIC_PATH) |
|
||||
| `--no-webui` | Disable the Web UI (default: enabled)<br/>(env: LLAMA_ARG_NO_WEBUI) |
|
||||
@@ -160,16 +171,29 @@ The project is under active development, and we are [looking for feedback and co
|
||||
| `--props` | enable changing global properties via POST /props (default: disabled)<br/>(env: LLAMA_ARG_ENDPOINT_PROPS) |
|
||||
| `--no-slots` | disables slots monitoring endpoint<br/>(env: LLAMA_ARG_NO_ENDPOINT_SLOTS) |
|
||||
| `--slot-save-path PATH` | path to save slot kv cache (default: disabled) |
|
||||
| `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>list of built-in templates:<br/>chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, exaone3, gemma, granite, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, monarch, openchat, orion, phi3, rwkv-world, vicuna, vicuna-orca, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE) |
|
||||
| `--jinja` | use jinja template for chat (default: disabled)<br/>(env: LLAMA_ARG_JINJA) |
|
||||
| `--reasoning-format FORMAT` | reasoning format (default: deepseek; allowed values: deepseek, none)<br/>controls whether thought tags are extracted from the response, and in which format they're returned. 'none' leaves thoughts unparsed in `message.content`, 'deepseek' puts them in `message.reasoning_content` (for DeepSeek R1 & Command R7B only).<br/>only supported for non-streamed responses<br/>(env: LLAMA_ARG_THINK) |
|
||||
| `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone3, falcon3, gemma, gigachat, glmedge, granite, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, phi3, phi4, rwkv-world, smolvlm, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE) |
|
||||
| `--chat-template-file JINJA_TEMPLATE_FILE` | set custom jinja chat template file (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone3, falcon3, gemma, gigachat, glmedge, granite, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, phi3, phi4, rwkv-world, smolvlm, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE_FILE) |
|
||||
| `-sps, --slot-prompt-similarity SIMILARITY` | how much the prompt of a request must match the prompt of a slot in order to use that slot (default: 0.50, 0.0 = disabled)<br/> |
|
||||
| `--lora-init-without-apply` | load LoRA adapters without applying them (apply later via POST /lora-adapters) (default: disabled) |
|
||||
| `--draft-max, --draft, --draft-n N` | number of tokens to draft for speculative decoding (default: 16)<br/>(env: LLAMA_ARG_DRAFT_MAX) |
|
||||
| `--draft-min, --draft-n-min N` | minimum number of draft tokens to use for speculative decoding (default: 5)<br/>(env: LLAMA_ARG_DRAFT_MIN) |
|
||||
| `--draft-p-min P` | minimum speculative decoding probability (greedy) (default: 0.9)<br/>(env: LLAMA_ARG_DRAFT_P_MIN) |
|
||||
| `--draft-min, --draft-n-min N` | minimum number of draft tokens to use for speculative decoding (default: 0)<br/>(env: LLAMA_ARG_DRAFT_MIN) |
|
||||
| `--draft-p-min P` | minimum speculative decoding probability (greedy) (default: 0.8)<br/>(env: LLAMA_ARG_DRAFT_P_MIN) |
|
||||
| `-cd, --ctx-size-draft N` | size of the prompt context for the draft model (default: 0, 0 = loaded from model)<br/>(env: LLAMA_ARG_CTX_SIZE_DRAFT) |
|
||||
| `-devd, --device-draft <dev1,dev2,..>` | comma-separated list of devices to use for offloading the draft model (none = don't offload)<br/>use --list-devices to see a list of available devices |
|
||||
| `-ngld, --gpu-layers-draft, --n-gpu-layers-draft N` | number of layers to store in VRAM for the draft model<br/>(env: LLAMA_ARG_N_GPU_LAYERS_DRAFT) |
|
||||
| `-md, --model-draft FNAME` | draft model for speculative decoding (default: unused)<br/>(env: LLAMA_ARG_MODEL_DRAFT) |
|
||||
| `-mv, --model-vocoder FNAME` | vocoder model for audio generation (default: unused) |
|
||||
| `--tts-use-guide-tokens` | Use guide tokens to improve TTS word recall |
|
||||
| `--embd-bge-small-en-default` | use default bge-small-en-v1.5 model (note: can download weights from the internet) |
|
||||
| `--embd-e5-small-en-default` | use default e5-small-v2 model (note: can download weights from the internet) |
|
||||
| `--embd-gte-small-default` | use default gte-small model (note: can download weights from the internet) |
|
||||
| `--fim-qwen-1.5b-default` | use default Qwen 2.5 Coder 1.5B (note: can download weights from the internet) |
|
||||
| `--fim-qwen-3b-default` | use default Qwen 2.5 Coder 3B (note: can download weights from the internet) |
|
||||
| `--fim-qwen-7b-default` | use default Qwen 2.5 Coder 7B (note: can download weights from the internet) |
|
||||
| `--fim-qwen-7b-spec` | use Qwen 2.5 Coder 7B + 0.5B draft for speculative decoding (note: can download weights from the internet) |
|
||||
| `--fim-qwen-14b-spec` | use Qwen 2.5 Coder 14B + 0.5B draft for speculative decoding (note: can download weights from the internet) |
|
||||
|
||||
|
||||
Note: If both command line argument and environment variable are both set for the same param, the argument will take precedence over env var.
|
||||
|
||||
Reference in New Issue
Block a user