Compare commits

...

9 Commits

Author SHA1 Message Date
Xuan Son Nguyen
b031b6062d nits 2 2026-06-06 18:25:05 +02:00
Xuan Son Nguyen
9819ad4317 nits 2026-06-06 18:23:06 +02:00
Xuan Son Nguyen
82b48212fd fix llava-uhd case 2026-06-06 18:20:01 +02:00
Xuan Son Nguyen
a404c4ea0b revise the design 2026-06-06 17:44:23 +02:00
Xuan Son Nguyen
96e24ca55f Merge branch 'master' into mtmd-video-api 2026-06-06 13:38:12 +02:00
Xuan Son Nguyen
e5b3d6d1f2 Merge branch 'master' into mtmd-video-api 2026-06-05 18:37:15 +02:00
Xuan Son Nguyen
c5b682b25c various clean up 2026-04-13 17:39:14 +02:00
Xuan Son Nguyen
f558360b32 Merge branch 'master' into video-support 2026-04-13 15:40:05 +02:00
andrewmd5
573f2cf58e feat: add video support for Qwen3.5 2026-03-06 21:37:07 +09:00
10 changed files with 197 additions and 74 deletions

View File

@@ -37,6 +37,9 @@ struct clip_graph {
float kq_scale; // TODO: maybe move this to hparams
const clip_flash_attn_type flash_attn_type;
// TODO [QWEN_VIDEO]: improve this in the future
int n_batch = 1;
ggml_context_ptr ctx0_ptr;
ggml_context * ctx0;
ggml_cgraph * gf;

View File

@@ -480,10 +480,6 @@ struct clip_image_u8 {
buf[idx + 2] = rgb[2];
}
size_t n_pixels() const {
return (size_t) nx * (size_t) ny;
}
size_t n_elements() const {
return n_pixels() * 3;
}
@@ -492,10 +488,16 @@ struct clip_image_u8 {
std::vector<uint8_t> buf;
int nx = 0;
int ny = 0;
// Number of pixels described by the current dimensions (nx * ny).
// Each factor is converted to size_t before the multiplication, so the
// product is computed in size_t rather than in int.
size_t n_pixels() const {
    return static_cast<size_t>(nx) * static_cast<size_t>(ny);
}
};
// For images, buf.size() == nx*ny*3
// Memory layout: RGBRGBRGB...
// For seq, buf.size() == nx*ny*3*nt
// Memory layout: RGBRGB...RGBRGB... (nt times)
// For audio, only one channel is used, buf.size() == nx*ny
// nx will be n_frames and ny will be n_mel
struct clip_image_f32 {
@@ -544,10 +546,6 @@ struct clip_image_f32 {
}
}
size_t n_pixels() const {
return (size_t) nx_ * (size_t) ny_;
}
size_t n_elements() const {
return n_pixels() * 3;
}
@@ -580,6 +578,10 @@ struct clip_image_f32 {
std::vector<float> buf;
int nx_ = 0;
int ny_ = 0;
// Number of pixels described by the current dimensions (nx_ * ny_).
// Each factor is converted to size_t before the multiplication, so the
// product is computed in size_t rather than in int.
size_t n_pixels() const {
    return static_cast<size_t>(nx_) * static_cast<size_t>(ny_);
}
};
//
@@ -627,6 +629,7 @@ static void clip_log_internal(enum ggml_log_level level, const char * format, ..
va_end(args);
}
#define LOG_TRC(...) clip_log_internal(GGML_LOG_LEVEL_DEBUG, __VA_ARGS__)
#define LOG_DBG(...) clip_log_internal(GGML_LOG_LEVEL_DEBUG, __VA_ARGS__)
#define LOG_INF(...) clip_log_internal(GGML_LOG_LEVEL_INFO, __VA_ARGS__)
#define LOG_WRN(...) clip_log_internal(GGML_LOG_LEVEL_WARN, __VA_ARGS__)

View File

@@ -527,7 +527,7 @@ ggml_tensor * clip_graph::build_inp() {
}
ggml_tensor * clip_graph::build_inp_raw(int channels) {
ggml_tensor * inp_raw = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, img.nx(), img.ny(), channels);
ggml_tensor * inp_raw = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, img.nx(), img.ny(), channels, n_batch);
ggml_set_name(inp_raw, "inp_raw");
ggml_set_input(inp_raw);
return inp_raw;
@@ -848,8 +848,6 @@ ggml_tensor * clip_graph::build_patch_merge_permute(ggml_tensor * cur, int scale
}
static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32_batch & imgs) {
GGML_ASSERT(imgs.entries.size() == 1 && "n_batch > 1 is not supported");
const clip_image_f32 & img = *imgs.entries[0];
std::unique_ptr<clip_graph> builder;
@@ -1009,6 +1007,9 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
GGML_ABORT("missing cgraph builder");
}
// TODO [QWEN_VIDEO]: improve this in the future
builder->n_batch = imgs.entries.size();
return builder->build();
}
@@ -3479,12 +3480,15 @@ bool clip_image_encode(struct clip_ctx * ctx, const int n_threads, clip_image_f3
bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_image_f32_batch * imgs_c_ptr, float * vec) {
const clip_image_f32_batch & imgs = *imgs_c_ptr;
int batch_size = imgs.entries.size();
int n_batch_cur = imgs.entries.size();
// maximum supported batch size, usually == 2 for qwen-vl-based models
int n_batch_max = clip_model_n_batch_max(ctx);
// TODO @ngxson : implement batch size > 1 as a loop
// we don't need true batching support because the cgraph is going to be big anyway
if (batch_size != 1) {
return false; // only support batch size of 1
if (n_batch_cur > n_batch_max) {
return false;
}
// if buffers are not allocated, we need to do a warmup run to allocate them
@@ -3555,18 +3559,20 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
// └─────┘ │
// ──────┘ x B
for (size_t i = 0; i < imgs.entries.size(); i++) {
const int nx = imgs.entries[i]->nx();
const int ny = imgs.entries[i]->ny();
const int n = nx * ny;
// IMPORTANT: [QWEN_VIDEO] the batch dim is currently used for temporal dim in Qwen-VL models
// All entries must have the same spatial size (enforced by can_batch_with() during merging)
{
const int nx = imgs.entries[0]->nx();
const int ny = imgs.entries[0]->ny();
const int n = nx * ny;
for (int b = 0; b < batch_size; b++) {
for (int b = 0; b < n_batch_cur; b++) {
const auto & buf = imgs.entries[b]->get_ro_buf();
float * batch_entry = inp_raw.data() + b * (3*n);
for (int y = 0; y < ny; y++) {
for (int x = 0; x < nx; x++) {
size_t base_src = 3*(y * nx + x); // idx of the first channel
size_t base_dst = y * nx + x; // idx of the first channel
size_t base_src = 3*(y * nx + x);
size_t base_dst = y * nx + x;
batch_entry[ base_dst] = buf[base_src ];
batch_entry[1*n + base_dst] = buf[base_src + 1];
batch_entry[2*n + base_dst] = buf[base_src + 2];
@@ -4549,6 +4555,17 @@ bool clip_has_audio_encoder(const struct clip_ctx * ctx) {
return ctx->model.modality == CLIP_MODALITY_AUDIO;
}
// Maximum number of batch entries accepted for the projector type of `ctx`.
// Returns 2 for the Qwen2-VL / Qwen2.5-VL / Qwen3-VL projectors and 1 for
// every other projector type.
int clip_model_n_batch_max(const struct clip_ctx * ctx) {
    const auto proj = ctx->proj_type();
    const bool is_qwen_vl =
        proj == PROJECTOR_TYPE_QWEN2VL  ||
        proj == PROJECTOR_TYPE_QWEN25VL ||
        proj == PROJECTOR_TYPE_QWEN3VL;
    return is_qwen_vl ? 2 : 1;
}
//
// API used internally with mtmd
//

View File

@@ -20,6 +20,12 @@ struct clip_image_size {
bool operator==(const clip_image_size & other) const {
return width == other.width && height == other.height;
}
// Inequality, defined as the exact negation of operator== so the two
// comparisons can never disagree.
bool operator!=(const clip_image_size & other) const {
    const bool same = operator==(other);
    return !same;
}
// Returns width * height.
// NOTE(review): the result is returned as int; if width/height are int
// (presumably - confirm against the field declarations) the product can
// overflow for very large dimensions. Callers that need a wide value should
// widen the operands themselves instead of casting the result.
int area() const {
return width * height;
}
};
struct clip_image_f32;
@@ -101,6 +107,8 @@ bool clip_is_llava(const struct clip_ctx * ctx);
bool clip_has_vision_encoder(const struct clip_ctx * ctx);
bool clip_has_audio_encoder(const struct clip_ctx * ctx);
int clip_model_n_batch_max(const struct clip_ctx * ctx);
std::map<ggml_backend_dev_t, size_t> clip_get_mem_usage(const struct clip_ctx * ctx);
struct clip_cap {

View File

@@ -31,10 +31,11 @@ struct clip_graph_pixtral : clip_graph {
struct clip_graph_qwen2vl : clip_graph {
clip_graph_qwen2vl(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
ggml_cgraph * build() override;
ggml_tensor * build_inp_with_temporal_merge();
};
struct clip_graph_qwen3vl : clip_graph {
clip_graph_qwen3vl(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
struct clip_graph_qwen3vl : clip_graph_qwen2vl {
clip_graph_qwen3vl(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph_qwen2vl(ctx, img) {}
ggml_cgraph * build() override;
};

View File

@@ -1,5 +1,34 @@
#include "models.h"
// Builds the raw input tensor and applies the two patch-embedding
// convolutions, returning their sum.
//
//   n_batch == 1 (still image): both kernels are applied to the same input.
//   n_batch == 2 (video pair) : patch_embeddings_0 is applied to the first
//                               frame and patch_embeddings_1 to the second.
//
// Any other n_batch aborts. Both image dimensions must be a multiple of
// (patch_size * 2).
ggml_tensor * clip_graph_qwen2vl::build_inp_with_temporal_merge() {
    ggml_tensor * inp_raw = build_inp_raw();

    GGML_ASSERT(img.nx() % (patch_size * 2) == 0);
    GGML_ASSERT(img.ny() % (patch_size * 2) == 0);

    // by default (still image) both convolutions read the same tensor
    ggml_tensor * frame_0 = inp_raw;
    ggml_tensor * frame_1 = inp_raw;

    if (n_batch == 2) {
        // 2 frames input (video input): take one 3-channel view per frame
        const size_t nb1 = ggml_row_size(inp_raw->type, img.nx());
        const size_t nb2 = ggml_row_size(inp_raw->type, img.nx() * img.ny());

        frame_0 = ggml_view_3d(ctx0, inp_raw,
            img.nx(), img.ny(), 3, nb1, nb2, 0);
        frame_1 = ggml_view_3d(ctx0, inp_raw,
            img.nx(), img.ny(), 3, nb1, nb2,
            nb2 * 3); // move to the second frame
    } else if (n_batch != 1) {
        // GGML_ABORT never returns, so no path falls off the end of this function
        GGML_ABORT("unsupported n_batch = %d (only 1 or 2 frames are supported)", n_batch);
    }

    // build the two convolutions in a fixed order instead of as function
    // arguments, whose evaluation order is unspecified
    ggml_tensor * conv_0 = ggml_conv_2d(ctx0, model.patch_embeddings_0, frame_0, patch_size, patch_size, 0, 0, 1, 1);
    ggml_tensor * conv_1 = ggml_conv_2d(ctx0, model.patch_embeddings_1, frame_1, patch_size, patch_size, 0, 0, 1, 1);

    return ggml_add(ctx0, conv_0, conv_1);
}
ggml_cgraph * clip_graph_qwen2vl::build() {
GGML_ASSERT(model.patch_bias == nullptr);
GGML_ASSERT(model.class_embedding == nullptr);
@@ -16,17 +45,10 @@ ggml_cgraph * clip_graph_qwen2vl::build() {
int mrope_sections[4] = {d_head/4, d_head/4, d_head/4, d_head/4};
ggml_tensor * inp_raw = build_inp_raw();
ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
GGML_ASSERT(img.nx() % (patch_size * 2) == 0);
GGML_ASSERT(img.ny() % (patch_size * 2) == 0);
ggml_tensor * inp = build_inp_with_temporal_merge();
// second conv dimension
{
auto inp_1 = ggml_conv_2d(ctx0, model.patch_embeddings_1, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
inp = ggml_add(ctx0, inp, inp_1);
inp = ggml_permute(ctx0, inp, 1, 2, 0, 3); // [w, h, c, b] -> [c, w, h, b]
inp = ggml_cont_4d(
ctx0, inp,

View File

@@ -13,17 +13,10 @@ ggml_cgraph * clip_graph_qwen3vl::build() {
int mrope_sections[4] = {d_head/4, d_head/4, d_head/4, d_head/4};
ggml_tensor * inp_raw = build_inp_raw();
ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
ggml_tensor * inp = build_inp_with_temporal_merge();
GGML_ASSERT(img.nx() % (patch_size * 2) == 0);
GGML_ASSERT(img.ny() % (patch_size * 2) == 0);
// second conv dimension
// spatial merge
{
auto inp_1 = ggml_conv_2d(ctx0, model.patch_embeddings_1, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
inp = ggml_add(ctx0, inp, inp_1);
inp = ggml_permute(ctx0, inp, 1, 2, 0, 3); // [w, h, c, b] -> [c, w, h, b]
inp = ggml_cont_4d(
ctx0, inp,

View File

@@ -1116,7 +1116,7 @@ bool mtmd_image_preprocessor_deepseekocr::preprocess(const clip_image_u8 & img,
static constexpr int native_resolutions[] = { 1024 /* base */, 1280 /* large */ };
// TODO: support 512 (tiny) and 640 (small) once we have eval data for them
const int64_t orig_area = static_cast<int64_t>(img.n_pixels());
const int64_t orig_area = static_cast<int64_t>(img.get_size().area());
size_t mode_i = 0;
int64_t min_diff = std::numeric_limits<int64_t>::max();

View File

@@ -24,10 +24,11 @@
#include <climits>
#include <vector>
// represents raw image data, layout is RGBRGBRGB...
// length of data must be nx * ny * 3
// for still image data, layout is RGBRGBRGB...
// length of data must be nx * ny * 3 bytes
//
// for audio bitmap: nx = sample count, ny = 1, layout is F32 F32 F32 ...
// length of data must be nx * sizeof(float)
// length of data must be nx * sizeof(float) bytes
struct mtmd_bitmap {
uint32_t nx = 0;
uint32_t ny = 0;
@@ -35,7 +36,7 @@ struct mtmd_bitmap {
bool is_audio = false; // true if the bitmap is audio
mtmd_bitmap(const unsigned char * data, uint32_t nx, uint32_t ny)
: nx(nx), ny(ny) {
: nx(nx), ny(ny), is_audio(false) {
if (data) {
size_t data_size = (size_t)nx * ny * 3;
this->data.resize(data_size);
@@ -64,6 +65,11 @@ struct mtmd_bitmap {
return data.size();
}
// [QWEN_VIDEO] Two bitmaps can share a batch only when neither is audio
// and both have identical nx / ny.
bool can_batch_with(const mtmd_bitmap & other) const {
    if (is_audio || other.is_audio) {
        return false; // audio never takes part in frame batching
    }
    const bool same_width  = nx == other.nx;
    const bool same_height = ny == other.ny;
    return same_width && same_height;
}
private:
std::vector<unsigned char> data;
};
@@ -750,16 +756,55 @@ struct mtmd_tokenizer {
cur.entries.clear();
std::vector<std::string> parts = split_text(input_text, ctx->media_marker);
size_t i_bm = 0; // index of the current bitmap
// [QWEN_VIDEO] handle frame merging for models that support it (i.e. qwen-vl)
int n_merge_frames = 1;
if (ctx->ctx_v) {
n_merge_frames = clip_model_n_batch_max(ctx->ctx_v);
GGML_ASSERT(n_merge_frames <= 2 && "we only support merging maximum 2 images for now; open an issue if this model supports merging more");
}
std::vector<std::vector<const mtmd_bitmap *>> merged_bitmaps;
if (n_merge_frames > 1) {
    // Walk the markers in order and assign bitmaps to them. Two adjacent
    // markers whose bitmaps can be batched together are collapsed into a
    // single marker carrying both bitmaps.
    size_t i_bm_scan = 0; // index of the next bitmap not yet assigned to a marker
    for (size_t i = 0; i < parts.size(); ++i) {
        if (parts[i] != ctx->media_marker) {
            continue;
        }
        if (i_bm_scan >= bitmaps.size()) {
            // more markers than bitmaps: stop before indexing past the end of `bitmaps`
            LOG_ERR("%s: error: number of bitmaps (%zu) is less than the number of markers\n",
                    __func__, bitmaps.size());
            return 1;
        }
        const bool next_is_marker = i + 1 < parts.size() && parts[i + 1] == ctx->media_marker;
        if (next_is_marker && i_bm_scan + 1 < bitmaps.size()) {
            const mtmd_bitmap * bm_a = bitmaps[i_bm_scan];
            const mtmd_bitmap * bm_b = bitmaps[i_bm_scan + 1];
            if (bm_a->can_batch_with(*bm_b)) {
                LOG_DBG("%s: merging 2 frames at bitmap index %zu and %zu\n", __func__, i_bm_scan, i_bm_scan + 1);
                merged_bitmaps.push_back({bm_a, bm_b});
                parts.erase(parts.begin() + i + 1); // remove the second marker
                i_bm_scan += 2;
                continue;
            }
        }
        LOG_DBG("%s: no merging for bitmap index %zu\n", __func__, i_bm_scan);
        merged_bitmaps.push_back({bitmaps[i_bm_scan]});
        ++i_bm_scan;
    }
    // Every original marker consumed exactly one bitmap, so i_bm_scan is now
    // the original marker count. merged_bitmaps was built from the markers,
    // so leftover bitmaps must be rejected here; the later size checks
    // against merged_bitmaps cannot see them.
    if (i_bm_scan != bitmaps.size()) {
        LOG_ERR("%s: error: number of bitmaps (%zu) does not match number of markers (%zu)\n",
                __func__, bitmaps.size(), i_bm_scan);
        return 1;
    }
} else {
    // no frame merging: one bitmap per marker, count mismatches are caught below
    for (size_t i = 0; i < bitmaps.size(); ++i) {
        merged_bitmaps.push_back({bitmaps[i]});
    }
}
i_bm = 0;
for (auto & part : parts) {
if (part == ctx->media_marker) {
// this is a marker, we should add the next bitmap
if (i_bm >= bitmaps.size()) {
if (i_bm >= merged_bitmaps.size()) {
LOG_ERR("%s: error: number of bitmaps (%zu) does not match number of markers (%zu)\n",
__func__, bitmaps.size(), parts.size() - 1);
__func__, merged_bitmaps.size(), parts.size() - 1);
return 1;
}
const mtmd_bitmap * bitmap = bitmaps[i_bm++];
int32_t res = add_media(bitmap);
auto & bmps = merged_bitmaps[i_bm++];
int32_t res = add_media(bmps);
if (res != 0) {
return res;
}
@@ -794,9 +839,9 @@ struct mtmd_tokenizer {
}
}
if (i_bm != bitmaps.size()) {
if (i_bm != merged_bitmaps.size()) {
LOG_ERR("%s: error: number of bitmaps (%zu) does not match number of markers (%zu)\n",
__func__, bitmaps.size(), parts.size() - 1);
__func__, merged_bitmaps.size(), parts.size() - 1);
return 1;
}
@@ -835,8 +880,10 @@ struct mtmd_tokenizer {
}
}
int32_t add_media(const mtmd_bitmap * bitmap) {
if (!bitmap->is_audio) {
int32_t add_media(std::vector<const mtmd_bitmap *> & bitmaps) {
GGML_ASSERT(!bitmaps.empty());
if (!bitmaps[0]->is_audio) {
// handle image
if (!ctx->ctx_v) {
@@ -848,27 +895,44 @@ struct mtmd_tokenizer {
add_text(ctx->img_beg, true); // add image begin token
}
// sanity check
if (bitmap->nx <= 0 || bitmap->ny <= 0) {
LOG_ERR("%s: error: invalid bitmap dimensions: nx = %d, ny = %d\n",
__func__, bitmap->nx, bitmap->ny);
return 2;
}
GGML_ASSERT(ctx->image_preproc != nullptr);
// TODO @ngxson : this is quite hacky because preprocessor only support batch with one single element, that need to be fixed in the future (e.g. by changing the preprocessor interface always take single input)
// convert mtmd_bitmap to clip_image_u8
clip_image_u8_ptr img_u8(clip_image_u8_init());
img_u8->set_size(
{(int)bitmap->nx, (int)bitmap->ny},
bitmap->is_placeholder());
img_u8->cpy_buf(bitmap->get_ro_buf());
// preprocess image
clip_image_f32_batch batch_f32;
bool ok = ctx->image_preproc->preprocess(*img_u8, batch_f32);
if (!ok) {
LOG_ERR("Unable to preprocess image\n");
return 2;
for (const auto * bmp : bitmaps) {
// sanity check
GGML_ASSERT(!bmp->is_audio);
GGML_ASSERT(ctx->image_preproc != nullptr);
if (bmp->nx <= 0 || bmp->ny <= 0) {
LOG_ERR("%s: error: invalid bitmap dimensions: nx = %d, ny = %d\n",
__func__, bmp->nx, bmp->ny);
return 2;
}
// convert mtmd_bitmap to clip_image_u8
clip_image_u8_ptr img_u8(clip_image_u8_init());
img_u8->set_size(
{(int)bmp->nx, (int)bmp->ny},
bmp->is_placeholder());
img_u8->cpy_buf(bmp->get_ro_buf());
// preprocess image
clip_image_f32_batch tmp_batch;
bool ok = ctx->image_preproc->preprocess(*img_u8, tmp_batch);
if (!ok) {
LOG_ERR("Unable to preprocess image\n");
return 2;
}
// move entries and grid dimensions to the "global" batch_f32
for (auto & entry : tmp_batch.entries) {
batch_f32.entries.emplace_back(std::move(entry));
}
// for llava-uhd style, we need to handle grid too
// we don't care about overwriting these values for now because llava-uhd doesn't support batching anyway
batch_f32.grid_x = tmp_batch.grid_x;
batch_f32.grid_y = tmp_batch.grid_y;
}
// Annotate llava-next style tiles so clip_n_output_tokens accounts
@@ -896,11 +960,14 @@ struct mtmd_tokenizer {
|| ctx->slice_tmpl == MTMD_SLICE_TMPL_STEP3VL
|| (ctx->slice_tmpl == MTMD_SLICE_TMPL_LFM2 && has_tiling_grid)
) {
// [QWEN_VIDEO] we do not support "frame merging" for llava-uhd style, so no batching for now
GGML_ASSERT(bitmaps.size() == 1);
const int n_col = batch_f32.grid_x;
const int n_row = batch_f32.grid_y;
// split batch into chunks of single images
// NOTE: batch_f32 will be invalidated after this call
auto chunks = split_batch_to_chunk(std::move(batch_f32), bitmap->id);
auto chunks = split_batch_to_chunk(std::move(batch_f32), bitmaps[0]->id);
GGML_ASSERT(chunks.size() > 0);
auto ov_chunk = std::move(chunks.front());
@@ -954,6 +1021,10 @@ struct mtmd_tokenizer {
size_t n_tokens = 0;
for (const auto & e : batch_f32.entries) {
n_tokens += clip_n_output_tokens(ctx->ctx_v, e.get());
if (clip_model_n_batch_max(ctx->ctx_v) == 2) {
// [QWEN_VIDEO] pair input is merged to the same embd, so only count as one image
break;
}
}
mtmd_image_tokens_ptr image_tokens(new mtmd_image_tokens);
@@ -976,7 +1047,7 @@ struct mtmd_tokenizer {
GGML_ASSERT(n_tokens == (size_t)image_tokens->n_tokens());
}
image_tokens->batch_f32 = std::move(batch_f32);
image_tokens->id = bitmap->id; // optional
image_tokens->id = bitmaps[0]->id; // optional
LOG_DBG("image_tokens->nx = %d\n", image_tokens->nx);
LOG_DBG("image_tokens->ny = %d\n", image_tokens->ny);
@@ -1001,6 +1072,9 @@ struct mtmd_tokenizer {
} else {
// handle audio
GGML_ASSERT(bitmaps.size() == 1); // no batching support for now
auto & bitmap = bitmaps[0];
if (!ctx->ctx_a) {
LOG_ERR("%s: error: model does not support audio input\n", __func__);
return 2;

View File

@@ -133,6 +133,8 @@ MTMD_API int mtmd_get_audio_sample_rate(const mtmd_context * ctx);
// if bitmap is image:
// length of data must be nx * ny * 3
// the data is in RGBRGBRGB... format
// note: some video-capable models (e.g. qwen-vl) can merge consecutive bitmaps
// into one chunk; mtmd_tokenize() handles this automatically
// if bitmap is audio:
// length of data must be n_samples * sizeof(float)
// the data is in float format (PCM F32)