nits 2

nits
fix llava-uhd case
2026-06-09 20:42:57 +02:00 · 2026-06-06 18:25:05 +02:00 · 2026-06-06 18:23:06 +02:00 · 2026-06-06 18:20:01 +02:00 · 2026-06-06 17:44:23 +02:00 · 2026-06-06 13:38:12 +02:00
10 changed files with 197 additions and 74 deletions
--- a/tools/mtmd/clip-graph.h
+++ b/tools/mtmd/clip-graph.h
@@ -37,6 +37,9 @@ struct clip_graph {
    float kq_scale; // TODO: maybe move this to hparams
    const clip_flash_attn_type flash_attn_type;

+    // TODO [QWEN_VIDEO]: improve this in the future
+    int n_batch = 1;
+
    ggml_context_ptr ctx0_ptr;
    ggml_context * ctx0;
    ggml_cgraph * gf;
--- a/tools/mtmd/clip-impl.h
+++ b/tools/mtmd/clip-impl.h
@@ -480,10 +480,6 @@ struct clip_image_u8 {
        buf[idx + 2] = rgb[2];
    }

-    size_t n_pixels() const {
-        return (size_t) nx * (size_t) ny;
-    }
-
    size_t n_elements() const {
        return n_pixels() * 3;
    }
@@ -492,10 +488,16 @@ struct clip_image_u8 {
    std::vector<uint8_t> buf;
    int nx = 0;
    int ny = 0;
+
+    size_t n_pixels() const { // nx * ny, each factor widened to size_t before multiplying
+        return static_cast<size_t>(nx) * static_cast<size_t>(ny);
+    }
 };

 // For images, buf.size() == nx*ny*3
 //     Memory layout: RGBRGBRGB...
+// For seq, buf.size() == nx*ny*3*nt
+//     Memory layout: RGBRGB...RGBRGB... (nt times)
 // For audio, only one channel is used, buf.size() == nx*ny
 //     nx will be n_frames and ny will be n_mel
 struct clip_image_f32 {
@@ -544,10 +546,6 @@ struct clip_image_f32 {
        }
    }

-    size_t n_pixels() const {
-        return (size_t) nx_ * (size_t) ny_;
-    }
-
    size_t n_elements() const {
        return n_pixels() * 3;
    }
@@ -580,6 +578,10 @@ struct clip_image_f32 {
    std::vector<float> buf;
    int nx_ = 0;
    int ny_ = 0;
+
+    size_t n_pixels() const { // nx_ * ny_, each factor widened to size_t before multiplying
+        return static_cast<size_t>(nx_) * static_cast<size_t>(ny_);
+    }
 };

 //
@@ -627,6 +629,7 @@ static void clip_log_internal(enum ggml_log_level level, const char * format, ..
    va_end(args);
 }

+#define LOG_TRC(...) clip_log_internal(GGML_LOG_LEVEL_DEBUG, __VA_ARGS__) // trace messages share the DEBUG level with LOG_DBG
 #define LOG_DBG(...) clip_log_internal(GGML_LOG_LEVEL_DEBUG, __VA_ARGS__)
 #define LOG_INF(...) clip_log_internal(GGML_LOG_LEVEL_INFO,  __VA_ARGS__)
 #define LOG_WRN(...) clip_log_internal(GGML_LOG_LEVEL_WARN,  __VA_ARGS__)
--- a/tools/mtmd/clip.cpp
+++ b/tools/mtmd/clip.cpp
@@ -527,7 +527,7 @@ ggml_tensor * clip_graph::build_inp() {
 }

 ggml_tensor * clip_graph::build_inp_raw(int channels) {
-    ggml_tensor * inp_raw = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, img.nx(), img.ny(), channels);
+    ggml_tensor * inp_raw = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, img.nx(), img.ny(), channels, n_batch);
    ggml_set_name(inp_raw, "inp_raw");
    ggml_set_input(inp_raw);
    return inp_raw;
@@ -848,8 +848,6 @@ ggml_tensor * clip_graph::build_patch_merge_permute(ggml_tensor * cur, int scale
 }

 static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32_batch & imgs) {
-    GGML_ASSERT(imgs.entries.size() == 1 && "n_batch > 1 is not supported");
-
    const clip_image_f32 & img = *imgs.entries[0];
    std::unique_ptr<clip_graph> builder;

@@ -1009,6 +1007,9 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
            GGML_ABORT("missing cgraph builder");
    }

+    // TODO [QWEN_VIDEO]: improve this in the future
+    builder->n_batch = imgs.entries.size();
+
    return builder->build();
 }

@@ -3479,12 +3480,15 @@ bool clip_image_encode(struct clip_ctx * ctx, const int n_threads, clip_image_f3

 bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_image_f32_batch * imgs_c_ptr, float * vec) {
    const clip_image_f32_batch & imgs = *imgs_c_ptr;
-    int batch_size = imgs.entries.size();
+    int n_batch_cur = imgs.entries.size();
+
+    // maximum supported batch size, usually == 2 for qwen-vl-based models
+    int n_batch_max = clip_model_n_batch_max(ctx);

    // TODO @ngxson : implement batch size > 1 as a loop
    //                we don't need true batching support because the cgraph will gonna be big anyway
-    if (batch_size != 1) {
-        return false; // only support batch size of 1
+    if (n_batch_cur > n_batch_max) {
+        return false;
    }

    // if buffers are not allocated, we need to do a warmup run to allocate them
@@ -3555,18 +3559,20 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
        // └─────┘ │
        //   ──────┘ x B

-        for (size_t i = 0; i < imgs.entries.size(); i++) {
-            const int nx = imgs.entries[i]->nx();
-            const int ny = imgs.entries[i]->ny();
-            const int n = nx * ny;
+        // IMPORTANT: [QWEN_VIDEO] the batch dim is currently used for temporal dim in Qwen-VL models
+        // All entries must have the same spatial size (enforced by can_batch_with() during merging)
+        {
+            const int nx = imgs.entries[0]->nx();
+            const int ny = imgs.entries[0]->ny();
+            const int n  = nx * ny;

-            for (int b = 0; b < batch_size; b++) {
+            for (int b = 0; b < n_batch_cur; b++) {
                const auto & buf = imgs.entries[b]->get_ro_buf();
                float * batch_entry = inp_raw.data() + b * (3*n);
                for (int y = 0; y < ny; y++) {
                    for (int x = 0; x < nx; x++) {
-                        size_t base_src = 3*(y * nx + x); // idx of the first channel
-                        size_t base_dst =    y * nx + x;  // idx of the first channel
+                        size_t base_src = 3*(y * nx + x);
+                        size_t base_dst =    y * nx + x;
                        batch_entry[      base_dst] = buf[base_src    ];
                        batch_entry[1*n + base_dst] = buf[base_src + 1];
                        batch_entry[2*n + base_dst] = buf[base_src + 2];
@@ -4549,6 +4555,17 @@ bool clip_has_audio_encoder(const struct clip_ctx * ctx) {
    return ctx->model.modality == CLIP_MODALITY_AUDIO;
 }

+int clip_model_n_batch_max(const struct clip_ctx * ctx) {
+    // the qwen-vl projectors accept up to 2 entries per batch;
+    // every other projector is limited to a single entry
+    const auto proj = ctx->proj_type();
+    const bool is_qwen_vl =
+        proj == PROJECTOR_TYPE_QWEN2VL  ||
+        proj == PROJECTOR_TYPE_QWEN25VL ||
+        proj == PROJECTOR_TYPE_QWEN3VL;
+    return is_qwen_vl ? 2 : 1;
+}
+
 //
 // API used internally with mtmd
 //
--- a/tools/mtmd/clip.h
+++ b/tools/mtmd/clip.h
@@ -20,6 +20,12 @@ struct clip_image_size {
    bool operator==(const clip_image_size & other) const {
        return width == other.width && height == other.height;
    }
+    bool operator!=(const clip_image_size & other) const {
+        return width != other.width || height != other.height; // exact negation of operator==
+    }
+    size_t area() const { // width * height, computed in size_t so very large images cannot overflow int
+        return (size_t) width * (size_t) height;
+    }
 };

 struct clip_image_f32;
@@ -101,6 +107,8 @@ bool clip_is_llava(const struct clip_ctx * ctx);
 bool clip_has_vision_encoder(const struct clip_ctx * ctx);
 bool clip_has_audio_encoder(const struct clip_ctx * ctx);

+int clip_model_n_batch_max(const struct clip_ctx * ctx);
+
 std::map<ggml_backend_dev_t, size_t> clip_get_mem_usage(const struct clip_ctx * ctx);

 struct clip_cap {
--- a/tools/mtmd/models/models.h
+++ b/tools/mtmd/models/models.h
@@ -31,10 +31,11 @@ struct clip_graph_pixtral : clip_graph {
 struct clip_graph_qwen2vl : clip_graph {
    clip_graph_qwen2vl(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
    ggml_cgraph * build() override;
+    ggml_tensor * build_inp_with_temporal_merge(); // patch-embedding input for 1 or 2 frames; also called by clip_graph_qwen3vl::build()
 };

-struct clip_graph_qwen3vl : clip_graph {
-    clip_graph_qwen3vl(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
+struct clip_graph_qwen3vl : clip_graph_qwen2vl { // derives from the qwen2vl builder to reuse build_inp_with_temporal_merge()
+    clip_graph_qwen3vl(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph_qwen2vl(ctx, img) {}
    ggml_cgraph * build() override;
 };

--- a/tools/mtmd/models/qwen2vl.cpp
+++ b/tools/mtmd/models/qwen2vl.cpp
@@ -1,5 +1,34 @@
 #include "models.h"

+ggml_tensor * clip_graph_qwen2vl::build_inp_with_temporal_merge() {
+    ggml_tensor * inp_raw = build_inp_raw();
+    // result = conv(patch_embeddings_0, frame A) + conv(patch_embeddings_1, frame B); both sides must be a multiple of 2 patches
+    GGML_ASSERT(img.nx() % (patch_size * 2) == 0);
+    GGML_ASSERT(img.ny() % (patch_size * 2) == 0);
+
+    const size_t nb1 = ggml_row_size(inp_raw->type, img.nx());            // bytes per image row
+    const size_t nb2 = ggml_row_size(inp_raw->type, img.nx() * img.ny()); // bytes per channel plane
+
+    if (n_batch == 1) {
+        // still image input: the same frame feeds both convolutions
+        return ggml_add(ctx0,
+            ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_raw, patch_size, patch_size, 0, 0, 1, 1),
+            ggml_conv_2d(ctx0, model.patch_embeddings_1, inp_raw, patch_size, patch_size, 0, 0, 1, 1));
+    } else if (n_batch == 2) {
+        // 2 frames input (video input): each frame feeds its own convolution
+        ggml_tensor * inp_0 = ggml_view_3d(ctx0, inp_raw,
+                                    img.nx(), img.ny(), 3, nb1, nb2, 0);
+        ggml_tensor * inp_1 = ggml_view_3d(ctx0, inp_raw,
+                                    img.nx(), img.ny(), 3, nb1, nb2,
+                                    nb2 * 3); // skip the 3 channel planes of the first frame
+        return ggml_add(ctx0,
+            ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_0, patch_size, patch_size, 0, 0, 1, 1),
+            ggml_conv_2d(ctx0, model.patch_embeddings_1, inp_1, patch_size, patch_size, 0, 0, 1, 1));
+    }
+    // GGML_ABORT does not return, so no path falls off the end of this non-void function
+    GGML_ABORT("unsupported n_batch = %d (only 1 or 2 frames can be merged)", n_batch);
+}
+
 ggml_cgraph * clip_graph_qwen2vl::build() {
    GGML_ASSERT(model.patch_bias == nullptr);
    GGML_ASSERT(model.class_embedding == nullptr);
@@ -16,17 +45,10 @@ ggml_cgraph * clip_graph_qwen2vl::build() {

    int mrope_sections[4] = {d_head/4, d_head/4, d_head/4, d_head/4};

-    ggml_tensor * inp_raw = build_inp_raw();
-    ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
-
-    GGML_ASSERT(img.nx() % (patch_size * 2) == 0);
-    GGML_ASSERT(img.ny() % (patch_size * 2) == 0);
+    ggml_tensor * inp = build_inp_with_temporal_merge();

    // second conv dimension
    {
-        auto inp_1 = ggml_conv_2d(ctx0, model.patch_embeddings_1, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
-        inp = ggml_add(ctx0, inp, inp_1);
-
        inp = ggml_permute(ctx0, inp, 1, 2, 0, 3);  // [w, h, c, b] -> [c, w, h, b]
        inp = ggml_cont_4d(
            ctx0, inp,
--- a/tools/mtmd/models/qwen3vl.cpp
+++ b/tools/mtmd/models/qwen3vl.cpp
@@ -13,17 +13,10 @@ ggml_cgraph * clip_graph_qwen3vl::build() {

    int mrope_sections[4] = {d_head/4, d_head/4, d_head/4, d_head/4};

-    ggml_tensor * inp_raw = build_inp_raw();
-    ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
+    ggml_tensor * inp = build_inp_with_temporal_merge();

-    GGML_ASSERT(img.nx() % (patch_size * 2) == 0);
-    GGML_ASSERT(img.ny() % (patch_size * 2) == 0);
-
-    // second conv dimension
+    // spatial merge
    {
-        auto inp_1 = ggml_conv_2d(ctx0, model.patch_embeddings_1, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
-        inp = ggml_add(ctx0, inp, inp_1);
-
        inp = ggml_permute(ctx0, inp, 1, 2, 0, 3);  // [w, h, c, b] -> [c, w, h, b]
        inp = ggml_cont_4d(
            ctx0, inp,
--- a/tools/mtmd/mtmd-image.cpp
+++ b/tools/mtmd/mtmd-image.cpp
@@ -1116,7 +1116,7 @@ bool mtmd_image_preprocessor_deepseekocr::preprocess(const clip_image_u8 & img,
    static constexpr int native_resolutions[] = { 1024 /* base */, 1280 /* large */ };
    // TODO: support 512 (tiny) and 640 (small) once we have eval data for them

-    const int64_t orig_area = static_cast<int64_t>(img.n_pixels());
+    const int64_t orig_area = static_cast<int64_t>(img.get_size().area());

    size_t  mode_i   = 0;
    int64_t min_diff = std::numeric_limits<int64_t>::max();
--- a/tools/mtmd/mtmd.cpp
+++ b/tools/mtmd/mtmd.cpp
@@ -24,10 +24,11 @@
 #include <climits>
 #include <vector>

-// represents raw image data, layout is RGBRGBRGB...
-// length of data must be nx * ny * 3
+// for still image data, layout is RGBRGBRGB...
+// length of data must be nx * ny * 3 bytes
+//
 // for audio bitmap: nx = sample count, ny = 1, layout is F32 F32 F32 ...
-// length of data must be nx * sizeof(float)
+// length of data must be nx * sizeof(float) bytes
 struct mtmd_bitmap {
    uint32_t nx = 0;
    uint32_t ny = 0;
@@ -35,7 +36,7 @@ struct mtmd_bitmap {
    bool is_audio = false; // true if the bitmap is audio

    mtmd_bitmap(const unsigned char * data, uint32_t nx, uint32_t ny)
-        : nx(nx), ny(ny) {
+        : nx(nx), ny(ny), is_audio(false) {
        if (data) {
            size_t data_size = (size_t)nx * ny * 3;
            this->data.resize(data_size);
@@ -64,6 +65,11 @@ struct mtmd_bitmap {
        return data.size();
    }

+    bool can_batch_with(const mtmd_bitmap & other) const {
+        const bool same_dims = (nx == other.nx) && (ny == other.ny); // [QWEN_VIDEO] both bitmaps must have the same size
+        return same_dims && !(is_audio || other.is_audio);           // ...and neither of them may be audio
+    }
+
  private:
    std::vector<unsigned char> data;
 };
@@ -750,16 +756,55 @@ struct mtmd_tokenizer {
        cur.entries.clear();
        std::vector<std::string> parts = split_text(input_text, ctx->media_marker);
        size_t i_bm = 0; // index of the current bitmap
+
+        // [QWEN_VIDEO] handle frame merging for models that support it (i.e. qwen-vl)
+        int n_merge_frames = 1;
+        if (ctx->ctx_v) {
+            n_merge_frames = clip_model_n_batch_max(ctx->ctx_v);
+            GGML_ASSERT(n_merge_frames <= 2 && "we only support merging maximum 2 images for now; open an issue if this model supports merging more");
+        }
+
+        std::vector<std::vector<const mtmd_bitmap *>> merged_bitmaps;
+        if (n_merge_frames > 1) {
+            size_t i_bm_scan = 0;
+            for (size_t i = 0; i < parts.size(); ++i) {
+                if (parts[i] != ctx->media_marker) {
+                    continue;
+                }
+                if (i + 1 < parts.size()
+                        && parts[i + 1] == ctx->media_marker
+                        && i_bm_scan + 1 < bitmaps.size()) {
+                    const mtmd_bitmap * bm_a = bitmaps[i_bm_scan];
+                    const mtmd_bitmap * bm_b = bitmaps[i_bm_scan + 1];
+                    if (bm_a->can_batch_with(*bm_b)) {
+                        LOG_DBG("%s: merging 2 frames at bitmap index %zu and %zu\n", __func__, i_bm_scan, i_bm_scan + 1);
+                        merged_bitmaps.push_back({bm_a, bm_b});
+                        parts.erase(parts.begin() + i + 1); // remove the second marker
+                        i_bm_scan += 2;
+                        continue;
+                    }
+                }
+                LOG_DBG("%s: no merging for bitmap index %zu\n", __func__, i_bm_scan);
+                merged_bitmaps.push_back({bitmaps[i_bm_scan]});
+                ++i_bm_scan;
+            }
+        } else {
+            for (size_t i = 0; i < bitmaps.size(); ++i) {
+                merged_bitmaps.push_back({bitmaps[i]});
+            }
+        }
+
+        i_bm = 0;
        for (auto & part : parts) {
            if (part == ctx->media_marker) {
                // this is a marker, we should add the next bitmap
-                if (i_bm >= bitmaps.size()) {
+                if (i_bm >= merged_bitmaps.size()) {
                    LOG_ERR("%s: error: number of bitmaps (%zu) does not match number of markers (%zu)\n",
-                            __func__, bitmaps.size(), parts.size() - 1);
+                            __func__, merged_bitmaps.size(), parts.size() - 1);
                    return 1;
                }
-                const mtmd_bitmap * bitmap = bitmaps[i_bm++];
-                int32_t res = add_media(bitmap);
+                auto & bmps = merged_bitmaps[i_bm++];
+                int32_t res = add_media(bmps);
                if (res != 0) {
                    return res;
                }
@@ -794,9 +839,9 @@ struct mtmd_tokenizer {
            }
        }

-        if (i_bm != bitmaps.size()) {
+        if (i_bm != merged_bitmaps.size()) {
            LOG_ERR("%s: error: number of bitmaps (%zu) does not match number of markers (%zu)\n",
-                    __func__, bitmaps.size(), parts.size() - 1);
+                    __func__, merged_bitmaps.size(), parts.size() - 1);
            return 1;
        }

@@ -835,8 +880,10 @@ struct mtmd_tokenizer {
        }
    }

-    int32_t add_media(const mtmd_bitmap * bitmap) {
-        if (!bitmap->is_audio) {
+    int32_t add_media(std::vector<const mtmd_bitmap *> & bitmaps) {
+        GGML_ASSERT(!bitmaps.empty());
+
+        if (!bitmaps[0]->is_audio) {
            // handle image

            if (!ctx->ctx_v) {
@@ -848,27 +895,44 @@ struct mtmd_tokenizer {
                add_text(ctx->img_beg, true); // add image begin token
            }

-            // sanity check
-            if (bitmap->nx <= 0 || bitmap->ny <= 0) {
-                LOG_ERR("%s: error: invalid bitmap dimensions: nx = %d, ny = %d\n",
-                        __func__, bitmap->nx, bitmap->ny);
-                return 2;
-            }
-            GGML_ASSERT(ctx->image_preproc != nullptr);
+            // TODO @ngxson : this is quite hacky because the preprocessor only supports a batch with one single element; that needs to be fixed in the future (e.g. by changing the preprocessor interface to always take a single input)

-            // convert mtmd_bitmap to clip_image_u8
-            clip_image_u8_ptr img_u8(clip_image_u8_init());
-            img_u8->set_size(
-                {(int)bitmap->nx, (int)bitmap->ny},
-                bitmap->is_placeholder());
-            img_u8->cpy_buf(bitmap->get_ro_buf());
-
-            // preprocess image
            clip_image_f32_batch batch_f32;
-            bool ok = ctx->image_preproc->preprocess(*img_u8, batch_f32);
-            if (!ok) {
-                LOG_ERR("Unable to preprocess image\n");
-                return 2;
+
+            for (const auto * bmp : bitmaps) {
+                // sanity check
+                GGML_ASSERT(!bmp->is_audio);
+                GGML_ASSERT(ctx->image_preproc != nullptr);
+                if (bmp->nx <= 0 || bmp->ny <= 0) {
+                    LOG_ERR("%s: error: invalid bitmap dimensions: nx = %d, ny = %d\n",
+                            __func__, bmp->nx, bmp->ny);
+                    return 2;
+                }
+
+                // convert mtmd_bitmap to clip_image_u8
+                clip_image_u8_ptr img_u8(clip_image_u8_init());
+                img_u8->set_size(
+                    {(int)bmp->nx, (int)bmp->ny},
+                    bmp->is_placeholder());
+                img_u8->cpy_buf(bmp->get_ro_buf());
+
+                // preprocess image
+                clip_image_f32_batch tmp_batch;
+                bool ok = ctx->image_preproc->preprocess(*img_u8, tmp_batch);
+                if (!ok) {
+                    LOG_ERR("Unable to preprocess image\n");
+                    return 2;
+                }
+
+                // move entries and grid dimensions to the "global" batch_f32
+                for (auto & entry : tmp_batch.entries) {
+                    batch_f32.entries.emplace_back(std::move(entry));
+                }
+
+                // for llava-uhd style, we need to handle grid too
+                // we don't care about overwriting these values for now because llava-uhd doesn't support batching anyway
+                batch_f32.grid_x = tmp_batch.grid_x;
+                batch_f32.grid_y = tmp_batch.grid_y;
            }

            // Annotate llava-next style tiles so clip_n_output_tokens accounts
@@ -896,11 +960,14 @@ struct mtmd_tokenizer {
                || ctx->slice_tmpl == MTMD_SLICE_TMPL_STEP3VL
                || (ctx->slice_tmpl == MTMD_SLICE_TMPL_LFM2 && has_tiling_grid)
            ) {
+                // [QWEN_VIDEO] we do not support "frame merging" for llava-uhd style, so no batching for now
+                GGML_ASSERT(bitmaps.size() == 1);
+
                const int n_col = batch_f32.grid_x;
                const int n_row = batch_f32.grid_y;
                // split batch into chunks of single images
                // NOTE: batch_f32 will be invalidated after this call
-                auto chunks = split_batch_to_chunk(std::move(batch_f32), bitmap->id);
+                auto chunks = split_batch_to_chunk(std::move(batch_f32), bitmaps[0]->id);
                GGML_ASSERT(chunks.size() > 0);

                auto ov_chunk = std::move(chunks.front());
@@ -954,6 +1021,10 @@ struct mtmd_tokenizer {
                size_t n_tokens = 0;
                for (const auto & e : batch_f32.entries) {
                    n_tokens += clip_n_output_tokens(ctx->ctx_v, e.get());
+                    if (clip_model_n_batch_max(ctx->ctx_v) == 2) {
+                        // [QWEN_VIDEO] pair input is merged to the same embd, so only count as one image
+                        break;
+                    }
                }

                mtmd_image_tokens_ptr image_tokens(new mtmd_image_tokens);
@@ -976,7 +1047,7 @@ struct mtmd_tokenizer {
                    GGML_ASSERT(n_tokens == (size_t)image_tokens->n_tokens());
                }
                image_tokens->batch_f32 = std::move(batch_f32);
-                image_tokens->id = bitmap->id; // optional
+                image_tokens->id = bitmaps[0]->id; // optional

                LOG_DBG("image_tokens->nx = %d\n", image_tokens->nx);
                LOG_DBG("image_tokens->ny = %d\n", image_tokens->ny);
@@ -1001,6 +1072,9 @@ struct mtmd_tokenizer {
        } else {
            // handle audio

+            GGML_ASSERT(bitmaps.size() == 1); // no batching support for now
+            auto & bitmap = bitmaps[0];
+
            if (!ctx->ctx_a) {
                LOG_ERR("%s: error: model does not support audio input\n", __func__);
                return 2;
--- a/tools/mtmd/mtmd.h
+++ b/tools/mtmd/mtmd.h
@@ -133,6 +133,8 @@ MTMD_API int mtmd_get_audio_sample_rate(const mtmd_context * ctx);
 // if bitmap is image:
 //     length of data must be nx * ny * 3
 //     the data is in RGBRGBRGB... format
+//     note: some video-capable models (i.e. qwen-vl) can merge 2 consecutive same-size bitmaps
+//           into one chunk; mtmd_tokenize() handles this automatically
 // if bitmap is audio:
 //     length of data must be n_samples * sizeof(float)
 //     the data is in float format (PCM F32)
Author	SHA1	Message	Date
Xuan Son Nguyen	b031b6062d	nits 2	2026-06-06 18:25:05 +02:00
Xuan Son Nguyen	9819ad4317	nits	2026-06-06 18:23:06 +02:00
Xuan Son Nguyen	82b48212fd	fix llava-uhd case	2026-06-06 18:20:01 +02:00
Xuan Son Nguyen	a404c4ea0b	revise the design	2026-06-06 17:44:23 +02:00
Xuan Son Nguyen	96e24ca55f	Merge branch 'master' into mtmd-video-api	2026-06-06 13:38:12 +02:00
Xuan Son Nguyen	e5b3d6d1f2	Merge branch 'master' into mtmd-video-api	2026-06-05 18:37:15 +02:00
Xuan Son Nguyen	c5b682b25c	various clean up	2026-04-13 17:39:14 +02:00
Xuan Son Nguyen	f558360b32	Merge branch 'master' into video-support	2026-04-13 15:40:05 +02:00
andrewmd5	573f2cf58e	feat: add video support for Qwen3.5	2026-03-06 21:37:07 +09:00