summaryrefslogtreecommitdiff
path: root/llama.cpp/tools/mtmd/mtmd.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'llama.cpp/tools/mtmd/mtmd.cpp')
-rw-r--r--llama.cpp/tools/mtmd/mtmd.cpp1151
1 files changed, 1151 insertions, 0 deletions
diff --git a/llama.cpp/tools/mtmd/mtmd.cpp b/llama.cpp/tools/mtmd/mtmd.cpp
new file mode 100644
index 0000000..b763627
--- /dev/null
+++ b/llama.cpp/tools/mtmd/mtmd.cpp
@@ -0,0 +1,1151 @@
+#include "clip.h"
+#include "clip-impl.h"
+#include "mtmd.h"
+#include "mtmd-audio.h"
+
+#include "llama.h"
+
+// fix problem with std::min and std::max
+#if defined(_WIN32)
+#define WIN32_LEAN_AND_MEAN
+#ifndef NOMINMAX
+# define NOMINMAX
+#endif
+#include <windows.h>
+#endif
+
+#include <algorithm>
+#include <cerrno>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <vector>
+
+// represents raw image data, layout is RGBRGBRGB...
+// length of data must be nx * ny * 3
+// Owning container for one raw media item passed to the tokenizer.
+// Images are filled by mtmd_bitmap_init() (nx * ny * 3 bytes); audio is filled by
+// mtmd_bitmap_init_from_audio(), which stores the float samples as raw bytes.
+struct mtmd_bitmap {
+ uint32_t nx; // image size along x; for audio this is set to the number of samples
+ uint32_t ny; // image size along y; for audio this is set to 1
+ std::vector<unsigned char> data; // raw bytes: RGB triplets for images, float samples for audio
+ std::string id; // optional user-defined id, for ex: can be set to image hash, useful for KV cache tracking
+ bool is_audio = false; // true if the bitmap is audio
+};
+
+// Token-grid description of one image plus the preprocessed patches that will
+// be encoded into embeddings for it.
+// NOTE: member order matters, clone() relies on positional aggregate initialization.
+struct mtmd_image_tokens {
+    uint32_t nx; // token count along x
+    uint32_t ny; // token count along y
+    bool use_mrope_pos = false; // M-RoPE position counting: the whole image counts as 1 temporal position
+    uint32_t n_tokens() const { return nx * ny; }
+    clip_image_f32_batch batch_f32; // preprocessed image patches
+    std::string id; // optional user-defined ID, useful for KV cache tracking
+
+    // returns an independent copy; the patch batch is duplicated through its own clone()
+    mtmd_image_tokens clone() {
+        return { this->nx, this->ny, this->use_mrope_pos, this->batch_f32.clone(), this->id };
+    }
+};
+// owning handle used by mtmd_input_chunk
+using mtmd_image_tokens_ptr = std::unique_ptr<mtmd_image_tokens>;
+
+// Token count of one audio segment plus the preprocessed input that will be
+// encoded into embeddings for it.
+// NOTE: member order matters, clone() relies on positional aggregate initialization.
+struct mtmd_audio_tokens {
+    uint32_t n_tokens; // number of tokens
+    clip_image_f32_batch batch_f32; // preprocessed audio input (the tokenizer stores one mel spectrogram chunk here)
+    std::string id; // optional user-defined ID, useful for KV cache tracking
+
+    // returns an independent copy; the batch is duplicated through its own clone()
+    mtmd_audio_tokens clone() {
+        return { this->n_tokens, this->batch_f32.clone(), this->id };
+    }
+};
+// owning handle used by mtmd_input_chunk
+using mtmd_audio_tokens_ptr = std::unique_ptr<mtmd_audio_tokens>;
+
+// One element of a tokenized prompt: either a run of text tokens, an image or an audio segment.
+// `type` selects which of the three payload members is meaningful.
+// NOTE: this struct is brace-initialized positionally throughout this file, keep the member order.
+struct mtmd_input_chunk {
+ mtmd_input_chunk_type type;
+ std::vector<llama_token> tokens_text; // filled for text chunks
+ mtmd_image_tokens_ptr tokens_image; // set for image chunks, nullptr otherwise
+ mtmd_audio_tokens_ptr tokens_audio; // set for audio chunks, nullptr otherwise
+};
+
+// Ordered list of chunks produced by mtmd_tokenize().
+struct mtmd_input_chunks {
+ std::vector<mtmd_input_chunk> entries; // chunks in prompt order
+};
+
+// slice template, used by some llava-uhd models to correctly place the special tokens around image embeddings
+// models not having it (llava-1.6) will process embeddings without any special tokens in-between
+// The value is chosen in mtmd_context::init_vision() and read by the tokenizer
+// to decide whether an image batch is split into overview + slices.
+enum mtmd_slice_tmpl {
+ MTMD_SLICE_TMPL_NONE, // default: no slice template
+ MTMD_SLICE_TMPL_MINICPMV_2_5, // selected when clip_is_minicpmv() reports version 2
+ MTMD_SLICE_TMPL_MINICPMV_2_6, // selected for minicpmv versions 3, 4, 5, 6 and 100045
+ MTMD_SLICE_TMPL_LLAMA4, // selected for PROJECTOR_TYPE_LLAMA4
+ MTMD_SLICE_TMPL_IDEFICS3, // selected for PROJECTOR_TYPE_IDEFICS3
+ MTMD_SLICE_TMPL_LFM2, // selected for PROJECTOR_TYPE_LFM2
+};
+
+// Default placeholder string used in prompt text to mark where a media item is inserted.
+// The returned pointer refers to static storage and must not be freed.
+const char * mtmd_default_marker() {
+    static const char default_marker[] = "<__media__>";
+    return default_marker;
+}
+
+// Translate the llama flash-attention setting into the matching clip setting.
+// Any value other than DISABLED / ENABLED (including AUTO) maps to AUTO.
+static clip_flash_attn_type mtmd_get_clip_flash_attn_type(enum llama_flash_attn_type flash_attn_type) {
+    if (flash_attn_type == LLAMA_FLASH_ATTN_TYPE_DISABLED) {
+        return CLIP_FLASH_ATTN_TYPE_DISABLED;
+    }
+    if (flash_attn_type == LLAMA_FLASH_ATTN_TYPE_ENABLED) {
+        return CLIP_FLASH_ATTN_TYPE_ENABLED;
+    }
+    return CLIP_FLASH_ATTN_TYPE_AUTO;
+}
+
+// Build the default set of context parameters.
+// The initializer is positional; the inline labels name the field each value is intended for.
+mtmd_context_params mtmd_context_params_default() {
+    return mtmd_context_params {
+        /* use_gpu           */ true,
+        /* print_timings     */ true,
+        /* n_threads         */ 4,
+        /* image_marker      */ MTMD_DEFAULT_IMAGE_MARKER,
+        /* media_marker      */ mtmd_default_marker(),
+        /* flash_attn_type   */ LLAMA_FLASH_ATTN_TYPE_AUTO,
+        /* warmup            */ true,
+        /* image_min_tokens  */ -1,
+        /* image_max_tokens  */ -1,
+        /* cb_eval           */ nullptr,
+        /* cb_eval_user_data */ nullptr,
+    };
+}
+
+// Runtime state for multimodal input: owns the vision and/or audio clip contexts,
+// the output embedding buffer and the model-specific marker strings / tokens that
+// the tokenizer places around media embeddings.
+struct mtmd_context {
+    struct clip_ctx * ctx_v = nullptr; // vision
+    struct clip_ctx * ctx_a = nullptr; // audio
+    const struct llama_model * text_model;
+    std::vector<float> image_embd_v; // image embedding vector
+
+    bool print_timings;
+    int n_threads;
+    std::string media_marker;
+    const int n_embd_text;
+
+    // these are not token, but strings used to mark the beginning and end of image/audio embeddings
+    std::string img_beg;
+    std::string img_end;
+    std::string aud_beg;
+    std::string aud_end;
+
+    // for llava-uhd style models, we need special tokens in-between slices
+    // minicpmv calls them "slices", llama 4 calls them "tiles"
+    mtmd_slice_tmpl slice_tmpl = MTMD_SLICE_TMPL_NONE;
+    std::vector<llama_token> tok_ov_img_start; // overview image
+    std::vector<llama_token> tok_ov_img_end; // overview image
+    std::vector<llama_token> tok_slices_start; // start of all slices
+    std::vector<llama_token> tok_slices_end; // end of all slices
+    std::vector<llama_token> tok_sli_img_start; // single slice start
+    std::vector<llama_token> tok_sli_img_end; // single slice end
+    std::vector<llama_token> tok_sli_img_mid; // between 2 slices
+    std::vector<llama_token> tok_row_end; // end of row
+    bool tok_row_end_trail = false;
+    bool ov_img_first = false;
+
+    // string template for slice image delimiters with row/col (idefics3)
+    std::string sli_img_start_tmpl;
+
+    std::unique_ptr<mtmd_audio_preprocessor> audio_preproc;
+
+    // TODO @ngxson : add timings
+
+    // Load the projector file and set up vision and/or audio support.
+    // Throws std::runtime_error when:
+    //   - a custom image_marker is given, or media_marker is empty
+    //   - neither a vision nor an audio context could be loaded
+    //   - the embedding sizes of vision / audio / text model do not agree
+    mtmd_context(const char * mmproj_fname,
+            const llama_model * text_model,
+            const mtmd_context_params & ctx_params) :
+        text_model   (text_model),
+        print_timings(ctx_params.print_timings),
+        n_threads    (ctx_params.n_threads),
+        media_marker (ctx_params.media_marker),
+        n_embd_text  (llama_model_n_embd_inp(text_model))
+    {
+        if (std::string(ctx_params.image_marker) != MTMD_DEFAULT_IMAGE_MARKER) {
+            throw std::runtime_error("custom image_marker is not supported anymore, use media_marker instead");
+        }
+
+        if (media_marker.empty()) {
+            throw std::runtime_error("media_marker must not be empty");
+        }
+
+        clip_context_params ctx_clip_params {
+            /* use_gpu           */ ctx_params.use_gpu,
+            // honor the caller's setting instead of always using AUTO
+            /* flash_attn_type   */ mtmd_get_clip_flash_attn_type(ctx_params.flash_attn_type),
+            /* image_min_tokens  */ ctx_params.image_min_tokens,
+            /* image_max_tokens  */ ctx_params.image_max_tokens,
+            /* warmup            */ ctx_params.warmup,
+            /* cb_eval           */ ctx_params.cb_eval,
+            /* cb_eval_user_data */ ctx_params.cb_eval_user_data,
+        };
+
+        auto res = clip_init(mmproj_fname, ctx_clip_params);
+        ctx_v = res.ctx_v;
+        ctx_a = res.ctx_a;
+
+        // the destructor is not invoked when a constructor throws,
+        // so the clip contexts must be released here to avoid leaking them
+        try {
+            if (!ctx_v && !ctx_a) {
+                throw std::runtime_error(string_format("Failed to load CLIP model from %s\n", mmproj_fname));
+            }
+
+            // if both vision and audio mmproj are present, we need to validate their n_embd
+            if (ctx_v && ctx_a) {
+                int n_embd_v = clip_n_mmproj_embd(ctx_v);
+                int n_embd_a = clip_n_mmproj_embd(ctx_a);
+                if (n_embd_v != n_embd_a) {
+                    throw std::runtime_error(string_format(
+                        "mismatch between vision and audio mmproj (n_embd_v = %d, n_embd_a = %d)\n",
+                        n_embd_v, n_embd_a));
+                }
+            }
+
+            // since we already validate n_embd of vision and audio mmproj,
+            // we can safely assume that they are the same
+            int n_embd_clip = clip_n_mmproj_embd(ctx_v ? ctx_v : ctx_a);
+            if (n_embd_text != n_embd_clip) {
+                throw std::runtime_error(string_format(
+                    "mismatch between text model (n_embd = %d) and mmproj (n_embd = %d)\n"
+                    "hint: you may be using wrong mmproj\n",
+                    n_embd_text, n_embd_clip));
+            }
+            if (ctx_v) {
+                init_vision();
+            }
+            if (ctx_a) {
+                init_audio();
+            }
+        } catch (...) {
+            // same calls as the destructor, which already passes a null context
+            // to clip_free() when only one modality is loaded
+            clip_free(ctx_a);
+            clip_free(ctx_v);
+            ctx_a = nullptr;
+            ctx_v = nullptr;
+            throw;
+        }
+    }
+
+    // Select the slice template and the begin/end markers for the loaded vision projector.
+    void init_vision() {
+        GGML_ASSERT(ctx_v != nullptr);
+
+        projector_type proj = clip_get_projector_type(ctx_v);
+        int minicpmv_version = clip_is_minicpmv(ctx_v);
+        if (minicpmv_version == 2) {
+            // minicpmv 2.5 format:
+            // <image> (overview) </image><slice><image> (slice) </image><image> (slice) </image>\n ... </slice>
+            slice_tmpl = MTMD_SLICE_TMPL_MINICPMV_2_5;
+            tok_ov_img_start = {lookup_token("<image>")};
+            tok_ov_img_end = {lookup_token("</image>")};
+            tok_slices_start = {lookup_token("<slice>")};
+            tok_slices_end = {lookup_token("</slice>")};
+            tok_sli_img_start = tok_ov_img_start;
+            tok_sli_img_end = tok_ov_img_end;
+            tok_row_end = {lookup_token("\n")};
+            tok_row_end_trail = false; // no trailing end-of-row token
+            ov_img_first = true;
+
+        } else if (minicpmv_version == 3 || minicpmv_version == 4 || minicpmv_version == 5 || minicpmv_version == 6 || minicpmv_version == 100045) {
+            // minicpmv 2.6 format:
+            // <image> (overview) </image><slice> (slice) </slice><slice> (slice) </slice>\n ...
+            slice_tmpl = MTMD_SLICE_TMPL_MINICPMV_2_6;
+            tok_ov_img_start = {lookup_token("<image>")};
+            tok_ov_img_end = {lookup_token("</image>")};
+            tok_sli_img_start = {lookup_token("<slice>")};
+            tok_sli_img_end = {lookup_token("</slice>")};
+            tok_row_end = {lookup_token("\n")};
+            tok_row_end_trail = false; // no trailing end-of-row token
+            ov_img_first = true;
+
+        } else if (minicpmv_version != 0) {
+            GGML_ASSERT(false && "unsupported minicpmv version");
+        } else if (proj == PROJECTOR_TYPE_LLAMA4) {
+            // llama 4 format:
+            // <|image_start|>
+            // (slice) <|tile_x_separator|> (slice) <|tile_x_separator|> ... <|tile_y_separator|>
+            // (slice) <|tile_x_separator|> (slice) <|tile_x_separator|> ... <|tile_y_separator|>
+            // ... <|tile_y_separator|> <-- trailing end-of-row token
+            // <|image|> (overview) <-- overview image is last
+            // <|image_end|>
+            slice_tmpl = MTMD_SLICE_TMPL_LLAMA4;
+            tok_ov_img_start = {lookup_token("<|image|>")};
+            tok_sli_img_mid = {lookup_token("<|tile_x_separator|>")};
+            tok_row_end = {lookup_token("<|tile_y_separator|>")};
+            tok_row_end_trail = true; // add trailing end-of-row token
+            ov_img_first = false; // overview image is last
+        }
+
+        // set boi/eoi
+        if (proj == PROJECTOR_TYPE_GEMMA3 || proj == PROJECTOR_TYPE_GEMMA3NV) {
+            // <start_of_image> ... (image embeddings) ... <end_of_image>
+            img_beg = "<start_of_image>";
+            img_end = "<end_of_image>";
+
+        } else if (proj == PROJECTOR_TYPE_IDEFICS3) {
+            // https://github.com/huggingface/transformers/blob/a42ba80fa520c784c8f11a973ca9034e5f859b79/src/transformers/models/idefics3/processing_idefics3.py#L192-L215
+            slice_tmpl = MTMD_SLICE_TMPL_IDEFICS3;
+            tok_ov_img_start = {lookup_token("\n\n"), lookup_token("<fake_token_around_image>"), lookup_token("<global-img>")};
+            tok_ov_img_end = {lookup_token("<fake_token_around_image>")};
+            tok_row_end = {lookup_token("\n")};
+            sli_img_start_tmpl = "<fake_token_around_image><row_%d_col_%d>";
+
+        } else if (proj == PROJECTOR_TYPE_PIXTRAL) {
+            // https://github.com/huggingface/transformers/blob/1cd110c6cb6a6237614130c470e9a902dbc1a4bd/docs/source/en/model_doc/pixtral.md
+            img_end = "[IMG_END]";
+
+        } else if (proj == PROJECTOR_TYPE_QWEN2VL || proj == PROJECTOR_TYPE_QWEN25VL || proj == PROJECTOR_TYPE_QWEN3VL || proj == PROJECTOR_TYPE_YOUTUVL) {
+            // <|vision_start|> ... (image embeddings) ... <|vision_end|>
+            img_beg = "<|vision_start|>";
+            img_end = "<|vision_end|>";
+
+        } else if (proj == PROJECTOR_TYPE_LLAMA4) {
+            // (the tile layout is described in the llama 4 slice template branch above)
+            img_beg = "<|image_start|>";
+            img_end = "<|image_end|>";
+            LOG_WRN("%s: llama 4 vision is known to have degraded quality:\n"
+                    " https://github.com/ggml-org/llama.cpp/pull/13282\n", __func__);
+
+        } else if (proj == PROJECTOR_TYPE_INTERNVL) {
+            // <img> ... (image embeddings) ... </img>
+            img_beg = "<img>";
+            img_end = "</img>";
+
+        } else if (proj == PROJECTOR_TYPE_LIGHTONOCR) {
+            // <|im_start|> ... (image embeddings) ... <|im_end|>
+            img_beg = "<|im_start|>";
+            img_end = "<|im_end|>";
+
+        } else if (proj == PROJECTOR_TYPE_LFM2) {
+            // multi-tile:
+            // <|image_start|>
+            // <|img_row_1_col_1|> (tile) <|img_row_1_col_2|> (tile) ...
+            // <|img_thumbnail|> (thumbnail)
+            // <|image_end|>
+            // single-tile:
+            // <|image_start|> (image) <|image_end|>
+            img_beg = "<|image_start|>";
+            img_end = "<|image_end|>";
+            slice_tmpl = MTMD_SLICE_TMPL_LFM2;
+            sli_img_start_tmpl = "<|img_row_%d_col_%d|>";
+            tok_ov_img_start = {lookup_token("<|img_thumbnail|>")};
+            ov_img_first = false;
+        } else if (proj == PROJECTOR_TYPE_GLM4V) {
+            img_beg = "<|begin_of_image|>";
+            img_end = "<|end_of_image|>";
+
+        }
+    }
+
+    // Create the audio preprocessor and the begin/end markers for the loaded audio projector.
+    // Aborts on an unsupported projector type.
+    void init_audio() {
+        GGML_ASSERT(ctx_a != nullptr);
+        projector_type proj = clip_get_projector_type(ctx_a);
+
+        LOG_WRN("%s: audio input is in experimental stage and may have reduced quality:\n"
+                " https://github.com/ggml-org/llama.cpp/discussions/13759\n", __func__);
+
+        // set preprocessor
+        switch (proj) {
+            case PROJECTOR_TYPE_QWEN2A:
+            case PROJECTOR_TYPE_QWEN25O:
+            case PROJECTOR_TYPE_ULTRAVOX:
+            case PROJECTOR_TYPE_VOXTRAL:
+            case PROJECTOR_TYPE_GLMA:
+            case PROJECTOR_TYPE_MUSIC_FLAMINGO:
+                audio_preproc = std::make_unique<mtmd_audio_preprocessor_whisper>(ctx_a);
+                break;
+            case PROJECTOR_TYPE_LFM2A:
+                audio_preproc = std::make_unique<mtmd_audio_preprocessor_conformer>(ctx_a);
+                break;
+            default:
+                GGML_ABORT("unsupported audio projector type");
+        }
+
+        // initialize audio preprocessor
+        audio_preproc->initialize();
+
+        // set special tokens
+        if (proj == PROJECTOR_TYPE_QWEN2A) {
+            // <|audio_bos|> ... (embeddings) ... <|audio_eos|>
+            aud_beg = "<|audio_bos|>";
+            aud_end = "<|audio_eos|>";
+
+        } else if (proj == PROJECTOR_TYPE_ULTRAVOX) {
+            // [BEGIN_AUDIO] ... (embeddings) ...
+            aud_beg = "[BEGIN_AUDIO]";
+
+        } else if (proj == PROJECTOR_TYPE_MUSIC_FLAMINGO) {
+            // <sound> ... (embeddings) ...
+            aud_beg = "<sound>";
+        }
+    }
+
+    // get clip ctx based on chunk type (aborts for chunk types other than image / audio)
+    clip_ctx * get_clip_ctx(const mtmd_input_chunk * chunk) const {
+        if (chunk->type == MTMD_INPUT_CHUNK_TYPE_IMAGE) {
+            return ctx_v;
+        } else if (chunk->type == MTMD_INPUT_CHUNK_TYPE_AUDIO) {
+            return ctx_a;
+        }
+        GGML_ABORT("unknown chunk type");
+    }
+
+    // projector type of the vision context, PROJECTOR_TYPE_UNKNOWN when there is none
+    projector_type proj_type_v() const {
+        return ctx_v ? clip_get_projector_type(ctx_v) : PROJECTOR_TYPE_UNKNOWN;
+    }
+
+    // projector type of the audio context, PROJECTOR_TYPE_UNKNOWN when there is none
+    projector_type proj_type_a() const {
+        return ctx_a ? clip_get_projector_type(ctx_a) : PROJECTOR_TYPE_UNKNOWN;
+    }
+
+    ~mtmd_context() {
+        clip_free(ctx_a);
+        clip_free(ctx_v);
+    }
+
+private:
+    // Find the vocab entry whose piece text equals token_text exactly.
+    // Scans the whole vocab; returns LLAMA_TOKEN_NULL when nothing matches.
+    llama_token lookup_token(const std::string & token_text) {
+        const llama_vocab * vocab = llama_model_get_vocab(text_model);
+        const int n_vocab = llama_vocab_n_tokens(vocab);
+        for (int i = 0; i < n_vocab; i++) {
+            if (token_to_piece(vocab, i, true) == token_text) {
+                return i;
+            }
+        }
+        return LLAMA_TOKEN_NULL;
+    }
+
+    // Convert one token to its text piece, growing the buffer when the first attempt
+    // reports (as a negative value) that more space is required.
+    std::string token_to_piece(const llama_vocab * vocab, llama_token token, bool special) {
+        std::string piece;
+        piece.resize(piece.capacity()); // using string internal cache, 15 bytes + '\0'
+        const int n_chars = llama_token_to_piece(vocab, token, &piece[0], piece.size(), 0, special);
+        if (n_chars < 0) {
+            piece.resize(-n_chars);
+            int check = llama_token_to_piece(vocab, token, &piece[0], piece.size(), 0, special);
+            GGML_ASSERT(check == -n_chars);
+        } else {
+            piece.resize(n_chars);
+        }
+        return piece;
+    }
+};
+
+// Create a context from a projector file.
+// Any std::exception raised during construction is logged and turned into a nullptr result.
+// The returned context must be released with mtmd_free().
+mtmd_context * mtmd_init_from_file(const char * mmproj_fname,
+        const struct llama_model * text_model,
+        const struct mtmd_context_params ctx_params) {
+    mtmd_context * created = nullptr;
+    try {
+        created = new mtmd_context(mmproj_fname, text_model, ctx_params);
+    } catch (const std::exception & e) {
+        LOG_ERR("%s: error: %s\n", __func__, e.what());
+        created = nullptr;
+    }
+    return created;
+}
+
+// Destroy a context created by mtmd_init_from_file(); passing nullptr is a no-op.
+void mtmd_free(mtmd_context * ctx) {
+    if (ctx != nullptr) {
+        delete ctx;
+    }
+}
+
+// One-shot helper that turns prompt text plus a list of bitmaps into a sequence of
+// text / image / audio chunks. Every occurrence of the media marker in the text
+// consumes the next bitmap, in order.
+struct mtmd_tokenizer {
+    mtmd_context * ctx;
+    std::vector<const mtmd_bitmap *> bitmaps;
+
+    std::string input_text;
+    bool add_special;
+    bool parse_special;
+    const llama_vocab * vocab;
+
+    mtmd_input_chunks cur; // chunks built so far, moved to the caller on success
+
+    mtmd_tokenizer(mtmd_context * ctx,
+            const mtmd_input_text * text,
+            const mtmd_bitmap ** bitmaps,
+            size_t n_bitmaps) : ctx(ctx), bitmaps(bitmaps, bitmaps + n_bitmaps) {
+        add_special = text->add_special;
+        parse_special = text->parse_special;
+        input_text = text->text;
+        vocab = llama_model_get_vocab(ctx->text_model);
+
+        // for compatibility, we convert image marker to media marker
+        string_replace_all(input_text, MTMD_DEFAULT_IMAGE_MARKER, ctx->media_marker);
+    }
+
+    // Run the tokenization and move the result into *output.
+    // Returns 0 on success, 1 when the number of bitmaps does not match the number of
+    // markers, or the non-zero code reported by add_media().
+    int32_t tokenize(mtmd_input_chunks * output) {
+        cur.entries.clear();
+        std::vector<std::string> parts = split_text(input_text, ctx->media_marker);
+        // parts holds text parts as well as markers, so count the markers explicitly for error reporting
+        const size_t n_markers = (size_t) std::count(parts.begin(), parts.end(), ctx->media_marker);
+        size_t i_bm = 0; // index of the current bitmap
+        for (auto & part : parts) {
+            if (part == ctx->media_marker) {
+                // this is a marker, we should add the next bitmap
+                if (i_bm >= bitmaps.size()) {
+                    LOG_ERR("%s: error: number of bitmaps (%zu) does not match number of markers (%zu)\n",
+                            __func__, bitmaps.size(), n_markers);
+                    return 1;
+                }
+                const mtmd_bitmap * bitmap = bitmaps[i_bm++];
+                int32_t res = add_media(bitmap);
+                if (res != 0) {
+                    return res;
+                }
+            } else {
+                // this is a text part, we should add it as text
+                add_text(part, parse_special);
+            }
+        }
+
+        if (add_special && llama_vocab_get_add_bos(vocab)) {
+            // if first chunk is text, we add BOS token to first text chunk
+            // otherwise, create a new text chunk with BOS token
+            if (!cur.entries.empty() && cur.entries[0].type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
+                // add BOS token to the beginning of first text chunk
+                cur.entries[0].tokens_text.insert(cur.entries[0].tokens_text.begin(), llama_vocab_bos(vocab));
+            } else {
+                // create a new text chunk with BOS token at the beginning
+                mtmd_input_chunk bos_chunk{
+                    MTMD_INPUT_CHUNK_TYPE_TEXT,
+                    {llama_vocab_bos(vocab)},
+                    nullptr, // image tokens
+                    nullptr, // audio tokens
+                };
+                cur.entries.insert(cur.entries.begin(), std::move(bos_chunk));
+            }
+        }
+
+        if (add_special && llama_vocab_get_add_eos(vocab)) {
+            // if last chunk is text, we add EOS token to it
+            add_text({llama_vocab_eos(vocab)});
+        }
+
+        if (i_bm != bitmaps.size()) {
+            LOG_ERR("%s: error: number of bitmaps (%zu) does not match number of markers (%zu)\n",
+                    __func__, bitmaps.size(), n_markers);
+            return 1;
+        }
+
+        *output = std::move(cur);
+
+        return 0;
+    }
+
+    // tokenize txt (without adding special tokens) and append the result as text
+    void add_text(const std::string & txt, bool parse_special) {
+        LOG_DBG("%s: %s\n", __func__, txt.c_str());
+        auto tokens = mtmd_tokenize_text_internal(vocab, txt, /* add_special */ false, parse_special);
+        add_text(tokens);
+    }
+
+    // append tokens as text; an empty list is ignored
+    void add_text(const std::vector<llama_token> & tokens) {
+        if (tokens.empty()) {
+            return;
+        }
+        // if last entry is also a text chunk, add tokens to it instead of creating new chunk
+        if (!cur.entries.empty() && cur.entries.back().type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
+            cur.entries.back().tokens_text.insert(
+                cur.entries.back().tokens_text.end(),
+                tokens.begin(),
+                tokens.end());
+        } else {
+            mtmd_input_chunk chunk{
+                MTMD_INPUT_CHUNK_TYPE_TEXT,
+                tokens,
+                nullptr, // image tokens
+                nullptr, // audio tokens
+            };
+            cur.entries.emplace_back(std::move(chunk));
+        }
+    }
+
+    // Preprocess one bitmap (image or audio) and append the resulting chunk(s),
+    // surrounded by the model-specific begin/end markers.
+    // Returns 0 on success and 2 on any failure (unsupported modality, bad or empty data,
+    // preprocessing error).
+    int32_t add_media(const mtmd_bitmap * bitmap) {
+        if (!bitmap->is_audio) {
+            // handle image
+
+            if (!ctx->ctx_v) {
+                LOG_ERR("%s: error: model does not support vision input\n", __func__);
+                return 2;
+            }
+
+            // the pixel buffer must hold exactly nx * ny RGB triplets; computed in size_t
+            // so that large images do not overflow narrower arithmetic
+            const size_t n_bytes_expected = (size_t) bitmap->nx * bitmap->ny * 3;
+            if (bitmap->data.size() != n_bytes_expected) {
+                LOG_ERR("%s: error: image data size (%zu) does not match nx * ny * 3 (%zu)\n",
+                        __func__, bitmap->data.size(), n_bytes_expected);
+                return 2;
+            }
+
+            if (!ctx->img_beg.empty()) {
+                add_text(ctx->img_beg, true); // add image begin token
+            }
+
+            // convert mtmd_bitmap to clip_image_u8
+            clip_image_u8_ptr img_u8(clip_image_u8_init());
+            img_u8->nx = bitmap->nx;
+            img_u8->ny = bitmap->ny;
+            img_u8->buf.resize(bitmap->data.size());
+            if (!bitmap->data.empty()) {
+                // copy exactly as many bytes as the destination was sized for
+                std::memcpy(img_u8->buf.data(), bitmap->data.data(), bitmap->data.size());
+            }
+
+            // preprocess image
+            clip_image_f32_batch batch_f32;
+            bool ok = clip_image_preprocess(ctx->ctx_v, img_u8.get(), &batch_f32);
+            if (!ok) {
+                LOG_ERR("Unable to preprocess image\n");
+                return 2;
+            }
+
+            // handle llava-uhd style preprocessing
+            const bool has_tiling_grid = batch_f32.grid_x > 0 && batch_f32.grid_y > 0;
+            if (
+                ctx->slice_tmpl == MTMD_SLICE_TMPL_MINICPMV_2_5
+                || ctx->slice_tmpl == MTMD_SLICE_TMPL_MINICPMV_2_6
+                || ctx->slice_tmpl == MTMD_SLICE_TMPL_LLAMA4
+                || ctx->slice_tmpl == MTMD_SLICE_TMPL_IDEFICS3
+                || (ctx->slice_tmpl == MTMD_SLICE_TMPL_LFM2 && has_tiling_grid)
+            ) {
+                const int n_col = batch_f32.grid_x;
+                const int n_row = batch_f32.grid_y;
+                // split batch into chunks of single images
+                // NOTE: batch_f32 will be invalidated after this call
+                auto chunks = split_batch_to_chunk(std::move(batch_f32), bitmap->id);
+                GGML_ASSERT(chunks.size() > 0);
+
+                // the first entry is treated as the overview image, the rest are slices
+                auto ov_chunk = std::move(chunks.front());
+                chunks.erase(chunks.begin());
+
+                // add overview image (first)
+                if (ctx->ov_img_first) {
+                    add_text(ctx->tok_ov_img_start);
+                    cur.entries.emplace_back(std::move(ov_chunk));
+                    add_text(ctx->tok_ov_img_end);
+                }
+
+                // add slices (or tiles)
+                if (!chunks.empty()) {
+                    GGML_ASSERT((int)chunks.size() == n_row * n_col);
+                    add_text(ctx->tok_slices_start);
+                    for (int y = 0; y < n_row; y++) {
+                        for (int x = 0; x < n_col; x++) {
+                            const bool is_last_in_row = (x == n_col - 1);
+                            if (!ctx->tok_sli_img_start.empty()) {
+                                add_text(ctx->tok_sli_img_start);
+                            } else if (!ctx->sli_img_start_tmpl.empty()) {
+                                // If using a template to precede a slice image
+                                // (row / column numbers in the template are 1-based)
+                                const size_t sz = std::snprintf(nullptr, 0, ctx->sli_img_start_tmpl.c_str(), y+1, x+1) + 1;
+                                std::unique_ptr<char[]> buf(new char[sz]);
+                                std::snprintf(buf.get(), sz, ctx->sli_img_start_tmpl.c_str(), y+1, x+1);
+                                add_text(std::string(buf.get(), buf.get() + sz - 1), true);
+                            }
+                            cur.entries.emplace_back(std::move(chunks[y * n_col + x]));
+                            add_text(ctx->tok_sli_img_end);
+                            if (!is_last_in_row) {
+                                add_text(ctx->tok_sli_img_mid);
+                            }
+                        }
+                        if ((y != n_row - 1 || ctx->tok_row_end_trail)) {
+                            add_text(ctx->tok_row_end);
+                        }
+                    }
+                    add_text(ctx->tok_slices_end);
+                }
+
+                // add overview image (last)
+                if (!ctx->ov_img_first) {
+                    add_text(ctx->tok_ov_img_start);
+                    cur.entries.emplace_back(std::move(ov_chunk));
+                    add_text(ctx->tok_ov_img_end);
+                }
+
+            } else {
+                size_t n_tokens = 0;
+                for (const auto & entry : batch_f32.entries) {
+                    n_tokens += clip_n_output_tokens(ctx->ctx_v, entry.get());
+                }
+
+                mtmd_image_tokens_ptr image_tokens(new mtmd_image_tokens);
+                if (mtmd_decode_use_mrope(ctx)) {
+                    // for Qwen2VL, we need this information for M-RoPE decoding positions
+                    image_tokens->nx = clip_n_output_tokens_x(ctx->ctx_v, batch_f32.entries[0].get());
+                    image_tokens->ny = clip_n_output_tokens_y(ctx->ctx_v, batch_f32.entries[0].get());
+                    image_tokens->use_mrope_pos = true;
+                } else {
+                    // other models, we only need the total number of tokens
+                    image_tokens->nx = n_tokens;
+                    image_tokens->ny = 1;
+                }
+                image_tokens->batch_f32 = std::move(batch_f32);
+                image_tokens->id = bitmap->id; // optional
+
+                LOG_DBG("image_tokens->nx = %d\n", image_tokens->nx);
+                LOG_DBG("image_tokens->ny = %d\n", image_tokens->ny);
+                LOG_DBG("batch_f32 size = %d\n", (int)image_tokens->batch_f32.entries.size());
+
+                mtmd_input_chunk chunk{
+                    MTMD_INPUT_CHUNK_TYPE_IMAGE,
+                    {}, // text tokens
+                    std::move(image_tokens),
+                    nullptr, // audio tokens
+                };
+                cur.entries.emplace_back(std::move(chunk));
+            }
+
+            if (!ctx->img_end.empty()) {
+                add_text(ctx->img_end, true); // add image end token
+            }
+
+        } else {
+            // handle audio
+
+            if (!ctx->ctx_a) {
+                LOG_ERR("%s: error: model does not support audio input\n", __func__);
+                return 2;
+            }
+
+            if (bitmap->data.size() == 0) {
+                LOG_ERR("%s: error: empty audio data\n", __func__);
+                return 2;
+            }
+
+            if (!ctx->aud_beg.empty()) {
+                add_text(ctx->aud_beg, true); // add audio begin token
+            }
+
+            // preprocess audio
+            // the byte buffer holds float samples (see mtmd_bitmap_init_from_audio)
+            std::vector<mtmd_audio_mel> mel_spec_chunks;
+            const float * samples = (const float *)bitmap->data.data();
+            size_t n_samples = bitmap->data.size() / sizeof(float);
+            bool ok = ctx->audio_preproc->preprocess(samples, n_samples, mel_spec_chunks);
+            if (!ok) {
+                LOG_ERR("Unable to preprocess audio\n");
+                return 2;
+            }
+
+            // consider each mel_spec as a separate audio chunk
+            // TODO: maybe support batching, but this may come with memory cost
+            for (auto & mel_spec : mel_spec_chunks) {
+                clip_image_f32_ptr mel_f32(clip_image_f32_init());
+                mel_f32->nx = mel_spec.n_len;
+                mel_f32->ny = mel_spec.n_mel;
+                mel_f32->buf = std::move(mel_spec.data);
+                size_t n_tokens = clip_n_output_tokens(ctx->ctx_a, mel_f32.get());
+
+                clip_image_f32_batch batch_f32;
+                batch_f32.is_audio = true;
+                batch_f32.entries.push_back(std::move(mel_f32));
+
+                mtmd_audio_tokens_ptr audio_tokens(new mtmd_audio_tokens);
+                audio_tokens->n_tokens = n_tokens;
+                audio_tokens->batch_f32 = std::move(batch_f32);
+                audio_tokens->id = bitmap->id; // optional
+
+                LOG_DBG("audio_tokens->n_tokens = %d\n", audio_tokens->n_tokens);
+
+                mtmd_input_chunk chunk{
+                    MTMD_INPUT_CHUNK_TYPE_AUDIO,
+                    {}, // text tokens
+                    nullptr, // image tokens
+                    std::move(audio_tokens),
+                };
+                cur.entries.emplace_back(std::move(chunk));
+            }
+
+            if (!ctx->aud_end.empty()) {
+                add_text(ctx->aud_end, true); // add audio end token
+            }
+        }
+
+        return 0;
+    }
+
+    // Turn every entry of the batch into its own single-entry image chunk (entries are moved out).
+    std::vector<mtmd_input_chunk> split_batch_to_chunk(clip_image_f32_batch && batch_f32, const std::string & id) {
+        std::vector<mtmd_input_chunk> chunks;
+
+        for (auto & entry : batch_f32.entries) {
+            mtmd_image_tokens_ptr image_tokens(new mtmd_image_tokens);
+            image_tokens->nx = clip_n_output_tokens(ctx->ctx_v, entry.get());
+            image_tokens->ny = 1;
+            image_tokens->batch_f32.entries.push_back(std::move(entry));
+            image_tokens->id = id;
+
+            mtmd_input_chunk chunk{
+                MTMD_INPUT_CHUNK_TYPE_IMAGE,
+                {}, // text tokens
+                std::move(image_tokens),
+                nullptr, // audio tokens
+            };
+            chunks.emplace_back(std::move(chunk));
+        }
+
+        return chunks;
+    }
+
+    // for example: "a <__media__> b <__media__> c" --> "a", "<__media__>", "b", "<__media__>", "c"
+    // empty text parts (between adjacent markers, or at either end) are not emitted
+    static std::vector<std::string> split_text(const std::string & input, const std::string & delimiter) {
+        std::vector<std::string> result;
+        if (input.empty()) {
+            return result;
+        }
+        size_t start = 0;
+        size_t pos = 0;
+        while ((pos = input.find(delimiter, start)) != std::string::npos) {
+            if (pos > start) {
+                result.push_back(input.substr(start, pos - start));
+            }
+            result.push_back(delimiter);
+            start = pos + delimiter.length();
+        }
+        if (start < input.length()) {
+            result.push_back(input.substr(start));
+        }
+        return result;
+    }
+
+    // copied from common_tokenize
+    // a negative result from llama_tokenize is treated as the required token count and triggers a retry
+    static std::vector<llama_token> mtmd_tokenize_text_internal(
+            const struct llama_vocab * vocab,
+            const std::string & text,
+            bool add_special,
+            bool parse_special) {
+        // upper limit for the number of tokens
+        int n_tokens = text.length() + 2 * add_special;
+        std::vector<llama_token> result(n_tokens);
+        n_tokens = llama_tokenize(vocab, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
+        if (n_tokens < 0) {
+            result.resize(-n_tokens);
+            int check = llama_tokenize(vocab, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
+            GGML_ASSERT(check == -n_tokens);
+        } else {
+            result.resize(n_tokens);
+        }
+        return result;
+    }
+};
+
+// Public entry point: split the text at media markers and pair each marker with a bitmap.
+// Returns 0 on success, otherwise the error code produced by the tokenizer.
+int32_t mtmd_tokenize(mtmd_context * ctx,
+        mtmd_input_chunks * output,
+        const mtmd_input_text * text,
+        const mtmd_bitmap ** bitmaps,
+        size_t n_bitmaps) {
+    mtmd_tokenizer worker(ctx, text, bitmaps, n_bitmaps);
+    const int32_t status = worker.tokenize(output);
+    return status;
+}
+
+// Encode one chunk into the context's embedding buffer.
+// Text chunks are a no-op (returns 0 with a warning). Returns 0 on success, 1 on failure,
+// including when the model lacks the required modality or the chunk type is unknown.
+int32_t mtmd_encode_chunk(mtmd_context * ctx, const mtmd_input_chunk * chunk) {
+    switch (chunk->type) {
+        case MTMD_INPUT_CHUNK_TYPE_TEXT:
+            LOG_WRN("mtmd_encode_chunk has no effect for text chunks\n");
+            return 0;
+
+        case MTMD_INPUT_CHUNK_TYPE_IMAGE:
+            if (ctx->ctx_v == nullptr) {
+                LOG_ERR("%s: model does not support vision input\n", __func__);
+                return 1;
+            }
+            return mtmd_encode(ctx, chunk->tokens_image.get());
+
+        case MTMD_INPUT_CHUNK_TYPE_AUDIO:
+            {
+                if (ctx->ctx_a == nullptr) {
+                    LOG_ERR("%s: model does not support audio input\n", __func__);
+                    return 1;
+                }
+                // the output buffer is sized with the text model's embedding width
+                const int n_mmproj_embd = ctx->n_embd_text;
+                ctx->image_embd_v.resize(chunk->tokens_audio->n_tokens * n_mmproj_embd);
+                const bool encoded = clip_image_batch_encode(
+                    ctx->ctx_a,
+                    ctx->n_threads,
+                    &chunk->tokens_audio->batch_f32,
+                    ctx->image_embd_v.data());
+                return encoded ? 0 : 1;
+            }
+
+        default:
+            break;
+    }
+
+    LOG_ERR("%s: unknown chunk type %d\n", __func__, (int)chunk->type);
+    return 1;
+}
+
+// Encode the image patches of image_tokens into the context's embedding buffer
+// (retrievable through mtmd_get_output_embd()).
+// Returns 0 on success, 1 when the context has no vision support or encoding fails.
+int32_t mtmd_encode(mtmd_context * ctx, const mtmd_image_tokens * image_tokens) {
+    clip_ctx * ctx_clip = ctx->ctx_v;
+    if (!ctx_clip) {
+        LOG_ERR("%s: this API does not support non-vision input, please use mtmd_encode_chunk instead\n", __func__);
+        return 1;
+    }
+    int n_mmproj_embd = clip_n_mmproj_embd(ctx_clip);
+    ctx->image_embd_v.resize(image_tokens->n_tokens() * n_mmproj_embd);
+    bool ok = false;
+
+    if (clip_is_llava(ctx_clip)
+            || clip_is_minicpmv(ctx_clip)
+            || clip_is_glm(ctx_clip)) {
+        // TODO @ngxson : llava does not support batched encoding ; this should be fixed inside clip_image_batch_encode()
+        const auto & entries = image_tokens->batch_f32.entries;
+        // running write position, so entries with different token counts are laid out back to back
+        size_t embd_offset = 0;
+        for (size_t i = 0; i < entries.size(); i++) {
+            const int n_tokens_per_image = clip_n_output_tokens(ctx_clip, entries[i].get());
+            ok = clip_image_encode(
+                ctx_clip,
+                ctx->n_threads,
+                entries[i].get(),
+                ctx->image_embd_v.data() + embd_offset);
+            if (!ok) {
+                // stop at the first failure; otherwise a later success would hide it
+                break;
+            }
+            embd_offset += (size_t) n_mmproj_embd * n_tokens_per_image;
+        }
+    } else {
+        ok = clip_image_batch_encode(
+            ctx_clip,
+            ctx->n_threads,
+            &image_tokens->batch_f32,
+            ctx->image_embd_v.data());
+    }
+
+    return ok ? 0 : 1;
+}
+
+// Pointer to the embedding buffer written by the last encode call.
+// The buffer is owned by ctx and is resized by every encode call.
+float * mtmd_get_output_embd(mtmd_context * ctx) {
+    std::vector<float> & embd = ctx->image_embd_v;
+    return embd.data();
+}
+
+// True only when the vision projector is PROJECTOR_TYPE_GEMMA3.
+bool mtmd_decode_use_non_causal(mtmd_context * ctx) {
+    const projector_type proj = ctx->proj_type_v();
+    return proj == PROJECTOR_TYPE_GEMMA3;
+}
+
+// True when the vision projector is one of the types that use M-RoPE positions
+// (QWEN2VL, QWEN25VL, QWEN3VL, GLM4V).
+bool mtmd_decode_use_mrope(mtmd_context * ctx) {
+    const projector_type proj = ctx->proj_type_v();
+    return proj == PROJECTOR_TYPE_QWEN2VL
+        || proj == PROJECTOR_TYPE_QWEN25VL
+        || proj == PROJECTOR_TYPE_QWEN3VL
+        || proj == PROJECTOR_TYPE_GLM4V;
+}
+
+// True when a vision context was loaded.
+bool mtmd_support_vision(mtmd_context * ctx) {
+    const bool has_vision = (ctx->ctx_v != nullptr);
+    return has_vision;
+}
+
+// True when an audio context was loaded.
+bool mtmd_support_audio(mtmd_context * ctx) {
+    const bool has_audio = (ctx->ctx_a != nullptr);
+    return has_audio;
+}
+
+// Returns the audio_sample_rate hyperparameter of the audio context,
+// or -1 when the model has no audio support.
+int mtmd_get_audio_bitrate(mtmd_context * ctx) {
+    return ctx->ctx_a
+        ? clip_get_hparams(ctx->ctx_a)->audio_sample_rate
+        : -1;
+}
+
+//
+// public API functions
+//
+
+// mtmd_bitmap
+
+// Create an image bitmap by copying nx * ny * 3 bytes from data.
+// The caller owns the result and releases it with mtmd_bitmap_free().
+mtmd_bitmap * mtmd_bitmap_init(uint32_t nx,
+        uint32_t ny,
+        const unsigned char * data) {
+    // widen before multiplying so the byte count is computed in size_t
+    const size_t n_bytes = (size_t)nx * ny * 3;
+
+    mtmd_bitmap * result = new mtmd_bitmap;
+    result->nx = nx;
+    result->ny = ny;
+    result->data.resize(n_bytes);
+    std::memcpy(result->data.data(), data, n_bytes);
+    return result;
+}
+
+// Create an audio "bitmap" by copying n_samples floats from data into the byte buffer.
+// nx is set to the sample count and ny to 1.
+// The caller owns the result and releases it with mtmd_bitmap_free().
+mtmd_bitmap * mtmd_bitmap_init_from_audio(size_t n_samples,
+        const float * data) {
+    const size_t n_bytes = n_samples * sizeof(float);
+
+    mtmd_bitmap * result = new mtmd_bitmap;
+    result->is_audio = true;
+    result->nx = n_samples;
+    result->ny = 1;
+    result->data.resize(n_bytes);
+    std::memcpy(result->data.data(), data, n_bytes);
+    return result;
+}
+
+// Accessor for the bitmap's nx field.
+uint32_t mtmd_bitmap_get_nx(const mtmd_bitmap * bitmap) {
+    const mtmd_bitmap & bm = *bitmap;
+    return bm.nx;
+}
+
+// Accessor for the bitmap's ny field.
+uint32_t mtmd_bitmap_get_ny(const mtmd_bitmap * bitmap) {
+    const mtmd_bitmap & bm = *bitmap;
+    return bm.ny;
+}
+
+// Pointer to the bitmap's raw byte buffer; the bitmap keeps ownership.
+const unsigned char * mtmd_bitmap_get_data(const mtmd_bitmap * bitmap) {
+    const std::vector<unsigned char> & bytes = bitmap->data;
+    return bytes.data();
+}
+
+// Size of the bitmap's raw byte buffer.
+size_t mtmd_bitmap_get_n_bytes(const mtmd_bitmap * bitmap) {
+    const std::vector<unsigned char> & bytes = bitmap->data;
+    return bytes.size();
+}
+
+// True when the bitmap was created as audio.
+bool mtmd_bitmap_is_audio(const mtmd_bitmap * bitmap) {
+    const mtmd_bitmap & bm = *bitmap;
+    return bm.is_audio;
+}
+
+// The bitmap's id as a C string (empty when no id was set).
+// The pointer refers to the bitmap's own string storage.
+const char * mtmd_bitmap_get_id(const mtmd_bitmap * bitmap) {
+    const std::string & ident = bitmap->id;
+    return ident.c_str();
+}
+
+// Set the bitmap's id to a copy of `id`; a null pointer clears the id.
+void mtmd_bitmap_set_id(mtmd_bitmap * bitmap, const char * id) {
+    if (id == nullptr) {
+        bitmap->id.clear();
+        return;
+    }
+    bitmap->id = std::string(id);
+}
+
+// Release a bitmap; passing nullptr is a no-op.
+void mtmd_bitmap_free(mtmd_bitmap * bitmap) {
+    if (bitmap == nullptr) {
+        return;
+    }
+    delete bitmap;
+}
+
+// mtmd_input_chunks
+
+// Allocate an empty chunk list; release it with mtmd_input_chunks_free().
+mtmd_input_chunks * mtmd_input_chunks_init() {
+    mtmd_input_chunks * created = new mtmd_input_chunks;
+    return created;
+}
+
+// Number of chunks in the list.
+size_t mtmd_input_chunks_size(const mtmd_input_chunks * chunks) {
+    const auto & list = chunks->entries;
+    return list.size();
+}
+
+// Chunk at position idx, or nullptr when idx is out of range.
+// The returned pointer refers to an element owned by `chunks`.
+const mtmd_input_chunk * mtmd_input_chunks_get(const mtmd_input_chunks * chunks, size_t idx) {
+    const auto & list = chunks->entries;
+    return idx < list.size() ? &list[idx] : nullptr;
+}
+
+// Release a chunk list; passing nullptr is a no-op.
+void mtmd_input_chunks_free(mtmd_input_chunks * chunks) {
+    if (chunks == nullptr) {
+        return;
+    }
+    delete chunks;
+}
+
+// mtmd_input_chunk
+
+// Kind of payload carried by the chunk.
+enum mtmd_input_chunk_type mtmd_input_chunk_get_type(const mtmd_input_chunk * chunk) {
+    const mtmd_input_chunk & c = *chunk;
+    return c.type;
+}
+
+// For a text chunk: returns its token array and stores the token count in *n_tokens_output.
+// For any other chunk type: returns nullptr and stores 0.
+const llama_token * mtmd_input_chunk_get_tokens_text(const mtmd_input_chunk * chunk, size_t * n_tokens_output) {
+    if (chunk->type != MTMD_INPUT_CHUNK_TYPE_TEXT) {
+        *n_tokens_output = 0;
+        return nullptr;
+    }
+    const auto & text_tokens = chunk->tokens_text;
+    *n_tokens_output = text_tokens.size();
+    return text_tokens.data();
+}
+
+// Image tokens of an image chunk, nullptr for every other chunk type.
+const mtmd_image_tokens * mtmd_input_chunk_get_tokens_image(const mtmd_input_chunk * chunk) {
+    const bool is_image = (chunk->type == MTMD_INPUT_CHUNK_TYPE_IMAGE);
+    return is_image ? chunk->tokens_image.get() : nullptr;
+}
+
+// Number of tokens occupied by the chunk; aborts on an invalid chunk type.
+size_t mtmd_input_chunk_get_n_tokens(const mtmd_input_chunk * chunk) {
+    switch (chunk->type) {
+        case MTMD_INPUT_CHUNK_TYPE_TEXT:
+            return chunk->tokens_text.size();
+        case MTMD_INPUT_CHUNK_TYPE_IMAGE:
+            return mtmd_image_tokens_get_n_tokens(chunk->tokens_image.get());
+        case MTMD_INPUT_CHUNK_TYPE_AUDIO:
+            return chunk->tokens_audio->n_tokens;
+        default:
+            GGML_ABORT("invalid chunk type");
+    }
+}
+
+// Number of positions the chunk advances; for images this defers to
+// mtmd_image_tokens_get_n_pos(). Aborts on an invalid chunk type.
+llama_pos mtmd_input_chunk_get_n_pos(const mtmd_input_chunk * chunk) {
+    switch (chunk->type) {
+        case MTMD_INPUT_CHUNK_TYPE_TEXT:
+            return chunk->tokens_text.size();
+        case MTMD_INPUT_CHUNK_TYPE_IMAGE:
+            return mtmd_image_tokens_get_n_pos(chunk->tokens_image.get());
+        case MTMD_INPUT_CHUNK_TYPE_AUDIO:
+            return chunk->tokens_audio->n_tokens;
+        default:
+            GGML_ABORT("invalid chunk type");
+    }
+}
+
+// Id of the media item behind an image or audio chunk; nullptr for other chunk types.
+const char * mtmd_input_chunk_get_id(const mtmd_input_chunk * chunk) {
+    switch (chunk->type) {
+        case MTMD_INPUT_CHUNK_TYPE_IMAGE:
+            return chunk->tokens_image->id.c_str();
+        case MTMD_INPUT_CHUNK_TYPE_AUDIO:
+            return chunk->tokens_audio->id.c_str();
+        default:
+            return nullptr;
+    }
+}
+
+// Create an independent copy of a chunk: text tokens are copied, image / audio payloads
+// are duplicated through their clone() methods.
+// The caller owns the result and releases it with mtmd_input_chunk_free().
+mtmd_input_chunk * mtmd_input_chunk_copy(const mtmd_input_chunk * chunk) {
+    mtmd_input_chunk * dup = new mtmd_input_chunk{ chunk->type, chunk->tokens_text, nullptr, nullptr };
+
+    // copy the image tokens
+    if (chunk->tokens_image != nullptr) {
+        dup->tokens_image.reset(new mtmd_image_tokens());
+        *dup->tokens_image = chunk->tokens_image->clone();
+    }
+
+    // copy the audio tokens
+    if (chunk->tokens_audio != nullptr) {
+        dup->tokens_audio.reset(new mtmd_audio_tokens());
+        *dup->tokens_audio = chunk->tokens_audio->clone();
+    }
+
+    return dup;
+}
+
+// Release a chunk obtained from mtmd_input_chunk_copy(); passing nullptr is a no-op.
+void mtmd_input_chunk_free(mtmd_input_chunk * chunk) {
+    if (chunk == nullptr) {
+        return;
+    }
+    delete chunk;
+}
+
+// mtmd_image_tokens
+
+// Total token count of the image (nx * ny).
+size_t mtmd_image_tokens_get_n_tokens(const mtmd_image_tokens * image_tokens) {
+    const mtmd_image_tokens & tokens = *image_tokens;
+    return tokens.n_tokens();
+}
+
+// Number of tokens in x direction.
+size_t mtmd_image_tokens_get_nx(const mtmd_image_tokens * image_tokens) {
+    const mtmd_image_tokens & tokens = *image_tokens;
+    return tokens.nx;
+}
+
+// Number of tokens in y direction.
+size_t mtmd_image_tokens_get_ny(const mtmd_image_tokens * image_tokens) {
+    const mtmd_image_tokens & tokens = *image_tokens;
+    return tokens.ny;
+}
+
+// The image's id as a C string (empty when no id was set).
+// The pointer refers to storage owned by image_tokens.
+const char * mtmd_image_tokens_get_id(const mtmd_image_tokens * image_tokens) {
+    const std::string & ident = image_tokens->id;
+    return ident.c_str();
+}
+
+// Number of positions the image advances.
+// Without M-RoPE this is the token count; with M-RoPE it is max(nx, ny).
+llama_pos mtmd_image_tokens_get_n_pos(const mtmd_image_tokens * image_tokens) {
+    if (!image_tokens->use_mrope_pos) {
+        return image_tokens->n_tokens();
+    }
+    // for M-RoPE, temporal dimension = max(t,h,w)
+    // t is omitted as we don't support video input
+    return std::max(image_tokens->nx, image_tokens->ny);
+}
+
+// test function
+
+// Test helper: builds a fixed chunk list made of one text chunk (tokens 1..5)
+// followed by one 4x4 image chunk with id "image_1" and 16 empty batch entries.
+// The caller owns the result and releases it with mtmd_input_chunks_free().
+mtmd_input_chunks * mtmd_test_create_input_chunks() {
+    mtmd_input_chunks * result = mtmd_input_chunks_init();
+    if (result == nullptr) {
+        return nullptr;
+    }
+
+    // text chunk
+    {
+        std::vector<llama_token> text_ids = { 1, 2, 3, 4, 5 };
+        mtmd_input_chunk text_chunk{
+            MTMD_INPUT_CHUNK_TYPE_TEXT,
+            std::move(text_ids),
+            nullptr, // image tokens
+            nullptr, // audio tokens
+        };
+        result->entries.emplace_back(std::move(text_chunk));
+    }
+
+    // image chunk
+    {
+        mtmd_image_tokens_ptr img(new mtmd_image_tokens);
+        img->nx = 4;
+        img->ny = 4;
+        img->batch_f32.entries.resize(16);
+        img->id = "image_1";
+        mtmd_input_chunk image_chunk{
+            MTMD_INPUT_CHUNK_TYPE_IMAGE,
+            {}, // text tokens
+            std::move(img),
+            nullptr, // audio tokens
+        };
+        result->entries.emplace_back(std::move(image_chunk));
+    }
+
+    return result;
+}
+
+// Install a log callback and its user data in the shared logger state.
+// A null callback selects clip_log_callback_default.
+void mtmd_log_set(ggml_log_callback log_callback, void * user_data) {
+    if (log_callback) {
+        g_logger_state.log_callback = log_callback;
+    } else {
+        g_logger_state.log_callback = clip_log_callback_default;
+    }
+    g_logger_state.log_callback_user_data = user_data;
+}