1#include "clip.h"
   2#include "clip-impl.h"
   3#include "mtmd.h"
   4#include "mtmd-audio.h"
   5
   6#include "llama.h"
   7
   8// fix problem with std::min and std::max
   9#if defined(_WIN32)
  10#define WIN32_LEAN_AND_MEAN
  11#ifndef NOMINMAX
  12#   define NOMINMAX
  13#endif
  14#include <windows.h>
  15#endif
  16
  17#include <algorithm>
  18#include <cerrno>
  19#include <cstdio>
  20#include <cstdlib>
  21#include <cstring>
  22#include <vector>
  23
  24// represents raw image data, layout is RGBRGBRGB...
  25// length of data must be nx * ny * 3
  26struct mtmd_bitmap {
  27    uint32_t nx;
  28    uint32_t ny;
  29    std::vector<unsigned char> data;
  30    std::string id; // optional user-defined id, for ex: can be set to image hash, useful for KV cache tracking
  31    bool is_audio = false; // true if the bitmap is audio
  32};
  33
  34struct mtmd_image_tokens {
  35    uint32_t nx; // number of tokens in x direction
  36    uint32_t ny; // number of tokens in y direction
  37    bool use_mrope_pos = false; // use M-RoPE position counting (the whole image is 1 temporal position)
  38    uint32_t n_tokens() const { return nx * ny; }
  39    clip_image_f32_batch batch_f32; // preprocessed image patches
  40    std::string id; // optional user-defined ID, useful for KV cache tracking
  41
  42    mtmd_image_tokens clone() {
  43        return mtmd_image_tokens{
  44            nx,
  45            ny,
  46            use_mrope_pos,
  47            batch_f32.clone(),
  48            id
  49        };
  50    }
  51};
  52using mtmd_image_tokens_ptr = std::unique_ptr<mtmd_image_tokens>;
  53
  54struct mtmd_audio_tokens {
  55    uint32_t n_tokens; // number of tokens
  56    clip_image_f32_batch batch_f32; // preprocessed image patches
  57    std::string id; // optional user-defined ID, useful for KV cache tracking
  58
  59    mtmd_audio_tokens clone() {
  60        return mtmd_audio_tokens{
  61            n_tokens,
  62            batch_f32.clone(),
  63            id
  64        };
  65    }
  66};
  67using mtmd_audio_tokens_ptr = std::unique_ptr<mtmd_audio_tokens>;
  68
// One element of a tokenized prompt.
// `type` tells which payload is in use: the tokenizer fills tokens_text for text chunks,
// tokens_image for image chunks and tokens_audio for audio chunks, leaving the others empty/null.
struct mtmd_input_chunk {
    mtmd_input_chunk_type type;
    std::vector<llama_token> tokens_text;
    mtmd_image_tokens_ptr tokens_image;
    mtmd_audio_tokens_ptr tokens_audio;
};
  75
// Ordered list of chunks produced by tokenizing one prompt (text and media interleaved).
struct mtmd_input_chunks {
    std::vector<mtmd_input_chunk> entries;
};
  79
// slice template, used by some llava-uhd models to correctly place the special tokens around image embeddings
// models not having it (llava-1.6) will process embeddings without any special tokens in-between
enum mtmd_slice_tmpl {
    MTMD_SLICE_TMPL_NONE,         // no slice template (default)
    MTMD_SLICE_TMPL_MINICPMV_2_5, // <image> (overview) </image><slice><image> (slice) </image> ... </slice>
    MTMD_SLICE_TMPL_MINICPMV_2_6, // <image> (overview) </image><slice> (slice) </slice> ...
    MTMD_SLICE_TMPL_LLAMA4,       // tiles separated by <|tile_x_separator|> / <|tile_y_separator|>, overview image last
    MTMD_SLICE_TMPL_IDEFICS3,     // slices prefixed by <fake_token_around_image><row_R_col_C>
    MTMD_SLICE_TMPL_LFM2,         // tiles prefixed by <|img_row_R_col_C|>, thumbnail last
};
  90
  91const char * mtmd_default_marker() {
  92    return "<__media__>";
  93}
  94
  95static clip_flash_attn_type mtmd_get_clip_flash_attn_type(enum llama_flash_attn_type flash_attn_type) {
  96    switch (flash_attn_type) {
  97        case LLAMA_FLASH_ATTN_TYPE_AUTO:     return CLIP_FLASH_ATTN_TYPE_AUTO;
  98        case LLAMA_FLASH_ATTN_TYPE_DISABLED: return CLIP_FLASH_ATTN_TYPE_DISABLED;
  99        case LLAMA_FLASH_ATTN_TYPE_ENABLED:  return CLIP_FLASH_ATTN_TYPE_ENABLED;
 100    }
 101    return CLIP_FLASH_ATTN_TYPE_AUTO;
 102}
 103
 104mtmd_context_params mtmd_context_params_default() {
 105    mtmd_context_params params {
 106        /* use_gpu           */ true,
 107        /* print_timings     */ true,
 108        /* n_threads         */ 4,
 109        /* image_marker      */ MTMD_DEFAULT_IMAGE_MARKER,
 110        /* media_marker      */ mtmd_default_marker(),
 111        /* flash_attn_type   */ LLAMA_FLASH_ATTN_TYPE_AUTO,
 112        /* warmup            */ true,
 113        /* image_min_tokens  */ -1,
 114        /* image_max_tokens  */ -1,
 115        /* cb_eval           */ nullptr,
 116        /* cb_eval_user_data */ nullptr,
 117    };
 118    return params;
 119}
 120
 121struct mtmd_context {
 122    struct clip_ctx * ctx_v; // vision
 123    struct clip_ctx * ctx_a; // audio
 124    const struct llama_model * text_model;
 125    std::vector<float> image_embd_v; // image embedding vector
 126
 127    bool print_timings;
 128    int n_threads;
 129    std::string media_marker;
 130    const int n_embd_text;
 131
 132    // these are not token, but strings used to mark the beginning and end of image/audio embeddings
 133    std::string img_beg;
 134    std::string img_end;
 135    std::string aud_beg;
 136    std::string aud_end;
 137
 138    // for llava-uhd style models, we need special tokens in-between slices
 139    // minicpmv calls them "slices", llama 4 calls them "tiles"
 140    mtmd_slice_tmpl slice_tmpl    = MTMD_SLICE_TMPL_NONE;
 141    std::vector<llama_token> tok_ov_img_start;  // overview image
 142    std::vector<llama_token> tok_ov_img_end;    // overview image
 143    std::vector<llama_token> tok_slices_start;  // start of all slices
 144    std::vector<llama_token> tok_slices_end;    // end of all slices
 145    std::vector<llama_token> tok_sli_img_start; // single slice start
 146    std::vector<llama_token> tok_sli_img_end;   // single slice end
 147    std::vector<llama_token> tok_sli_img_mid;   // between 2 slices
 148    std::vector<llama_token> tok_row_end;       // end of row
 149    bool        tok_row_end_trail = false;
 150    bool        ov_img_first      = false;
 151
 152    // string template for slice image delimiters with row/col (idefics3)
 153    std::string sli_img_start_tmpl;
 154
 155    std::unique_ptr<mtmd_audio_preprocessor> audio_preproc;
 156
 157    // TODO @ngxson : add timings
 158
 159    mtmd_context(const char * mmproj_fname,
 160                   const llama_model * text_model,
 161                   const mtmd_context_params & ctx_params) :
 162        text_model   (text_model),
 163        print_timings(ctx_params.print_timings),
 164        n_threads    (ctx_params.n_threads),
 165        media_marker (ctx_params.media_marker),
 166        n_embd_text  (llama_model_n_embd_inp(text_model))
 167    {
 168        if (std::string(ctx_params.image_marker) != MTMD_DEFAULT_IMAGE_MARKER) {
 169            throw std::runtime_error("custom image_marker is not supported anymore, use media_marker instead");
 170        }
 171
 172        if (media_marker.empty()) {
 173            throw std::runtime_error("media_marker must not be empty");
 174        }
 175
 176        clip_context_params ctx_clip_params {
 177            /* use_gpu           */ ctx_params.use_gpu,
 178            /* flash_attn_type   */ CLIP_FLASH_ATTN_TYPE_AUTO,
 179            /* image_min_tokens  */ ctx_params.image_min_tokens,
 180            /* image_max_tokens  */ ctx_params.image_max_tokens,
 181            /* warmup            */ ctx_params.warmup,
 182            /* cb_eval           */ ctx_params.cb_eval,
 183            /* cb_eval_user_data */ ctx_params.cb_eval_user_data,
 184        };
 185
 186        auto res = clip_init(mmproj_fname, ctx_clip_params);
 187        ctx_v = res.ctx_v;
 188        ctx_a = res.ctx_a;
 189        if (!ctx_v && !ctx_a) {
 190            throw std::runtime_error(string_format("Failed to load CLIP model from %s\n", mmproj_fname));
 191        }
 192
 193        // if both vision and audio mmproj are present, we need to validate their n_embd
 194        if (ctx_v && ctx_a) {
 195            int n_embd_v = clip_n_mmproj_embd(ctx_v);
 196            int n_embd_a = clip_n_mmproj_embd(ctx_a);
 197            if (n_embd_v != n_embd_a) {
 198                throw std::runtime_error(string_format(
 199                    "mismatch between vision and audio mmproj (n_embd_v = %d, n_embd_a = %d)\n",
 200                    n_embd_v, n_embd_a));
 201            }
 202        }
 203
 204        // since we already validate n_embd of vision and audio mmproj,
 205        // we can safely assume that they are the same
 206        int n_embd_clip = clip_n_mmproj_embd(ctx_v ? ctx_v : ctx_a);
 207        if (n_embd_text != n_embd_clip) {
 208            throw std::runtime_error(string_format(
 209                "mismatch between text model (n_embd = %d) and mmproj (n_embd = %d)\n"
 210                "hint: you may be using wrong mmproj\n",
 211                n_embd_text, n_embd_clip));
 212        }
 213        if (ctx_v) {
 214            init_vision();
 215        }
 216        if (ctx_a) {
 217            init_audio();
 218        }
 219    }
 220
 221    void init_vision() {
 222        GGML_ASSERT(ctx_v != nullptr);
 223
 224        projector_type proj = clip_get_projector_type(ctx_v);
 225        int minicpmv_version = clip_is_minicpmv(ctx_v);
 226        if (minicpmv_version == 2) {
 227            // minicpmv 2.5 format:
 228            // <image> (overview) </image><slice><image> (slice) </image><image> (slice) </image>\n ... </slice>
 229            slice_tmpl        = MTMD_SLICE_TMPL_MINICPMV_2_5;
 230            tok_ov_img_start  = {lookup_token("<image>")};
 231            tok_ov_img_end    = {lookup_token("</image>")};
 232            tok_slices_start  = {lookup_token("<slice>")};
 233            tok_slices_end    = {lookup_token("</slice>")};
 234            tok_sli_img_start = tok_ov_img_start;
 235            tok_sli_img_end   = tok_ov_img_end;
 236            tok_row_end       = {lookup_token("\n")};
 237            tok_row_end_trail = false; // no trailing end-of-row token
 238            ov_img_first      = true;
 239
 240        } else if (minicpmv_version == 3 || minicpmv_version == 4 || minicpmv_version == 5 || minicpmv_version == 6 || minicpmv_version == 100045) {
 241            // minicpmv 2.6 format:
 242            // <image> (overview) </image><slice> (slice) </slice><slice> (slice) </slice>\n ...
 243            slice_tmpl        = MTMD_SLICE_TMPL_MINICPMV_2_6;
 244            tok_ov_img_start  = {lookup_token("<image>")};
 245            tok_ov_img_end    = {lookup_token("</image>")};
 246            tok_sli_img_start = {lookup_token("<slice>")};
 247            tok_sli_img_end   = {lookup_token("</slice>")};
 248            tok_row_end       = {lookup_token("\n")};
 249            tok_row_end_trail = false; // no trailing end-of-row token
 250            ov_img_first      = true;
 251
 252        } else if (minicpmv_version != 0) {
 253            GGML_ASSERT(false && "unsupported minicpmv version");
 254        } else if (proj == PROJECTOR_TYPE_LLAMA4) {
 255            // llama 4 format:
 256            // <|image_start|>
 257            //     (slice) <|tile_x_separator|> (slice) <|tile_x_separator|> ... <|tile_y_separator|>
 258            //     (slice) <|tile_x_separator|> (slice) <|tile_x_separator|> ... <|tile_y_separator|>
 259            //     ... <|tile_y_separator|>   <-- trailing end-of-row token
 260            // <|image|> (overview)           <-- overview image is last
 261            // <|image_end|>
 262            slice_tmpl        = MTMD_SLICE_TMPL_LLAMA4;
 263            tok_ov_img_start  = {lookup_token("<|image|>")};
 264            tok_sli_img_mid   = {lookup_token("<|tile_x_separator|>")};
 265            tok_row_end       = {lookup_token("<|tile_y_separator|>")};
 266            tok_row_end_trail = true; // add trailing end-of-row token
 267            ov_img_first      = false; // overview image is last
 268        }
 269
 270        // set boi/eoi
 271        if (proj == PROJECTOR_TYPE_GEMMA3 || proj == PROJECTOR_TYPE_GEMMA3NV) {
 272            // <start_of_image> ... (image embeddings) ... <end_of_image>
 273            img_beg = "<start_of_image>";
 274            img_end = "<end_of_image>";
 275
 276        } else if (proj == PROJECTOR_TYPE_IDEFICS3) {
 277            // https://github.com/huggingface/transformers/blob/a42ba80fa520c784c8f11a973ca9034e5f859b79/src/transformers/models/idefics3/processing_idefics3.py#L192-L215
 278            slice_tmpl         = MTMD_SLICE_TMPL_IDEFICS3;
 279            tok_ov_img_start   = {lookup_token("\n\n"), lookup_token("<fake_token_around_image>"), lookup_token("<global-img>")};
 280            tok_ov_img_end     = {lookup_token("<fake_token_around_image>")};
 281            tok_row_end        = {lookup_token("\n")};
 282            sli_img_start_tmpl = "<fake_token_around_image><row_%d_col_%d>";
 283
 284        } else if (proj == PROJECTOR_TYPE_PIXTRAL) {
 285            // https://github.com/huggingface/transformers/blob/1cd110c6cb6a6237614130c470e9a902dbc1a4bd/docs/source/en/model_doc/pixtral.md
 286            img_end = "[IMG_END]";
 287
 288        } else if (proj == PROJECTOR_TYPE_QWEN2VL || proj == PROJECTOR_TYPE_QWEN25VL || proj == PROJECTOR_TYPE_QWEN3VL || proj == PROJECTOR_TYPE_YOUTUVL) {
 289            // <|vision_start|> ... (image embeddings) ... <|vision_end|>
 290            img_beg = "<|vision_start|>";
 291            img_end = "<|vision_end|>";
 292
 293        } else if (proj == PROJECTOR_TYPE_LLAMA4) {
 294            // (more details in mtmd_context constructor)
 295            img_beg = "<|image_start|>";
 296            img_end = "<|image_end|>";
 297            LOG_WRN("%s: llama 4 vision is known to have degraded quality:\n"
 298                    "    https://github.com/ggml-org/llama.cpp/pull/13282\n", __func__);
 299
 300        } else if (proj == PROJECTOR_TYPE_INTERNVL) {
 301            // <img> ... (image embeddings) ... </img>
 302            img_beg = "<img>";
 303            img_end = "</img>";
 304
 305        } else if (proj == PROJECTOR_TYPE_LIGHTONOCR) {
 306            // <|im_start|> ... (image embeddings) ... <|im_end|>
 307            img_beg = "<|im_start|>";
 308            img_end = "<|im_end|>";
 309
 310        } else if (proj == PROJECTOR_TYPE_LFM2) {
 311            // multi-tile:
 312            //   <|image_start|>
 313            //     <|img_row_1_col_1|> (tile) <|img_row_1_col_2|> (tile) ...
 314            //     <|img_thumbnail|> (thumbnail)
 315            //   <|image_end|>
 316            // single-tile:
 317            //   <|image_start|> (image) <|image_end|>
 318            img_beg            = "<|image_start|>";
 319            img_end            = "<|image_end|>";
 320            slice_tmpl         = MTMD_SLICE_TMPL_LFM2;
 321            sli_img_start_tmpl = "<|img_row_%d_col_%d|>";
 322            tok_ov_img_start   = {lookup_token("<|img_thumbnail|>")};
 323            ov_img_first       = false;
 324        } else if (proj == PROJECTOR_TYPE_GLM4V) {
 325            img_beg = "<|begin_of_image|>";
 326            img_end = "<|end_of_image|>";
 327
 328        }
 329    }
 330
 331    void init_audio() {
 332        GGML_ASSERT(ctx_a != nullptr);
 333        projector_type proj = clip_get_projector_type(ctx_a);
 334
 335        LOG_WRN("%s: audio input is in experimental stage and may have reduced quality:\n"
 336                "    https://github.com/ggml-org/llama.cpp/discussions/13759\n", __func__);
 337
 338        // set preprocessor
 339        switch (proj) {
 340            case PROJECTOR_TYPE_QWEN2A:
 341            case PROJECTOR_TYPE_QWEN25O:
 342            case PROJECTOR_TYPE_ULTRAVOX:
 343            case PROJECTOR_TYPE_VOXTRAL:
 344            case PROJECTOR_TYPE_GLMA:
 345            case PROJECTOR_TYPE_MUSIC_FLAMINGO:
 346                audio_preproc = std::make_unique<mtmd_audio_preprocessor_whisper>(ctx_a);
 347                break;
 348            case PROJECTOR_TYPE_LFM2A:
 349                audio_preproc = std::make_unique<mtmd_audio_preprocessor_conformer>(ctx_a);
 350                break;
 351            default:
 352                GGML_ABORT("unsupported audio projector type");
 353        }
 354
 355        // initialize audio preprocessor
 356        audio_preproc->initialize();
 357
 358        // set special tokens
 359        if (proj == PROJECTOR_TYPE_QWEN2A) {
 360            // <|audio_bos|> ... (embeddings) ... <|audio_eos|>
 361            aud_beg = "<|audio_bos|>";
 362            aud_end = "<|audio_eos|>";
 363
 364        } else if (proj == PROJECTOR_TYPE_ULTRAVOX) {
 365            // [BEGIN_AUDIO] ... (embeddings) ...
 366            aud_beg = "[BEGIN_AUDIO]";
 367
 368        } else if (proj == PROJECTOR_TYPE_MUSIC_FLAMINGO) {
 369            // <sound> ... (embeddings) ...
 370            aud_beg = "<sound>";
 371        }
 372    }
 373
 374    // get clip ctx based on chunk type
 375    clip_ctx * get_clip_ctx(const mtmd_input_chunk * chunk) const {
 376        if (chunk->type == MTMD_INPUT_CHUNK_TYPE_IMAGE) {
 377            return ctx_v;
 378        } else if (chunk->type == MTMD_INPUT_CHUNK_TYPE_AUDIO) {
 379            return ctx_a;
 380        }
 381        GGML_ABORT("unknown chunk type");
 382    }
 383
 384    projector_type proj_type_v() const {
 385        return ctx_v ? clip_get_projector_type(ctx_v) : PROJECTOR_TYPE_UNKNOWN;
 386    }
 387
 388    projector_type proj_type_a() const {
 389        return ctx_a ? clip_get_projector_type(ctx_a) : PROJECTOR_TYPE_UNKNOWN;
 390    }
 391
 392    ~mtmd_context() {
 393        clip_free(ctx_a);
 394        clip_free(ctx_v);
 395    }
 396
 397private:
 398    llama_token lookup_token(const std::string & token_text) {
 399        const llama_vocab * vocab = llama_model_get_vocab(text_model);
 400        const int n_vocab = llama_vocab_n_tokens(vocab);
 401        for (int i = 0; i < n_vocab; i++) {
 402            if (token_to_piece(vocab, i, true) == token_text) {
 403                return i;
 404            }
 405        }
 406        return LLAMA_TOKEN_NULL;
 407    }
 408
 409    std::string token_to_piece(const llama_vocab * vocab, llama_token token, bool special) {
 410        std::string piece;
 411        piece.resize(piece.capacity());  // using string internal cache, 15 bytes + '\n'
 412        const int n_chars = llama_token_to_piece(vocab, token, &piece[0], piece.size(), 0, special);
 413        if (n_chars < 0) {
 414            piece.resize(-n_chars);
 415            int check = llama_token_to_piece(vocab, token, &piece[0], piece.size(), 0, special);
 416            GGML_ASSERT(check == -n_chars);
 417        } else {
 418            piece.resize(n_chars);
 419        }
 420        return piece;
 421    }
 422};
 423
 424mtmd_context * mtmd_init_from_file(const char * mmproj_fname,
 425        const struct llama_model * text_model,
 426        const struct mtmd_context_params ctx_params) {
 427    try {
 428        return new mtmd_context(mmproj_fname, text_model, ctx_params);
 429    } catch (const std::exception & e) {
 430        LOG_ERR("%s: error: %s\n", __func__, e.what());
 431        return nullptr;
 432    }
 433}
 434
 435void mtmd_free(mtmd_context * ctx) {
 436    delete ctx;
 437}
 438
 439struct mtmd_tokenizer {
 440    mtmd_context * ctx;
 441    std::vector<const mtmd_bitmap *> bitmaps;
 442
 443    std::string input_text;
 444    bool add_special;
 445    bool parse_special;
 446    const llama_vocab * vocab;
 447
 448    mtmd_input_chunks cur;
 449
 450    mtmd_tokenizer(mtmd_context * ctx,
 451            const mtmd_input_text * text,
 452            const mtmd_bitmap ** bitmaps,
 453            size_t n_bitmaps) : ctx(ctx), bitmaps(bitmaps, bitmaps + n_bitmaps) {
 454        add_special   = text->add_special;
 455        parse_special = text->parse_special;
 456        input_text    = text->text;
 457        vocab         = llama_model_get_vocab(ctx->text_model);
 458
 459        // for compatibility, we convert image marker to media marker
 460        string_replace_all(input_text, MTMD_DEFAULT_IMAGE_MARKER, ctx->media_marker);
 461    }
 462
 463    int32_t tokenize(mtmd_input_chunks * output) {
 464        cur.entries.clear();
 465        std::vector<std::string> parts = split_text(input_text, ctx->media_marker);
 466        size_t i_bm = 0; // index of the current bitmap
 467        for (auto & part : parts) {
 468            if (part == ctx->media_marker) {
 469                // this is a marker, we should add the next bitmap
 470                if (i_bm >= bitmaps.size()) {
 471                    LOG_ERR("%s: error: number of bitmaps (%zu) does not match number of markers (%zu)\n",
 472                            __func__, bitmaps.size(), parts.size() - 1);
 473                    return 1;
 474                }
 475                const mtmd_bitmap * bitmap = bitmaps[i_bm++];
 476                int32_t res = add_media(bitmap);
 477                if (res != 0) {
 478                    return res;
 479                }
 480            } else {
 481                // this is a text part, we should add it as text
 482                add_text(part, parse_special);
 483            }
 484        }
 485
 486        if (add_special && llama_vocab_get_add_bos(vocab)) {
 487            // if first chunk is text, we add BOS token to first text chunk
 488            // otherwise, create a new text chunk with BOS token
 489            if (!cur.entries.empty() && cur.entries[0].type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
 490                // add BOS token to the beginning of first text chunk
 491                cur.entries[0].tokens_text.insert(cur.entries[0].tokens_text.begin(), llama_vocab_bos(vocab));
 492            } else {
 493                // create a new text chunk with BOS token at the beginning
 494                mtmd_input_chunk bos_chunk{
 495                    MTMD_INPUT_CHUNK_TYPE_TEXT,
 496                    {llama_vocab_bos(vocab)},
 497                    nullptr, // image tokens
 498                    nullptr, // audio tokens
 499                };
 500                cur.entries.insert(cur.entries.begin(), std::move(bos_chunk));
 501            }
 502        }
 503
 504        if (add_special && llama_vocab_get_add_eos(vocab)) {
 505            // if last chunk is text, we add EOS token to it
 506            add_text({llama_vocab_eos(vocab)});
 507        }
 508
 509        if (i_bm != bitmaps.size()) {
 510            LOG_ERR("%s: error: number of bitmaps (%zu) does not match number of markers (%zu)\n",
 511                    __func__, bitmaps.size(), parts.size() - 1);
 512            return 1;
 513        }
 514
 515        *output = std::move(cur);
 516
 517        return 0;
 518    }
 519
 520    void add_text(const std::string & txt, bool parse_special) {
 521        LOG_DBG("%s: %s\n", __func__, txt.c_str());
 522        auto tokens = mtmd_tokenize_text_internal(vocab, txt, /* add_special */ false, parse_special);
 523        add_text(tokens);
 524    }
 525
 526    void add_text(const std::vector<llama_token> & tokens) {
 527        if (tokens.empty()) {
 528            return;
 529        }
 530        // if last entry is also a text chunk, add tokens to it instead of creating new chunk
 531        if (!cur.entries.empty() && cur.entries.back().type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
 532            cur.entries.back().tokens_text.insert(
 533                                            cur.entries.back().tokens_text.end(),
 534                                            tokens.begin(),
 535                                            tokens.end());
 536        } else {
 537            mtmd_input_chunk chunk{
 538                MTMD_INPUT_CHUNK_TYPE_TEXT,
 539                tokens,
 540                nullptr, // image tokens
 541                nullptr, // audio tokens
 542            };
 543            cur.entries.emplace_back(std::move(chunk));
 544        }
 545    }
 546
 547    int32_t add_media(const mtmd_bitmap * bitmap) {
 548        if (!bitmap->is_audio) {
 549            // handle image
 550
 551            if (!ctx->ctx_v) {
 552                LOG_ERR("%s: error: model does not support vision input\n", __func__);
 553                return 2;
 554            }
 555
 556            if (!ctx->img_beg.empty()) {
 557                add_text(ctx->img_beg, true); // add image begin token
 558            }
 559
 560            // convert mtmd_bitmap to clip_image_u8
 561            clip_image_u8_ptr img_u8(clip_image_u8_init());
 562            img_u8->nx = bitmap->nx;
 563            img_u8->ny = bitmap->ny;
 564            img_u8->buf.resize(bitmap->data.size());
 565            std::memcpy(img_u8->buf.data(), bitmap->data.data(), img_u8->nx * img_u8->ny * 3);
 566
 567            // preprocess image
 568            clip_image_f32_batch batch_f32;
 569            bool ok = clip_image_preprocess(ctx->ctx_v, img_u8.get(), &batch_f32);
 570            if (!ok) {
 571                LOG_ERR("Unable to preprocess image\n");
 572                return 2;
 573            }
 574
 575            // handle llava-uhd style preprocessing
 576            const bool has_tiling_grid = batch_f32.grid_x > 0 && batch_f32.grid_y > 0;
 577            if (
 578                ctx->slice_tmpl == MTMD_SLICE_TMPL_MINICPMV_2_5
 579                || ctx->slice_tmpl == MTMD_SLICE_TMPL_MINICPMV_2_6
 580                || ctx->slice_tmpl == MTMD_SLICE_TMPL_LLAMA4
 581                || ctx->slice_tmpl == MTMD_SLICE_TMPL_IDEFICS3
 582                || (ctx->slice_tmpl == MTMD_SLICE_TMPL_LFM2 && has_tiling_grid)
 583            ) {
 584                const int n_col = batch_f32.grid_x;
 585                const int n_row = batch_f32.grid_y;
 586                // split batch into chunks of single images
 587                // NOTE: batch_f32 will be invalidated after this call
 588                auto chunks = split_batch_to_chunk(std::move(batch_f32), bitmap->id);
 589                GGML_ASSERT(chunks.size() > 0);
 590
 591                auto ov_chunk = std::move(chunks.front());
 592                chunks.erase(chunks.begin());
 593
 594                // add overview image (first)
 595                if (ctx->ov_img_first) {
 596                    add_text(ctx->tok_ov_img_start);
 597                    cur.entries.emplace_back(std::move(ov_chunk));
 598                    add_text(ctx->tok_ov_img_end);
 599                }
 600
 601                // add slices (or tiles)
 602                if (!chunks.empty()) {
 603                    GGML_ASSERT((int)chunks.size() == n_row * n_col);
 604                    add_text(ctx->tok_slices_start);
 605                    for (int y = 0; y < n_row; y++) {
 606                        for (int x = 0; x < n_col; x++) {
 607                            const bool is_last_in_row = (x == n_col - 1);
 608                            if (!ctx->tok_sli_img_start.empty()) {
 609                                add_text(ctx->tok_sli_img_start);
 610                            } else if (!ctx->sli_img_start_tmpl.empty()) {
 611                                // If using a template to preceed a slice image
 612                                const size_t sz = std::snprintf(nullptr, 0, ctx->sli_img_start_tmpl.c_str(), y+1, x+1) + 1;
 613                                std::unique_ptr<char[]> buf(new char[sz]);
 614                                std::snprintf(buf.get(), sz, ctx->sli_img_start_tmpl.c_str(), y+1, x+1);
 615                                add_text(std::string(buf.get(), buf.get() + sz - 1), true);
 616                            }
 617                            cur.entries.emplace_back(std::move(chunks[y * n_col + x]));
 618                            add_text(ctx->tok_sli_img_end);
 619                            if (!is_last_in_row) {
 620                                add_text(ctx->tok_sli_img_mid);
 621                            }
 622                        }
 623                        if ((y != n_row - 1 || ctx->tok_row_end_trail)) {
 624                            add_text(ctx->tok_row_end);
 625                        }
 626                    }
 627                    add_text(ctx->tok_slices_end);
 628                }
 629
 630                // add overview image (last)
 631                if (!ctx->ov_img_first) {
 632                    add_text(ctx->tok_ov_img_start);
 633                    cur.entries.emplace_back(std::move(ov_chunk));
 634                    add_text(ctx->tok_ov_img_end);
 635                }
 636
 637            } else {
 638                size_t n_tokens = 0;
 639                for (const auto & entry : batch_f32.entries) {
 640                    n_tokens += clip_n_output_tokens(ctx->ctx_v, entry.get());
 641                }
 642
 643                mtmd_image_tokens_ptr image_tokens(new mtmd_image_tokens);
 644                if (mtmd_decode_use_mrope(ctx)) {
 645                    // for Qwen2VL, we need this information for M-RoPE decoding positions
 646                    image_tokens->nx = clip_n_output_tokens_x(ctx->ctx_v, batch_f32.entries[0].get());
 647                    image_tokens->ny = clip_n_output_tokens_y(ctx->ctx_v, batch_f32.entries[0].get());
 648                    image_tokens->use_mrope_pos = true;
 649                } else {
 650                    // other models, we only need the total number of tokens
 651                    image_tokens->nx = n_tokens;
 652                    image_tokens->ny = 1;
 653                }
 654                image_tokens->batch_f32 = std::move(batch_f32);
 655                image_tokens->id = bitmap->id; // optional
 656
 657                LOG_DBG("image_tokens->nx = %d\n", image_tokens->nx);
 658                LOG_DBG("image_tokens->ny = %d\n", image_tokens->ny);
 659                LOG_DBG("batch_f32 size = %d\n", (int)image_tokens->batch_f32.entries.size());
 660
 661                mtmd_input_chunk chunk{
 662                    MTMD_INPUT_CHUNK_TYPE_IMAGE,
 663                    {}, // text tokens
 664                    std::move(image_tokens),
 665                    nullptr, // audio tokens
 666                };
 667                cur.entries.emplace_back(std::move(chunk));
 668            }
 669
 670            if (!ctx->img_end.empty()) {
 671                add_text(ctx->img_end, true); // add image end token
 672            }
 673
 674        } else {
 675            // handle audio
 676
 677            if (!ctx->ctx_a) {
 678                LOG_ERR("%s: error: model does not support audio input\n", __func__);
 679                return 2;
 680            }
 681
 682            if (bitmap->data.size() == 0) {
 683                LOG_ERR("%s: error: empty audio data\n", __func__);
 684                return 2;
 685            }
 686
 687            if (!ctx->aud_beg.empty()) {
 688                add_text(ctx->aud_beg, true); // add audio begin token
 689            }
 690
 691            // preprocess audio
 692            std::vector<mtmd_audio_mel> mel_spec_chunks;
 693            const float * samples = (const float *)bitmap->data.data();
 694            size_t n_samples = bitmap->data.size() / sizeof(float);
 695            bool ok = ctx->audio_preproc->preprocess(samples, n_samples, mel_spec_chunks);
 696            if (!ok) {
 697                LOG_ERR("Unable to preprocess audio\n");
 698                return 2;
 699            }
 700
 701            // consider each mel_spec as a separate audio chunk
 702            // TODO: maybe support batching, but this may come with memory cost
 703            for (auto & mel_spec : mel_spec_chunks) {
 704                clip_image_f32_ptr mel_f32(clip_image_f32_init());
 705                mel_f32->nx  = mel_spec.n_len;
 706                mel_f32->ny  = mel_spec.n_mel;
 707                mel_f32->buf = std::move(mel_spec.data);
 708                size_t n_tokens = clip_n_output_tokens(ctx->ctx_a, mel_f32.get());
 709
 710                clip_image_f32_batch batch_f32;
 711                batch_f32.is_audio = true;
 712                batch_f32.entries.push_back(std::move(mel_f32));
 713
 714                mtmd_audio_tokens_ptr audio_tokens(new mtmd_audio_tokens);
 715                audio_tokens->n_tokens = n_tokens;
 716                audio_tokens->batch_f32 = std::move(batch_f32);
 717                audio_tokens->id = bitmap->id; // optional
 718
 719                LOG_DBG("audio_tokens->n_tokens = %d\n", audio_tokens->n_tokens);
 720
 721                mtmd_input_chunk chunk{
 722                    MTMD_INPUT_CHUNK_TYPE_AUDIO,
 723                    {}, // text tokens
 724                    nullptr, // image tokens
 725                    std::move(audio_tokens),
 726                };
 727                cur.entries.emplace_back(std::move(chunk));
 728            }
 729
 730            if (!ctx->aud_end.empty()) {
 731                add_text(ctx->aud_end, true); // add audio end token
 732            }
 733        }
 734
 735        return 0;
 736    }
 737
 738    std::vector<mtmd_input_chunk> split_batch_to_chunk(clip_image_f32_batch && batch_f32, const std::string & id) {
 739        std::vector<mtmd_input_chunk> chunks;
 740
 741        for (auto & entry : batch_f32.entries) {
 742            mtmd_image_tokens_ptr image_tokens(new mtmd_image_tokens);
 743            image_tokens->nx = clip_n_output_tokens(ctx->ctx_v, entry.get());
 744            image_tokens->ny = 1;
 745            image_tokens->batch_f32.entries.push_back(std::move(entry));
 746            image_tokens->id = id;
 747
 748            mtmd_input_chunk chunk{
 749                MTMD_INPUT_CHUNK_TYPE_IMAGE,
 750                {}, // text tokens
 751                std::move(image_tokens),
 752                nullptr, // audio tokens
 753            };
 754            chunks.emplace_back(std::move(chunk));
 755        }
 756
 757        return chunks;
 758    }
 759
 760    // for example: "a <__media__> b <__media__> c" --> "a", "<__media__>", "b", "<__media__>", "c"
 761    static std::vector<std::string> split_text(const std::string & input, const std::string & delimiter) {
 762        std::vector<std::string> result;
 763        if (input.empty()) {
 764            return result;
 765        }
 766        size_t start = 0;
 767        size_t pos = 0;
 768        while ((pos = input.find(delimiter, start)) != std::string::npos) {
 769            if (pos > start) {
 770                result.push_back(input.substr(start, pos - start));
 771            }
 772            result.push_back(delimiter);
 773            start = pos + delimiter.length();
 774        }
 775        if (start < input.length()) {
 776            result.push_back(input.substr(start));
 777        }
 778        return result;
 779    }
 780
 781    // copied from common_tokenize
 782    static std::vector<llama_token> mtmd_tokenize_text_internal(
 783        const struct llama_vocab * vocab,
 784               const std::string & text,
 785                            bool   add_special,
 786                            bool   parse_special) {
 787        // upper limit for the number of tokens
 788        int n_tokens = text.length() + 2 * add_special;
 789        std::vector<llama_token> result(n_tokens);
 790        n_tokens = llama_tokenize(vocab, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
 791        if (n_tokens < 0) {
 792            result.resize(-n_tokens);
 793            int check = llama_tokenize(vocab, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
 794            GGML_ASSERT(check == -n_tokens);
 795        } else {
 796            result.resize(n_tokens);
 797        }
 798        return result;
 799    }
 800};
 801
 802int32_t mtmd_tokenize(mtmd_context * ctx,
 803            mtmd_input_chunks * output,
 804            const mtmd_input_text * text,
 805            const mtmd_bitmap ** bitmaps,
 806            size_t n_bitmaps) {
 807    mtmd_tokenizer tokenizer(ctx, text, bitmaps, n_bitmaps);
 808    return tokenizer.tokenize(output);
 809}
 810
 811int32_t mtmd_encode_chunk(mtmd_context * ctx, const mtmd_input_chunk * chunk) {
 812    if (chunk->type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
 813        LOG_WRN("mtmd_encode_chunk has no effect for text chunks\n");
 814        return 0;
 815    } else if (chunk->type == MTMD_INPUT_CHUNK_TYPE_IMAGE) {
 816        if (!ctx->ctx_v) {
 817            LOG_ERR("%s: model does not support vision input\n", __func__);
 818            return 1;
 819        }
 820        return mtmd_encode(ctx, chunk->tokens_image.get());
 821    } else if (chunk->type == MTMD_INPUT_CHUNK_TYPE_AUDIO) {
 822        if (!ctx->ctx_a) {
 823            LOG_ERR("%s: model does not support audio input\n", __func__);
 824            return 1;
 825        }
 826        int n_mmproj_embd = ctx->n_embd_text;
 827        ctx->image_embd_v.resize(chunk->tokens_audio->n_tokens * n_mmproj_embd);
 828        bool ok = clip_image_batch_encode(
 829            ctx->ctx_a,
 830            ctx->n_threads,
 831            &chunk->tokens_audio->batch_f32,
 832            ctx->image_embd_v.data());
 833        return ok ? 0 : 1;
 834    }
 835
 836    LOG_ERR("%s: unknown chunk type %d\n", __func__, (int)chunk->type);
 837    return 1;
 838}
 839
 840int32_t mtmd_encode(mtmd_context * ctx, const mtmd_image_tokens * image_tokens) {
 841    clip_ctx * ctx_clip = ctx->ctx_v;
 842    if (!ctx_clip) {
 843        LOG_ERR("%s: this API does not support non-vision input, please use mtmd_encode_chunk instead\n", __func__);
 844        return 1;
 845    }
 846    int n_mmproj_embd = clip_n_mmproj_embd(ctx_clip);
 847    ctx->image_embd_v.resize(image_tokens->n_tokens() * n_mmproj_embd);
 848    bool ok = false;
 849
 850    if (clip_is_llava(ctx_clip)
 851        || clip_is_minicpmv(ctx_clip)
 852        || clip_is_glm(ctx_clip)) {
 853        // TODO @ngxson : llava does not support batched encoding ; this should be fixed inside clip_image_batch_encode()
 854        const auto & entries = image_tokens->batch_f32.entries;
 855        for (size_t i = 0; i < entries.size(); i++) {
 856            int n_tokens_per_image = clip_n_output_tokens(ctx_clip, entries[i].get());
 857            ok = clip_image_encode(
 858                ctx_clip,
 859                ctx->n_threads,
 860                entries[i].get(),
 861                ctx->image_embd_v.data() + i*n_mmproj_embd*n_tokens_per_image);
 862        }
 863    } else {
 864        ok = clip_image_batch_encode(
 865            ctx_clip,
 866            ctx->n_threads,
 867            &image_tokens->batch_f32,
 868            ctx->image_embd_v.data());
 869    }
 870
 871    return ok ? 0 : 1;
 872}
 873
 874float * mtmd_get_output_embd(mtmd_context * ctx) {
 875    return ctx->image_embd_v.data();
 876}
 877
 878bool mtmd_decode_use_non_causal(mtmd_context * ctx) {
 879    switch (ctx->proj_type_v()) {
 880        case PROJECTOR_TYPE_GEMMA3:
 881            return true;
 882        default:
 883            return false;
 884    }
 885}
 886
 887bool mtmd_decode_use_mrope(mtmd_context * ctx) {
 888    switch (ctx->proj_type_v()) {
 889        case PROJECTOR_TYPE_QWEN2VL:
 890        case PROJECTOR_TYPE_QWEN25VL:
 891        case PROJECTOR_TYPE_QWEN3VL:
 892        case PROJECTOR_TYPE_GLM4V:
 893            return true;
 894        default:
 895            return false;
 896    }
 897}
 898
 899bool mtmd_support_vision(mtmd_context * ctx) {
 900    return ctx->ctx_v != nullptr;
 901}
 902
 903bool mtmd_support_audio(mtmd_context * ctx) {
 904    return ctx->ctx_a != nullptr;
 905}
 906
 907int mtmd_get_audio_bitrate(mtmd_context * ctx) {
 908    if (!ctx->ctx_a) {
 909        return -1;
 910    }
 911    return clip_get_hparams(ctx->ctx_a)->audio_sample_rate;
 912}
 913
 914//
 915// public API functions
 916//
 917
 918// mtmd_bitmap
 919
 920mtmd_bitmap * mtmd_bitmap_init(uint32_t nx,
 921                               uint32_t ny,
 922                               const unsigned char * data) {
 923    mtmd_bitmap * bitmap = new mtmd_bitmap;
 924    bitmap->nx = nx;
 925    bitmap->ny = ny;
 926    size_t data_size = (size_t)nx * ny * 3;
 927    bitmap->data.resize(data_size);
 928    std::memcpy(bitmap->data.data(), data, data_size);
 929    return bitmap;
 930}
 931
 932mtmd_bitmap * mtmd_bitmap_init_from_audio(size_t n_samples,
 933                                          const float * data) {
 934    mtmd_bitmap * bitmap = new mtmd_bitmap;
 935    bitmap->nx = n_samples;
 936    bitmap->ny = 1;
 937    bitmap->is_audio = true;
 938    size_t data_size = n_samples * sizeof(float);
 939    bitmap->data.resize(data_size);
 940    std::memcpy(bitmap->data.data(), data, data_size);
 941    return bitmap;
 942}
 943
 944uint32_t mtmd_bitmap_get_nx(const mtmd_bitmap * bitmap) {
 945    return bitmap->nx;
 946}
 947
 948uint32_t mtmd_bitmap_get_ny(const mtmd_bitmap * bitmap) {
 949    return bitmap->ny;
 950}
 951
 952const unsigned char * mtmd_bitmap_get_data(const mtmd_bitmap * bitmap) {
 953    return bitmap->data.data();
 954}
 955
 956size_t mtmd_bitmap_get_n_bytes(const mtmd_bitmap * bitmap) {
 957    return bitmap->data.size();
 958}
 959
 960bool mtmd_bitmap_is_audio(const mtmd_bitmap * bitmap) {
 961    return bitmap->is_audio;
 962}
 963
 964const char * mtmd_bitmap_get_id(const mtmd_bitmap * bitmap) {
 965    return bitmap->id.c_str();
 966}
 967
 968void mtmd_bitmap_set_id(mtmd_bitmap * bitmap, const char * id) {
 969    if (id) {
 970        bitmap->id = std::string(id);
 971    } else {
 972        bitmap->id.clear();
 973    }
 974}
 975
 976void mtmd_bitmap_free(mtmd_bitmap * bitmap) {
 977    if (bitmap) {
 978        delete bitmap;
 979    }
 980}
 981
 982// mtmd_input_chunks
 983
 984mtmd_input_chunks * mtmd_input_chunks_init() {
 985    return new mtmd_input_chunks;
 986}
 987
 988size_t mtmd_input_chunks_size(const mtmd_input_chunks * chunks) {
 989    return chunks->entries.size();
 990}
 991
 992const mtmd_input_chunk * mtmd_input_chunks_get(const mtmd_input_chunks * chunks, size_t idx) {
 993    if (idx >= chunks->entries.size()) {
 994        return nullptr;
 995    }
 996    return &chunks->entries[idx];
 997}
 998
 999void mtmd_input_chunks_free(mtmd_input_chunks * chunks) {
1000    if (chunks) {
1001        delete chunks;
1002    }
1003}
1004
1005// mtmd_input_chunk
1006
1007enum mtmd_input_chunk_type mtmd_input_chunk_get_type(const mtmd_input_chunk * chunk) {
1008    return chunk->type;
1009}
1010
1011const llama_token * mtmd_input_chunk_get_tokens_text(const mtmd_input_chunk * chunk, size_t * n_tokens_output) {
1012    if (chunk->type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
1013        *n_tokens_output = chunk->tokens_text.size();
1014        return chunk->tokens_text.data();
1015    }
1016    *n_tokens_output = 0;
1017    return nullptr;
1018}
1019
1020const mtmd_image_tokens * mtmd_input_chunk_get_tokens_image(const mtmd_input_chunk * chunk) {
1021    if (chunk->type == MTMD_INPUT_CHUNK_TYPE_IMAGE) {
1022        return chunk->tokens_image.get();
1023    }
1024    return nullptr;
1025}
1026
1027size_t mtmd_input_chunk_get_n_tokens(const mtmd_input_chunk * chunk) {
1028    if (chunk->type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
1029        return chunk->tokens_text.size();
1030    } else if (chunk->type == MTMD_INPUT_CHUNK_TYPE_IMAGE) {
1031        return mtmd_image_tokens_get_n_tokens(chunk->tokens_image.get());
1032    } else if (chunk->type == MTMD_INPUT_CHUNK_TYPE_AUDIO) {
1033        return chunk->tokens_audio->n_tokens;
1034    } else {
1035        GGML_ABORT("invalid chunk type");
1036    }
1037}
1038
1039llama_pos mtmd_input_chunk_get_n_pos(const mtmd_input_chunk * chunk) {
1040    if (chunk->type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
1041        return chunk->tokens_text.size();
1042    } else if (chunk->type == MTMD_INPUT_CHUNK_TYPE_IMAGE) {
1043        return mtmd_image_tokens_get_n_pos(chunk->tokens_image.get());
1044    } else if (chunk->type == MTMD_INPUT_CHUNK_TYPE_AUDIO) {
1045        return chunk->tokens_audio->n_tokens;
1046    } else {
1047        GGML_ABORT("invalid chunk type");
1048    }
1049}
1050
1051const char * mtmd_input_chunk_get_id(const mtmd_input_chunk * chunk) {
1052    if (chunk->type == MTMD_INPUT_CHUNK_TYPE_IMAGE) {
1053        return chunk->tokens_image->id.c_str();
1054    } else if (chunk->type == MTMD_INPUT_CHUNK_TYPE_AUDIO) {
1055        return chunk->tokens_audio->id.c_str();
1056    }
1057    return nullptr;
1058}
1059
1060mtmd_input_chunk * mtmd_input_chunk_copy(const mtmd_input_chunk * chunk) {
1061    mtmd_input_chunk * copy = new mtmd_input_chunk{
1062        chunk->type,
1063        chunk->tokens_text,
1064        nullptr,
1065        nullptr,
1066    };
1067    if (chunk->tokens_image) {
1068        // copy the image tokens
1069        copy->tokens_image = mtmd_image_tokens_ptr(new mtmd_image_tokens());
1070        *copy->tokens_image = chunk->tokens_image->clone();
1071    }
1072    if (chunk->tokens_audio) {
1073        // copy the audio tokens
1074        copy->tokens_audio = mtmd_audio_tokens_ptr(new mtmd_audio_tokens());
1075        *copy->tokens_audio = chunk->tokens_audio->clone();
1076    }
1077    return copy;
1078}
1079
1080void mtmd_input_chunk_free(mtmd_input_chunk * chunk) {
1081    if (chunk) {
1082        delete chunk;
1083    }
1084}
1085
1086// mtmd_image_tokens
1087
1088size_t mtmd_image_tokens_get_n_tokens(const mtmd_image_tokens * image_tokens) {
1089    return image_tokens->n_tokens();
1090}
1091
1092size_t mtmd_image_tokens_get_nx(const mtmd_image_tokens * image_tokens) {
1093    return image_tokens->nx;
1094}
1095
1096size_t mtmd_image_tokens_get_ny(const mtmd_image_tokens * image_tokens) {
1097    return image_tokens->ny;
1098}
1099
1100const char * mtmd_image_tokens_get_id(const mtmd_image_tokens * image_tokens) {
1101    return image_tokens->id.c_str();
1102}
1103
1104llama_pos mtmd_image_tokens_get_n_pos(const mtmd_image_tokens * image_tokens) {
1105    if (image_tokens->use_mrope_pos) {
1106        // for M-RoPE, temporal dimension = max(t,h,w)
1107        // t is omitted as we don't support video input
1108        return std::max(image_tokens->nx, image_tokens->ny);
1109    }
1110    return image_tokens->n_tokens();
1111}
1112
1113// test function
1114
1115mtmd_input_chunks * mtmd_test_create_input_chunks() {
1116    mtmd_input_chunks * chunks = mtmd_input_chunks_init();
1117    if (!chunks) {
1118        return nullptr;
1119    }
1120
1121    // create a text chunk
1122    std::vector<llama_token> tokens_text = { 1, 2, 3, 4, 5 };
1123    mtmd_input_chunk chunk_text{
1124        MTMD_INPUT_CHUNK_TYPE_TEXT,
1125        std::move(tokens_text),
1126        nullptr, // image tokens
1127        nullptr, // audio tokens
1128    };
1129    chunks->entries.emplace_back(std::move(chunk_text));
1130
1131    // create an image chunk
1132    mtmd_image_tokens_ptr image_tokens(new mtmd_image_tokens);
1133    image_tokens->nx = 4;
1134    image_tokens->ny = 4;
1135    image_tokens->batch_f32.entries.resize(16);
1136    image_tokens->id = "image_1";
1137    mtmd_input_chunk chunk_image{
1138        MTMD_INPUT_CHUNK_TYPE_IMAGE,
1139        {}, // text tokens
1140        std::move(image_tokens),
1141        nullptr, // audio tokens
1142    };
1143    chunks->entries.emplace_back(std::move(chunk_image));
1144
1145    return chunks;
1146}
1147
1148void mtmd_log_set(ggml_log_callback log_callback, void * user_data) {
1149    g_logger_state.log_callback = log_callback ? log_callback : clip_log_callback_default;
1150    g_logger_state.log_callback_user_data = user_data;
1151}