1#ifndef MTMD_H
  2#define MTMD_H
  3
  4#include "ggml.h"
  5#include "llama.h"
  6
  7#include <stddef.h>
  8#include <stdint.h>
  9#include <stdbool.h>
 10
 11#ifdef __cplusplus
 12#include <string>
 13#include <vector>
 14#include <cinttypes>
 15#include <memory>
 16#endif
 17
 18/**
 19 * libmtmd: A library for multimodal support in llama.cpp.
 20 *
 21 * WARNING: This API is experimental and subject to many BREAKING CHANGES.
 22 *          Issues related to API usage may receive lower priority support.
 23 *
 * For usage, see the example in mtmd-cli.cpp
 25 *
 26 * For contributors:
 27 * - Make sure the C API is aligned with the libllama C API (as in llama.h)
 28 * - Do not include model name (e.g., qwen, gemma) in the API, use generic terms instead
 29 * - Keep the API minimal, do not expose internal details unless necessary
 30 *
 31 * IMPORTANT: The mtmd module does NOT accept pull requests that are fully or predominantly AI-generated.
 32 * We encourage human contributors to ensure the quality and reliability of the codebase.
 33 */
 34
 35#ifdef LLAMA_SHARED
 36#    if defined(_WIN32) && !defined(__MINGW32__)
 37#        ifdef LLAMA_BUILD
 38#            define MTMD_API __declspec(dllexport)
 39#        else
 40#            define MTMD_API __declspec(dllimport)
 41#        endif
 42#    else
 43#        define MTMD_API __attribute__ ((visibility ("default")))
 44#    endif
 45#else
 46#    define MTMD_API
 47#endif
 48
 49// deprecated marker, use mtmd_default_marker() instead
 50#define MTMD_DEFAULT_IMAGE_MARKER "<__image__>"
 51
 52#ifdef __cplusplus
 53extern "C" {
 54#endif
 55
 56enum mtmd_input_chunk_type {
 57    MTMD_INPUT_CHUNK_TYPE_TEXT,
 58    MTMD_INPUT_CHUNK_TYPE_IMAGE,
 59    MTMD_INPUT_CHUNK_TYPE_AUDIO,
 60};
 61
 62// opaque types
 63struct mtmd_context;
 64struct mtmd_bitmap;
 65struct mtmd_image_tokens;
 66struct mtmd_input_chunk;
 67struct mtmd_input_chunks;
 68
 69struct mtmd_input_text {
 70    const char * text;
 71    bool add_special;
 72    bool parse_special;
 73};
 74
 75//
 76// C API
 77//
 78
 79typedef struct mtmd_context      mtmd_context;
 80typedef struct mtmd_bitmap       mtmd_bitmap;
 81typedef struct mtmd_image_tokens mtmd_image_tokens;
 82typedef struct mtmd_input_chunk  mtmd_input_chunk;
 83typedef struct mtmd_input_chunks mtmd_input_chunks;
 84typedef struct mtmd_input_text   mtmd_input_text;
 85
 86struct mtmd_context_params {
 87    bool use_gpu;
 88    bool print_timings;
 89    int n_threads;
 90    const char * image_marker; // deprecated, use media_marker instead
 91    const char * media_marker;
 92    enum llama_flash_attn_type flash_attn_type;
 93    bool warmup; // whether to run a warmup encode pass after initialization
 94
 95    // limit number of image tokens, only for vision models with dynamic resolution
 96    int image_min_tokens; // minimum number of tokens for image input (default: read from metadata)
 97    int image_max_tokens; // maximum number of tokens for image input (default: read from metadata)
 98
 99    // callback function passed over to mtmd proper
100    ggml_backend_sched_eval_callback cb_eval;
101    void * cb_eval_user_data;
102};
103
104MTMD_API const char * mtmd_default_marker(void);
105
106MTMD_API struct mtmd_context_params mtmd_context_params_default(void);
107
108// initialize the mtmd context
109// return nullptr on failure
110MTMD_API mtmd_context * mtmd_init_from_file(const char * mmproj_fname,
111                                            const struct llama_model * text_model,
112                                            const struct mtmd_context_params ctx_params);
113
114MTMD_API void mtmd_free(mtmd_context * ctx);
115
116// whether we need to set non-causal mask before llama_decode
117MTMD_API bool mtmd_decode_use_non_causal(mtmd_context * ctx);
118
119// whether the current model use M-RoPE for llama_decode
120MTMD_API bool mtmd_decode_use_mrope(mtmd_context * ctx);
121
122// whether the current model supports vision input
123MTMD_API bool mtmd_support_vision(mtmd_context * ctx);
124
125// whether the current model supports audio input
126MTMD_API bool mtmd_support_audio(mtmd_context * ctx);
127
128// get audio bitrate in Hz, for example 16000 for Whisper
129// return -1 if audio is not supported
130MTMD_API int mtmd_get_audio_bitrate(mtmd_context * ctx);
131
132// mtmd_bitmap
133//
134// if bitmap is image:
135//     length of data must be nx * ny * 3
136//     the data is in RGBRGBRGB... format
137// if bitmap is audio:
138//     length of data must be n_samples * sizeof(float)
139//     the data is in float format (PCM F32)
140MTMD_API mtmd_bitmap *         mtmd_bitmap_init           (uint32_t nx, uint32_t ny, const unsigned char * data);
141MTMD_API mtmd_bitmap *         mtmd_bitmap_init_from_audio(size_t n_samples,         const float         * data);
142MTMD_API uint32_t              mtmd_bitmap_get_nx     (const mtmd_bitmap * bitmap);
143MTMD_API uint32_t              mtmd_bitmap_get_ny     (const mtmd_bitmap * bitmap);
144MTMD_API const unsigned char * mtmd_bitmap_get_data   (const mtmd_bitmap * bitmap);
145MTMD_API size_t                mtmd_bitmap_get_n_bytes(const mtmd_bitmap * bitmap);
146MTMD_API bool                  mtmd_bitmap_is_audio   (const mtmd_bitmap * bitmap);
147MTMD_API void                  mtmd_bitmap_free       (mtmd_bitmap * bitmap);
148// bitmap ID is optional, but useful for KV cache tracking
149// these getters/setters are dedicated functions, so you can for example calculate the hash of the image based on mtmd_bitmap_get_data()
150MTMD_API const char * mtmd_bitmap_get_id(const mtmd_bitmap * bitmap);
151MTMD_API void         mtmd_bitmap_set_id(mtmd_bitmap * bitmap, const char * id);
152
153
154// mtmd_input_chunks
155//
156// this is simply a list of mtmd_input_chunk
157// the elements can only be populated via mtmd_tokenize()
158MTMD_API mtmd_input_chunks *      mtmd_input_chunks_init(void);
159MTMD_API size_t                   mtmd_input_chunks_size(const mtmd_input_chunks * chunks);
160MTMD_API const mtmd_input_chunk * mtmd_input_chunks_get (const mtmd_input_chunks * chunks, size_t idx);
161MTMD_API void                     mtmd_input_chunks_free(mtmd_input_chunks * chunks);
162
163// mtmd_input_chunk
164//
165// the instance will be constructed via mtmd_tokenize()
166// it will be freed along with mtmd_input_chunks
167MTMD_API enum mtmd_input_chunk_type mtmd_input_chunk_get_type        (const mtmd_input_chunk * chunk);
168MTMD_API const llama_token *        mtmd_input_chunk_get_tokens_text (const mtmd_input_chunk * chunk, size_t * n_tokens_output);
169MTMD_API const mtmd_image_tokens *  mtmd_input_chunk_get_tokens_image(const mtmd_input_chunk * chunk);
170MTMD_API size_t                     mtmd_input_chunk_get_n_tokens    (const mtmd_input_chunk * chunk);
171// returns nullptr for ID on text chunk
172MTMD_API const char *               mtmd_input_chunk_get_id          (const mtmd_input_chunk * chunk);
173// number of temporal positions (equals to max(t,h,w) for M-RoPE; equals to n_tokens otherwise)
174MTMD_API llama_pos                  mtmd_input_chunk_get_n_pos       (const mtmd_input_chunk * chunk);
175
176// in case you want to use custom logic to handle the chunk (i.e. KV cache management)
177// you can move the chunk ownership to your own code by copying it
178// remember to free the chunk when you are done with it
179MTMD_API mtmd_input_chunk * mtmd_input_chunk_copy(const mtmd_input_chunk * chunk);
180MTMD_API void               mtmd_input_chunk_free(mtmd_input_chunk * chunk);
181
182
183// mtmd_image_tokens
184//
185// the instance will be constructed via mtmd_tokenize()
186// it will be freed along with mtmd_input_chunk
187MTMD_API size_t       mtmd_image_tokens_get_n_tokens(const mtmd_image_tokens * image_tokens); // TODO: deprecate
188MTMD_API size_t       mtmd_image_tokens_get_nx      (const mtmd_image_tokens * image_tokens);
189MTMD_API size_t       mtmd_image_tokens_get_ny      (const mtmd_image_tokens * image_tokens);
190MTMD_API const char * mtmd_image_tokens_get_id      (const mtmd_image_tokens * image_tokens); // TODO: deprecate
191// number of temporal positions (equals to max(t,h,w) for M-RoPE; equals to n_tokens otherwise)
192MTMD_API llama_pos    mtmd_image_tokens_get_n_pos   (const mtmd_image_tokens * image_tokens); // TODO: deprecate
193
194// tokenize an input text prompt and a list of bitmaps (images/audio)
195// the prompt must have the input image marker (default: "<__media__>") in it
196// the default marker is defined by mtmd_default_marker()
197// the marker will be replaced with the image/audio chunk
198// for example:
199//   "here is an image: <__media__>\ndescribe it in detail."
200//   this will gives 3 chunks:
201//   1. "here is an image: <start_of_image>"
202//   2. (image/audio tokens)
203//   3. "<end_of_image>\ndescribe it in detail."
204// number of bitmaps must be equal to the number of markers in the prompt
205// this function is thread-safe (shared ctx)
206// return values:
207//   0 on success
208//   1 on number of bitmaps not matching the number of markers
209//   2 on image preprocessing error
210MTMD_API int32_t mtmd_tokenize(mtmd_context * ctx,
211                               mtmd_input_chunks * output,
212                               const mtmd_input_text * text,
213                               const mtmd_bitmap ** bitmaps,
214                               size_t n_bitmaps);
215
216// returns 0 on success
217// TODO: deprecate
218MTMD_API int32_t mtmd_encode(mtmd_context * ctx,
219                             const mtmd_image_tokens * image_tokens);
220
221// returns 0 on success
222MTMD_API int32_t mtmd_encode_chunk(mtmd_context * ctx,
223                                   const mtmd_input_chunk * chunk);
224
225// get output embeddings from the last encode pass
226// the reading size (in bytes) is equal to:
227// llama_model_n_embd_inp(model) * mtmd_input_chunk_get_n_tokens(chunk) * sizeof(float)
228MTMD_API float * mtmd_get_output_embd(mtmd_context * ctx);
229
230// Set callback for all future logging events.
231// If this is not called, or NULL is supplied, everything is output on stderr.
232MTMD_API void mtmd_log_set(ggml_log_callback log_callback, void * user_data);
233
234/////////////////////////////////////////
235
236// test function, to be used in test-mtmd-c-api.c
237MTMD_API mtmd_input_chunks * mtmd_test_create_input_chunks(void);
238
239#ifdef __cplusplus
240} // extern "C"
241#endif
242
243//
244// C++ wrappers
245//
246
247#ifdef __cplusplus
248
249namespace mtmd {
250
251struct mtmd_context_deleter {
252    void operator()(mtmd_context * val) { mtmd_free(val); }
253};
254using context_ptr = std::unique_ptr<mtmd_context, mtmd_context_deleter>;
255
256struct mtmd_bitmap_deleter {
257    void operator()(mtmd_bitmap * val) { mtmd_bitmap_free(val); }
258};
259using bitmap_ptr = std::unique_ptr<mtmd_bitmap, mtmd_bitmap_deleter>;
260
261struct mtmd_input_chunks_deleter {
262    void operator()(mtmd_input_chunks * val) { mtmd_input_chunks_free(val); }
263};
264using input_chunks_ptr = std::unique_ptr<mtmd_input_chunks, mtmd_input_chunks_deleter>;
265
266struct mtmd_input_chunk_deleter {
267    void operator()(mtmd_input_chunk * val) { mtmd_input_chunk_free(val); }
268};
269using input_chunk_ptr = std::unique_ptr<mtmd_input_chunk, mtmd_input_chunk_deleter>;
270
271struct bitmap {
272    bitmap_ptr ptr;
273    bitmap() : ptr(nullptr) {}
274    bitmap(mtmd_bitmap * bitmap) : ptr(bitmap) {}
275    bitmap(bitmap && other) noexcept : ptr(std::move(other.ptr)) {}
276    bitmap(uint32_t nx, uint32_t ny, const unsigned char * data) {
277        ptr.reset(mtmd_bitmap_init(nx, ny, data));
278    }
279    ~bitmap() = default;
280    uint32_t nx() const { return mtmd_bitmap_get_nx(ptr.get()); }
281    uint32_t ny() const { return mtmd_bitmap_get_ny(ptr.get()); }
282    const unsigned char * data() const { return mtmd_bitmap_get_data(ptr.get()); }
283    size_t n_bytes() const { return mtmd_bitmap_get_n_bytes(ptr.get()); }
284    std::string id() const { return mtmd_bitmap_get_id(ptr.get()); }
285    void set_id(const char * id) const { mtmd_bitmap_set_id(ptr.get(), id); }
286};
287
288struct bitmaps {
289    std::vector<bitmap> entries;
290    ~bitmaps() = default;
291    // return list of pointers to mtmd_bitmap
292    // example:
293    //   auto bitmaps_c_ptr = bitmaps.c_ptr();
294    //   int32_t res = mtmd_tokenize(... bitmaps_c_ptr.data(), bitmaps_c_ptr.size());
295    std::vector<const mtmd_bitmap *> c_ptr() {
296        std::vector<const mtmd_bitmap *> res(entries.size());
297        for (size_t i = 0; i < entries.size(); i++) {
298            res[i] = entries[i].ptr.get();
299        }
300        return res;
301    }
302};
303
304struct input_chunks {
305    input_chunks_ptr ptr;
306    input_chunks() = default;
307    input_chunks(mtmd_input_chunks * chunks) : ptr(chunks) {}
308    ~input_chunks() = default;
309    size_t size() const { return mtmd_input_chunks_size(ptr.get()); }
310    const mtmd_input_chunk * operator[](size_t idx) const {
311        return mtmd_input_chunks_get(ptr.get(), idx);
312    }
313};
314
315} // namespace mtmd
316
317#endif
318
319#endif