1#ifndef MTMD_H
2#define MTMD_H
3
4#include "ggml.h"
5#include "llama.h"
6
7#include <stddef.h>
8#include <stdint.h>
9#include <stdbool.h>
10
11#ifdef __cplusplus
12#include <string>
13#include <vector>
14#include <cinttypes>
15#include <memory>
16#endif
17
18/**
19 * libmtmd: A library for multimodal support in llama.cpp.
20 *
21 * WARNING: This API is experimental and subject to many BREAKING CHANGES.
22 * Issues related to API usage may receive lower priority support.
23 *
24 * For the usage, see an example in mtmd-cli.cpp
25 *
26 * For contributors:
27 * - Make sure the C API is aligned with the libllama C API (as in llama.h)
28 * - Do not include model name (e.g., qwen, gemma) in the API, use generic terms instead
29 * - Keep the API minimal, do not expose internal details unless necessary
30 *
31 * IMPORTANT: The mtmd module does NOT accept pull requests that are fully or predominantly AI-generated.
32 * We encourage human contributors to ensure the quality and reliability of the codebase.
33 */
34
// MTMD_API: symbol export/import decoration put in front of every public function
//   - static build (LLAMA_SHARED not defined): expands to nothing
//   - shared build with a Windows non-MinGW toolchain: dllexport while building the
//     library (LLAMA_BUILD defined), dllimport when consuming it
//   - any other shared build: default symbol visibility
#if !defined(LLAMA_SHARED)
#    define MTMD_API
#elif defined(_WIN32) && !defined(__MINGW32__)
#    if defined(LLAMA_BUILD)
#        define MTMD_API __declspec(dllexport)
#    else
#        define MTMD_API __declspec(dllimport)
#    endif
#else
#    define MTMD_API __attribute__ ((visibility ("default")))
#endif
48
// deprecated marker string
// new code should obtain the marker from mtmd_default_marker() instead of using this macro
#define MTMD_DEFAULT_IMAGE_MARKER "<__image__>"
51
52#ifdef __cplusplus
53extern "C" {
54#endif
55
// kind of content held by a mtmd_input_chunk, as reported by mtmd_input_chunk_get_type()
// the values are spelled out explicitly; they match the implicit numbering (0, 1, 2)
enum mtmd_input_chunk_type {
    MTMD_INPUT_CHUNK_TYPE_TEXT  = 0,
    MTMD_INPUT_CHUNK_TYPE_IMAGE = 1,
    MTMD_INPUT_CHUNK_TYPE_AUDIO = 2,
};
61
// opaque types
// they are only forward-declared in this header: the API below passes them around by
// pointer and exposes their content through getter functions
struct mtmd_context;
struct mtmd_bitmap;
struct mtmd_image_tokens;
struct mtmd_input_chunk;
struct mtmd_input_chunks;
68
// text part of the input handed to mtmd_tokenize()
struct mtmd_input_text {
    // prompt text; it has to contain one media marker per bitmap (see mtmd_tokenize)
    const char * text;
    // NOTE(review): presumably forwarded to the text tokenizer (cf. llama_tokenize) - confirm
    bool add_special;
    // NOTE(review): presumably forwarded to the text tokenizer (cf. llama_tokenize) - confirm
    bool parse_special;
};
74
75//
76// C API
77//
78
// typedefs that let C code refer to the mtmd types without the `struct` keyword
typedef struct mtmd_context mtmd_context;
typedef struct mtmd_bitmap mtmd_bitmap;
typedef struct mtmd_image_tokens mtmd_image_tokens;
typedef struct mtmd_input_chunk mtmd_input_chunk;
typedef struct mtmd_input_chunks mtmd_input_chunks;
typedef struct mtmd_input_text mtmd_input_text;
85
// settings used to create a mtmd_context; passed by value to mtmd_init_from_file()
// use mtmd_context_params_default() to obtain a value to start from
struct mtmd_context_params {
    bool use_gpu;
    bool print_timings;
    int n_threads;
    const char * image_marker; // deprecated, use media_marker instead
    const char * media_marker; // marker looked up in the prompt by mtmd_tokenize(); see mtmd_default_marker()
    enum llama_flash_attn_type flash_attn_type;
    bool warmup; // whether to run a warmup encode pass after initialization

    // limit number of image tokens, only for vision models with dynamic resolution
    int image_min_tokens; // minimum number of tokens for image input (default: read from metadata)
    int image_max_tokens; // maximum number of tokens for image input (default: read from metadata)

    // callback function passed over to mtmd proper
    ggml_backend_sched_eval_callback cb_eval;
    void * cb_eval_user_data; // NOTE(review): presumably handed back to cb_eval on each call - confirm
};
103
104MTMD_API const char * mtmd_default_marker(void);
105
106MTMD_API struct mtmd_context_params mtmd_context_params_default(void);
107
108// initialize the mtmd context
109// return nullptr on failure
110MTMD_API mtmd_context * mtmd_init_from_file(const char * mmproj_fname,
111 const struct llama_model * text_model,
112 const struct mtmd_context_params ctx_params);
113
114MTMD_API void mtmd_free(mtmd_context * ctx);
115
116// whether we need to set non-causal mask before llama_decode
117MTMD_API bool mtmd_decode_use_non_causal(mtmd_context * ctx);
118
119// whether the current model use M-RoPE for llama_decode
120MTMD_API bool mtmd_decode_use_mrope(mtmd_context * ctx);
121
122// whether the current model supports vision input
123MTMD_API bool mtmd_support_vision(mtmd_context * ctx);
124
125// whether the current model supports audio input
126MTMD_API bool mtmd_support_audio(mtmd_context * ctx);
127
128// get audio bitrate in Hz, for example 16000 for Whisper
129// return -1 if audio is not supported
130MTMD_API int mtmd_get_audio_bitrate(mtmd_context * ctx);
131
132// mtmd_bitmap
133//
134// if bitmap is image:
135// length of data must be nx * ny * 3
136// the data is in RGBRGBRGB... format
137// if bitmap is audio:
138// length of data must be n_samples * sizeof(float)
139// the data is in float format (PCM F32)
140MTMD_API mtmd_bitmap * mtmd_bitmap_init (uint32_t nx, uint32_t ny, const unsigned char * data);
141MTMD_API mtmd_bitmap * mtmd_bitmap_init_from_audio(size_t n_samples, const float * data);
142MTMD_API uint32_t mtmd_bitmap_get_nx (const mtmd_bitmap * bitmap);
143MTMD_API uint32_t mtmd_bitmap_get_ny (const mtmd_bitmap * bitmap);
144MTMD_API const unsigned char * mtmd_bitmap_get_data (const mtmd_bitmap * bitmap);
145MTMD_API size_t mtmd_bitmap_get_n_bytes(const mtmd_bitmap * bitmap);
146MTMD_API bool mtmd_bitmap_is_audio (const mtmd_bitmap * bitmap);
147MTMD_API void mtmd_bitmap_free (mtmd_bitmap * bitmap);
148// bitmap ID is optional, but useful for KV cache tracking
149// these getters/setters are dedicated functions, so you can for example calculate the hash of the image based on mtmd_bitmap_get_data()
150MTMD_API const char * mtmd_bitmap_get_id(const mtmd_bitmap * bitmap);
151MTMD_API void mtmd_bitmap_set_id(mtmd_bitmap * bitmap, const char * id);
152
153
154// mtmd_input_chunks
155//
156// this is simply a list of mtmd_input_chunk
157// the elements can only be populated via mtmd_tokenize()
158MTMD_API mtmd_input_chunks * mtmd_input_chunks_init(void);
159MTMD_API size_t mtmd_input_chunks_size(const mtmd_input_chunks * chunks);
160MTMD_API const mtmd_input_chunk * mtmd_input_chunks_get (const mtmd_input_chunks * chunks, size_t idx);
161MTMD_API void mtmd_input_chunks_free(mtmd_input_chunks * chunks);
162
163// mtmd_input_chunk
164//
165// the instance will be constructed via mtmd_tokenize()
166// it will be freed along with mtmd_input_chunks
167MTMD_API enum mtmd_input_chunk_type mtmd_input_chunk_get_type (const mtmd_input_chunk * chunk);
168MTMD_API const llama_token * mtmd_input_chunk_get_tokens_text (const mtmd_input_chunk * chunk, size_t * n_tokens_output);
169MTMD_API const mtmd_image_tokens * mtmd_input_chunk_get_tokens_image(const mtmd_input_chunk * chunk);
170MTMD_API size_t mtmd_input_chunk_get_n_tokens (const mtmd_input_chunk * chunk);
171// returns nullptr for ID on text chunk
172MTMD_API const char * mtmd_input_chunk_get_id (const mtmd_input_chunk * chunk);
173// number of temporal positions (equals to max(t,h,w) for M-RoPE; equals to n_tokens otherwise)
174MTMD_API llama_pos mtmd_input_chunk_get_n_pos (const mtmd_input_chunk * chunk);
175
176// in case you want to use custom logic to handle the chunk (i.e. KV cache management)
177// you can move the chunk ownership to your own code by copying it
178// remember to free the chunk when you are done with it
179MTMD_API mtmd_input_chunk * mtmd_input_chunk_copy(const mtmd_input_chunk * chunk);
180MTMD_API void mtmd_input_chunk_free(mtmd_input_chunk * chunk);
181
182
183// mtmd_image_tokens
184//
185// the instance will be constructed via mtmd_tokenize()
186// it will be freed along with mtmd_input_chunk
187MTMD_API size_t mtmd_image_tokens_get_n_tokens(const mtmd_image_tokens * image_tokens); // TODO: deprecate
188MTMD_API size_t mtmd_image_tokens_get_nx (const mtmd_image_tokens * image_tokens);
189MTMD_API size_t mtmd_image_tokens_get_ny (const mtmd_image_tokens * image_tokens);
190MTMD_API const char * mtmd_image_tokens_get_id (const mtmd_image_tokens * image_tokens); // TODO: deprecate
191// number of temporal positions (equals to max(t,h,w) for M-RoPE; equals to n_tokens otherwise)
192MTMD_API llama_pos mtmd_image_tokens_get_n_pos (const mtmd_image_tokens * image_tokens); // TODO: deprecate
193
194// tokenize an input text prompt and a list of bitmaps (images/audio)
195// the prompt must have the input image marker (default: "<__media__>") in it
196// the default marker is defined by mtmd_default_marker()
197// the marker will be replaced with the image/audio chunk
198// for example:
199// "here is an image: <__media__>\ndescribe it in detail."
200// this will gives 3 chunks:
201// 1. "here is an image: <start_of_image>"
202// 2. (image/audio tokens)
203// 3. "<end_of_image>\ndescribe it in detail."
204// number of bitmaps must be equal to the number of markers in the prompt
205// this function is thread-safe (shared ctx)
206// return values:
207// 0 on success
208// 1 on number of bitmaps not matching the number of markers
209// 2 on image preprocessing error
210MTMD_API int32_t mtmd_tokenize(mtmd_context * ctx,
211 mtmd_input_chunks * output,
212 const mtmd_input_text * text,
213 const mtmd_bitmap ** bitmaps,
214 size_t n_bitmaps);
215
216// returns 0 on success
217// TODO: deprecate
218MTMD_API int32_t mtmd_encode(mtmd_context * ctx,
219 const mtmd_image_tokens * image_tokens);
220
221// returns 0 on success
222MTMD_API int32_t mtmd_encode_chunk(mtmd_context * ctx,
223 const mtmd_input_chunk * chunk);
224
225// get output embeddings from the last encode pass
226// the reading size (in bytes) is equal to:
227// llama_model_n_embd_inp(model) * mtmd_input_chunk_get_n_tokens(chunk) * sizeof(float)
228MTMD_API float * mtmd_get_output_embd(mtmd_context * ctx);
229
230// Set callback for all future logging events.
231// If this is not called, or NULL is supplied, everything is output on stderr.
232MTMD_API void mtmd_log_set(ggml_log_callback log_callback, void * user_data);
233
234/////////////////////////////////////////
235
236// test function, to be used in test-mtmd-c-api.c
237MTMD_API mtmd_input_chunks * mtmd_test_create_input_chunks(void);
238
239#ifdef __cplusplus
240} // extern "C"
241#endif
242
243//
244// C++ wrappers
245//
246
247#ifdef __cplusplus
248
249namespace mtmd {
250
251struct mtmd_context_deleter {
252 void operator()(mtmd_context * val) { mtmd_free(val); }
253};
254using context_ptr = std::unique_ptr<mtmd_context, mtmd_context_deleter>;
255
256struct mtmd_bitmap_deleter {
257 void operator()(mtmd_bitmap * val) { mtmd_bitmap_free(val); }
258};
259using bitmap_ptr = std::unique_ptr<mtmd_bitmap, mtmd_bitmap_deleter>;
260
261struct mtmd_input_chunks_deleter {
262 void operator()(mtmd_input_chunks * val) { mtmd_input_chunks_free(val); }
263};
264using input_chunks_ptr = std::unique_ptr<mtmd_input_chunks, mtmd_input_chunks_deleter>;
265
266struct mtmd_input_chunk_deleter {
267 void operator()(mtmd_input_chunk * val) { mtmd_input_chunk_free(val); }
268};
269using input_chunk_ptr = std::unique_ptr<mtmd_input_chunk, mtmd_input_chunk_deleter>;
270
271struct bitmap {
272 bitmap_ptr ptr;
273 bitmap() : ptr(nullptr) {}
274 bitmap(mtmd_bitmap * bitmap) : ptr(bitmap) {}
275 bitmap(bitmap && other) noexcept : ptr(std::move(other.ptr)) {}
276 bitmap(uint32_t nx, uint32_t ny, const unsigned char * data) {
277 ptr.reset(mtmd_bitmap_init(nx, ny, data));
278 }
279 ~bitmap() = default;
280 uint32_t nx() const { return mtmd_bitmap_get_nx(ptr.get()); }
281 uint32_t ny() const { return mtmd_bitmap_get_ny(ptr.get()); }
282 const unsigned char * data() const { return mtmd_bitmap_get_data(ptr.get()); }
283 size_t n_bytes() const { return mtmd_bitmap_get_n_bytes(ptr.get()); }
284 std::string id() const { return mtmd_bitmap_get_id(ptr.get()); }
285 void set_id(const char * id) const { mtmd_bitmap_set_id(ptr.get(), id); }
286};
287
288struct bitmaps {
289 std::vector<bitmap> entries;
290 ~bitmaps() = default;
291 // return list of pointers to mtmd_bitmap
292 // example:
293 // auto bitmaps_c_ptr = bitmaps.c_ptr();
294 // int32_t res = mtmd_tokenize(... bitmaps_c_ptr.data(), bitmaps_c_ptr.size());
295 std::vector<const mtmd_bitmap *> c_ptr() {
296 std::vector<const mtmd_bitmap *> res(entries.size());
297 for (size_t i = 0; i < entries.size(); i++) {
298 res[i] = entries[i].ptr.get();
299 }
300 return res;
301 }
302};
303
304struct input_chunks {
305 input_chunks_ptr ptr;
306 input_chunks() = default;
307 input_chunks(mtmd_input_chunks * chunks) : ptr(chunks) {}
308 ~input_chunks() = default;
309 size_t size() const { return mtmd_input_chunks_size(ptr.get()); }
310 const mtmd_input_chunk * operator[](size_t idx) const {
311 return mtmd_input_chunks_get(ptr.get(), idx);
312 }
313};
314
315} // namespace mtmd
316
317#endif
318
319#endif