summaryrefslogtreecommitdiff
path: root/llama.cpp/tools/mtmd/mtmd.h
diff options
context:
space:
mode:
Diffstat (limited to 'llama.cpp/tools/mtmd/mtmd.h')
-rw-r--r--llama.cpp/tools/mtmd/mtmd.h319
1 files changed, 319 insertions, 0 deletions
diff --git a/llama.cpp/tools/mtmd/mtmd.h b/llama.cpp/tools/mtmd/mtmd.h
new file mode 100644
index 0000000..ef25d32
--- /dev/null
+++ b/llama.cpp/tools/mtmd/mtmd.h
@@ -0,0 +1,319 @@
+#ifndef MTMD_H
+#define MTMD_H
+
+#include "ggml.h"
+#include "llama.h"
+
+#include <stddef.h>
+#include <stdint.h>
+#include <stdbool.h>
+
+#ifdef __cplusplus
+#include <string>
+#include <vector>
+#include <cinttypes>
+#include <memory>
+#endif
+
+/**
+ * libmtmd: A library for multimodal support in llama.cpp.
+ *
+ * WARNING: This API is experimental and subject to many BREAKING CHANGES.
+ * Issues related to API usage may receive lower priority support.
+ *
+ * For the usage, see an example in mtmd-cli.cpp
+ *
+ * For contributors:
+ * - Make sure the C API is aligned with the libllama C API (as in llama.h)
+ * - Do not include model name (e.g., qwen, gemma) in the API, use generic terms instead
+ * - Keep the API minimal, do not expose internal details unless necessary
+ *
+ * IMPORTANT: The mtmd module does NOT accept pull requests that are fully or predominantly AI-generated.
+ * We encourage human contributors to ensure the quality and reliability of the codebase.
+ */
+
+#ifdef LLAMA_SHARED
+# if defined(_WIN32) && !defined(__MINGW32__)
+# ifdef LLAMA_BUILD
+# define MTMD_API __declspec(dllexport)
+# else
+# define MTMD_API __declspec(dllimport)
+# endif
+# else
+# define MTMD_API __attribute__ ((visibility ("default")))
+# endif
+#else
+# define MTMD_API
+#endif
+
+// deprecated marker, use mtmd_default_marker() instead
+#define MTMD_DEFAULT_IMAGE_MARKER "<__image__>"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+enum mtmd_input_chunk_type {
+ MTMD_INPUT_CHUNK_TYPE_TEXT,
+ MTMD_INPUT_CHUNK_TYPE_IMAGE,
+ MTMD_INPUT_CHUNK_TYPE_AUDIO,
+};
+
+// opaque types
+struct mtmd_context;
+struct mtmd_bitmap;
+struct mtmd_image_tokens;
+struct mtmd_input_chunk;
+struct mtmd_input_chunks;
+
+struct mtmd_input_text {
+ const char * text;
+ bool add_special;
+ bool parse_special;
+};
+
+//
+// C API
+//
+
+typedef struct mtmd_context mtmd_context;
+typedef struct mtmd_bitmap mtmd_bitmap;
+typedef struct mtmd_image_tokens mtmd_image_tokens;
+typedef struct mtmd_input_chunk mtmd_input_chunk;
+typedef struct mtmd_input_chunks mtmd_input_chunks;
+typedef struct mtmd_input_text mtmd_input_text;
+
+struct mtmd_context_params {
+ bool use_gpu;
+ bool print_timings;
+ int n_threads;
+ const char * image_marker; // deprecated, use media_marker instead
+ const char * media_marker;
+ enum llama_flash_attn_type flash_attn_type;
+ bool warmup; // whether to run a warmup encode pass after initialization
+
+ // limit number of image tokens, only for vision models with dynamic resolution
+ int image_min_tokens; // minimum number of tokens for image input (default: read from metadata)
+ int image_max_tokens; // maximum number of tokens for image input (default: read from metadata)
+
+ // callback function passed over to mtmd proper
+ ggml_backend_sched_eval_callback cb_eval;
+ void * cb_eval_user_data;
+};
+
+MTMD_API const char * mtmd_default_marker(void);
+
+MTMD_API struct mtmd_context_params mtmd_context_params_default(void);
+
+// initialize the mtmd context
+// return nullptr on failure
+MTMD_API mtmd_context * mtmd_init_from_file(const char * mmproj_fname,
+ const struct llama_model * text_model,
+ const struct mtmd_context_params ctx_params);
+
+MTMD_API void mtmd_free(mtmd_context * ctx);
+
+// whether we need to set non-causal mask before llama_decode
+MTMD_API bool mtmd_decode_use_non_causal(mtmd_context * ctx);
+
+// whether the current model use M-RoPE for llama_decode
+MTMD_API bool mtmd_decode_use_mrope(mtmd_context * ctx);
+
+// whether the current model supports vision input
+MTMD_API bool mtmd_support_vision(mtmd_context * ctx);
+
+// whether the current model supports audio input
+MTMD_API bool mtmd_support_audio(mtmd_context * ctx);
+
+// get audio bitrate in Hz, for example 16000 for Whisper
+// return -1 if audio is not supported
+MTMD_API int mtmd_get_audio_bitrate(mtmd_context * ctx);
+
+// mtmd_bitmap
+//
+// if bitmap is image:
+// length of data must be nx * ny * 3
+// the data is in RGBRGBRGB... format
+// if bitmap is audio:
+// length of data must be n_samples * sizeof(float)
+// the data is in float format (PCM F32)
+MTMD_API mtmd_bitmap * mtmd_bitmap_init (uint32_t nx, uint32_t ny, const unsigned char * data);
+MTMD_API mtmd_bitmap * mtmd_bitmap_init_from_audio(size_t n_samples, const float * data);
+MTMD_API uint32_t mtmd_bitmap_get_nx (const mtmd_bitmap * bitmap);
+MTMD_API uint32_t mtmd_bitmap_get_ny (const mtmd_bitmap * bitmap);
+MTMD_API const unsigned char * mtmd_bitmap_get_data (const mtmd_bitmap * bitmap);
+MTMD_API size_t mtmd_bitmap_get_n_bytes(const mtmd_bitmap * bitmap);
+MTMD_API bool mtmd_bitmap_is_audio (const mtmd_bitmap * bitmap);
+MTMD_API void mtmd_bitmap_free (mtmd_bitmap * bitmap);
+// bitmap ID is optional, but useful for KV cache tracking
+// these getters/setters are dedicated functions, so you can for example calculate the hash of the image based on mtmd_bitmap_get_data()
+MTMD_API const char * mtmd_bitmap_get_id(const mtmd_bitmap * bitmap);
+MTMD_API void mtmd_bitmap_set_id(mtmd_bitmap * bitmap, const char * id);
+
+
+// mtmd_input_chunks
+//
+// this is simply a list of mtmd_input_chunk
+// the elements can only be populated via mtmd_tokenize()
+MTMD_API mtmd_input_chunks * mtmd_input_chunks_init(void);
+MTMD_API size_t mtmd_input_chunks_size(const mtmd_input_chunks * chunks);
+MTMD_API const mtmd_input_chunk * mtmd_input_chunks_get (const mtmd_input_chunks * chunks, size_t idx);
+MTMD_API void mtmd_input_chunks_free(mtmd_input_chunks * chunks);
+
+// mtmd_input_chunk
+//
+// the instance will be constructed via mtmd_tokenize()
+// it will be freed along with mtmd_input_chunks
+MTMD_API enum mtmd_input_chunk_type mtmd_input_chunk_get_type (const mtmd_input_chunk * chunk);
+MTMD_API const llama_token * mtmd_input_chunk_get_tokens_text (const mtmd_input_chunk * chunk, size_t * n_tokens_output);
+MTMD_API const mtmd_image_tokens * mtmd_input_chunk_get_tokens_image(const mtmd_input_chunk * chunk);
+MTMD_API size_t mtmd_input_chunk_get_n_tokens (const mtmd_input_chunk * chunk);
+// returns nullptr for ID on text chunk
+MTMD_API const char * mtmd_input_chunk_get_id (const mtmd_input_chunk * chunk);
+// number of temporal positions (equals to max(t,h,w) for M-RoPE; equals to n_tokens otherwise)
+MTMD_API llama_pos mtmd_input_chunk_get_n_pos (const mtmd_input_chunk * chunk);
+
+// in case you want to use custom logic to handle the chunk (i.e. KV cache management)
+// you can move the chunk ownership to your own code by copying it
+// remember to free the chunk when you are done with it
+MTMD_API mtmd_input_chunk * mtmd_input_chunk_copy(const mtmd_input_chunk * chunk);
+MTMD_API void mtmd_input_chunk_free(mtmd_input_chunk * chunk);
+
+
+// mtmd_image_tokens
+//
+// the instance will be constructed via mtmd_tokenize()
+// it will be freed along with mtmd_input_chunk
+MTMD_API size_t mtmd_image_tokens_get_n_tokens(const mtmd_image_tokens * image_tokens); // TODO: deprecate
+MTMD_API size_t mtmd_image_tokens_get_nx (const mtmd_image_tokens * image_tokens);
+MTMD_API size_t mtmd_image_tokens_get_ny (const mtmd_image_tokens * image_tokens);
+MTMD_API const char * mtmd_image_tokens_get_id (const mtmd_image_tokens * image_tokens); // TODO: deprecate
+// number of temporal positions (equals to max(t,h,w) for M-RoPE; equals to n_tokens otherwise)
+MTMD_API llama_pos mtmd_image_tokens_get_n_pos (const mtmd_image_tokens * image_tokens); // TODO: deprecate
+
+// tokenize an input text prompt and a list of bitmaps (images/audio)
+// the prompt must have the input image marker (default: "<__media__>") in it
+// the default marker is defined by mtmd_default_marker()
+// the marker will be replaced with the image/audio chunk
+// for example:
+// "here is an image: <__media__>\ndescribe it in detail."
+// this will gives 3 chunks:
+// 1. "here is an image: <start_of_image>"
+// 2. (image/audio tokens)
+// 3. "<end_of_image>\ndescribe it in detail."
+// number of bitmaps must be equal to the number of markers in the prompt
+// this function is thread-safe (shared ctx)
+// return values:
+// 0 on success
+// 1 on number of bitmaps not matching the number of markers
+// 2 on image preprocessing error
+MTMD_API int32_t mtmd_tokenize(mtmd_context * ctx,
+ mtmd_input_chunks * output,
+ const mtmd_input_text * text,
+ const mtmd_bitmap ** bitmaps,
+ size_t n_bitmaps);
+
+// returns 0 on success
+// TODO: deprecate
+MTMD_API int32_t mtmd_encode(mtmd_context * ctx,
+ const mtmd_image_tokens * image_tokens);
+
+// returns 0 on success
+MTMD_API int32_t mtmd_encode_chunk(mtmd_context * ctx,
+ const mtmd_input_chunk * chunk);
+
+// get output embeddings from the last encode pass
+// the reading size (in bytes) is equal to:
+// llama_model_n_embd_inp(model) * mtmd_input_chunk_get_n_tokens(chunk) * sizeof(float)
+MTMD_API float * mtmd_get_output_embd(mtmd_context * ctx);
+
+// Set callback for all future logging events.
+// If this is not called, or NULL is supplied, everything is output on stderr.
+MTMD_API void mtmd_log_set(ggml_log_callback log_callback, void * user_data);
+
+/////////////////////////////////////////
+
+// test function, to be used in test-mtmd-c-api.c
+MTMD_API mtmd_input_chunks * mtmd_test_create_input_chunks(void);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+//
+// C++ wrappers
+//
+
+#ifdef __cplusplus
+
+namespace mtmd {
+
+struct mtmd_context_deleter {
+ void operator()(mtmd_context * val) { mtmd_free(val); }
+};
+using context_ptr = std::unique_ptr<mtmd_context, mtmd_context_deleter>;
+
+struct mtmd_bitmap_deleter {
+ void operator()(mtmd_bitmap * val) { mtmd_bitmap_free(val); }
+};
+using bitmap_ptr = std::unique_ptr<mtmd_bitmap, mtmd_bitmap_deleter>;
+
+struct mtmd_input_chunks_deleter {
+ void operator()(mtmd_input_chunks * val) { mtmd_input_chunks_free(val); }
+};
+using input_chunks_ptr = std::unique_ptr<mtmd_input_chunks, mtmd_input_chunks_deleter>;
+
+struct mtmd_input_chunk_deleter {
+ void operator()(mtmd_input_chunk * val) { mtmd_input_chunk_free(val); }
+};
+using input_chunk_ptr = std::unique_ptr<mtmd_input_chunk, mtmd_input_chunk_deleter>;
+
+struct bitmap {
+ bitmap_ptr ptr;
+ bitmap() : ptr(nullptr) {}
+ bitmap(mtmd_bitmap * bitmap) : ptr(bitmap) {}
+ bitmap(bitmap && other) noexcept : ptr(std::move(other.ptr)) {}
+ bitmap(uint32_t nx, uint32_t ny, const unsigned char * data) {
+ ptr.reset(mtmd_bitmap_init(nx, ny, data));
+ }
+ ~bitmap() = default;
+ uint32_t nx() const { return mtmd_bitmap_get_nx(ptr.get()); }
+ uint32_t ny() const { return mtmd_bitmap_get_ny(ptr.get()); }
+ const unsigned char * data() const { return mtmd_bitmap_get_data(ptr.get()); }
+ size_t n_bytes() const { return mtmd_bitmap_get_n_bytes(ptr.get()); }
+ std::string id() const { return mtmd_bitmap_get_id(ptr.get()); }
+ void set_id(const char * id) const { mtmd_bitmap_set_id(ptr.get(), id); }
+};
+
+struct bitmaps {
+ std::vector<bitmap> entries;
+ ~bitmaps() = default;
+ // return list of pointers to mtmd_bitmap
+ // example:
+ // auto bitmaps_c_ptr = bitmaps.c_ptr();
+ // int32_t res = mtmd_tokenize(... bitmaps_c_ptr.data(), bitmaps_c_ptr.size());
+ std::vector<const mtmd_bitmap *> c_ptr() {
+ std::vector<const mtmd_bitmap *> res(entries.size());
+ for (size_t i = 0; i < entries.size(); i++) {
+ res[i] = entries[i].ptr.get();
+ }
+ return res;
+ }
+};
+
+struct input_chunks {
+ input_chunks_ptr ptr;
+ input_chunks() = default;
+ input_chunks(mtmd_input_chunks * chunks) : ptr(chunks) {}
+ ~input_chunks() = default;
+ size_t size() const { return mtmd_input_chunks_size(ptr.get()); }
+ const mtmd_input_chunk * operator[](size_t idx) const {
+ return mtmd_input_chunks_get(ptr.get(), idx);
+ }
+};
+
+} // namespace mtmd
+
+#endif
+
+#endif