summaryrefslogtreecommitdiff
path: root/llama.cpp/tools/mtmd
diff options
context:
space:
mode:
Diffstat (limited to 'llama.cpp/tools/mtmd')
-rw-r--r--llama.cpp/tools/mtmd/CMakeLists.txt96
-rw-r--r--llama.cpp/tools/mtmd/README.md63
-rw-r--r--llama.cpp/tools/mtmd/clip-graph.h117
-rw-r--r--llama.cpp/tools/mtmd/clip-impl.h582
-rw-r--r--llama.cpp/tools/mtmd/clip-model.h389
-rw-r--r--llama.cpp/tools/mtmd/clip.cpp4080
-rw-r--r--llama.cpp/tools/mtmd/clip.h121
-rw-r--r--llama.cpp/tools/mtmd/deprecation-warning.cpp22
-rw-r--r--llama.cpp/tools/mtmd/legacy-models/convert_image_encoder_to_gguf.py412
-rw-r--r--llama.cpp/tools/mtmd/legacy-models/glmedge-convert-image-encoder-to-gguf.py280
-rw-r--r--llama.cpp/tools/mtmd/legacy-models/glmedge-surgery.py33
-rw-r--r--llama.cpp/tools/mtmd/legacy-models/llava_surgery.py38
-rw-r--r--llama.cpp/tools/mtmd/legacy-models/llava_surgery_v2.py180
-rw-r--r--llama.cpp/tools/mtmd/legacy-models/minicpmv-convert-image-encoder-to-gguf.py892
-rw-r--r--llama.cpp/tools/mtmd/legacy-models/minicpmv-surgery.py47
-rw-r--r--llama.cpp/tools/mtmd/models/cogvlm.cpp98
-rw-r--r--llama.cpp/tools/mtmd/models/conformer.cpp216
-rw-r--r--llama.cpp/tools/mtmd/models/glm4v.cpp120
-rw-r--r--llama.cpp/tools/mtmd/models/internvl.cpp69
-rw-r--r--llama.cpp/tools/mtmd/models/kimik25.cpp101
-rw-r--r--llama.cpp/tools/mtmd/models/kimivl.cpp63
-rw-r--r--llama.cpp/tools/mtmd/models/llama4.cpp96
-rw-r--r--llama.cpp/tools/mtmd/models/llava.cpp374
-rw-r--r--llama.cpp/tools/mtmd/models/minicpmv.cpp114
-rw-r--r--llama.cpp/tools/mtmd/models/mobilenetv5.cpp451
-rw-r--r--llama.cpp/tools/mtmd/models/models.h118
-rw-r--r--llama.cpp/tools/mtmd/models/pixtral.cpp86
-rw-r--r--llama.cpp/tools/mtmd/models/qwen2vl.cpp183
-rw-r--r--llama.cpp/tools/mtmd/models/qwen3vl.cpp193
-rw-r--r--llama.cpp/tools/mtmd/models/siglip.cpp86
-rw-r--r--llama.cpp/tools/mtmd/models/whisper-enc.cpp115
-rw-r--r--llama.cpp/tools/mtmd/models/youtuvl.cpp179
-rw-r--r--llama.cpp/tools/mtmd/mtmd-audio.cpp730
-rw-r--r--llama.cpp/tools/mtmd/mtmd-audio.h113
-rw-r--r--llama.cpp/tools/mtmd/mtmd-cli.cpp437
-rw-r--r--llama.cpp/tools/mtmd/mtmd-helper.cpp521
-rw-r--r--llama.cpp/tools/mtmd/mtmd-helper.h96
-rw-r--r--llama.cpp/tools/mtmd/mtmd.cpp1151
-rw-r--r--llama.cpp/tools/mtmd/mtmd.h319
-rw-r--r--llama.cpp/tools/mtmd/requirements.txt5
-rw-r--r--llama.cpp/tools/mtmd/test-1.jpegbin0 -> 124071 bytes
-rw-r--r--llama.cpp/tools/mtmd/test-2.mp3bin0 -> 140060 bytes
-rwxr-xr-xllama.cpp/tools/mtmd/tests.sh183
43 files changed, 13569 insertions, 0 deletions
diff --git a/llama.cpp/tools/mtmd/CMakeLists.txt b/llama.cpp/tools/mtmd/CMakeLists.txt
new file mode 100644
index 0000000..02d71f2
--- /dev/null
+++ b/llama.cpp/tools/mtmd/CMakeLists.txt
@@ -0,0 +1,96 @@
+# mtmd
+
+find_package(Threads REQUIRED)
+
+add_library(mtmd
+ mtmd.cpp
+ mtmd-audio.cpp
+ mtmd.h
+ mtmd-helper.cpp
+ mtmd-helper.h
+ clip.cpp
+ clip.h
+ clip-impl.h
+ clip-model.h
+ clip-graph.h
+ models/models.h
+ models/cogvlm.cpp
+ models/conformer.cpp
+ models/glm4v.cpp
+ models/internvl.cpp
+ models/kimivl.cpp
+ models/kimik25.cpp
+ models/llama4.cpp
+ models/llava.cpp
+ models/minicpmv.cpp
+ models/pixtral.cpp
+ models/qwen2vl.cpp
+ models/qwen3vl.cpp
+ models/siglip.cpp
+ models/whisper-enc.cpp
+ models/mobilenetv5.cpp
+ models/youtuvl.cpp
+ )
+
+set_target_properties(mtmd PROPERTIES
+ VERSION ${LLAMA_INSTALL_VERSION}
+ SOVERSION 0
+ MACHO_CURRENT_VERSION 0 # keep macOS linker from seeing oversized version number
+)
+
+target_link_libraries (mtmd PUBLIC ggml llama)
+target_link_libraries (mtmd PRIVATE Threads::Threads)
+target_include_directories(mtmd PUBLIC .)
+target_include_directories(mtmd PRIVATE ../..)
+target_include_directories(mtmd PRIVATE ../../vendor)
+target_compile_features (mtmd PRIVATE cxx_std_17)
+
+if (BUILD_SHARED_LIBS)
+ set_target_properties (mtmd PROPERTIES POSITION_INDEPENDENT_CODE ON)
+ target_compile_definitions(mtmd PRIVATE LLAMA_BUILD)
+ target_compile_definitions(mtmd PUBLIC LLAMA_SHARED)
+endif()
+
+set(MTMD_PUBLIC_HEADERS
+ ${CMAKE_CURRENT_SOURCE_DIR}/mtmd.h
+ ${CMAKE_CURRENT_SOURCE_DIR}/mtmd-helper.h
+ )
+
+set_target_properties(mtmd
+ PROPERTIES
+ PUBLIC_HEADER "${MTMD_PUBLIC_HEADERS}")
+
+install(TARGETS mtmd LIBRARY PUBLIC_HEADER)
+
+if (NOT MSVC)
+ # for stb_image.h and miniaudio.h
+ target_compile_options(mtmd PRIVATE -Wno-cast-qual)
+endif()
+
+if (TARGET BUILD_INFO)
+ add_dependencies(mtmd BUILD_INFO)
+ add_dependencies(mtmd-helper BUILD_INFO)
+endif()
+
+# if mtmd is linked against common, we throw an error
+if (TARGET mtmd)
+ get_target_property(libs mtmd LINK_LIBRARIES)
+ if (libs AND "common" IN_LIST libs)
+ message(FATAL_ERROR "mtmd is designed to be a public library.\n"
+ "It must not link against common")
+ endif()
+endif()
+
+add_executable(llama-llava-cli deprecation-warning.cpp)
+add_executable(llama-gemma3-cli deprecation-warning.cpp)
+add_executable(llama-minicpmv-cli deprecation-warning.cpp)
+add_executable(llama-qwen2vl-cli deprecation-warning.cpp)
+
+set(TARGET llama-mtmd-cli)
+add_executable (${TARGET} mtmd-cli.cpp)
+set_target_properties (${TARGET} PROPERTIES OUTPUT_NAME llama-mtmd-cli)
+if(LLAMA_TOOLS_INSTALL)
+ install(TARGETS ${TARGET} RUNTIME)
+endif()
+target_link_libraries (${TARGET} PRIVATE common mtmd Threads::Threads)
+target_compile_features(${TARGET} PRIVATE cxx_std_17)
diff --git a/llama.cpp/tools/mtmd/README.md b/llama.cpp/tools/mtmd/README.md
new file mode 100644
index 0000000..ef31d19
--- /dev/null
+++ b/llama.cpp/tools/mtmd/README.md
@@ -0,0 +1,63 @@
+# Multimodal Support in llama.cpp
+
+This directory provides multimodal capabilities for `llama.cpp`. Initially intended as a showcase for running LLaVA models, its scope has expanded significantly over time to include various other vision-capable models. As a result, LLaVA is no longer the only multimodal architecture supported.
+
+> [!IMPORTANT]
+>
+> Multimodal support can be viewed as a sub-project within `llama.cpp`. It is under **very heavy development**, and **breaking changes are expected**.
+
+The naming and structure related to multimodal support have evolved, which might cause some confusion. Here's a brief timeline to clarify:
+
+- [#3436](https://github.com/ggml-org/llama.cpp/pull/3436): Initial support for LLaVA 1.5 was added, introducing `llava.cpp` and `clip.cpp`. The `llava-cli` binary was created for model interaction.
+- [#4954](https://github.com/ggml-org/llama.cpp/pull/4954): Support for MobileVLM was added, becoming the second vision model supported. This built upon the existing `llava.cpp`, `clip.cpp`, and `llava-cli` infrastructure.
+- **Expansion & Fragmentation:** Many new models were subsequently added (e.g., [#7599](https://github.com/ggml-org/llama.cpp/pull/7599), [#10361](https://github.com/ggml-org/llama.cpp/pull/10361), [#12344](https://github.com/ggml-org/llama.cpp/pull/12344), and others). However, `llava-cli` lacked support for the increasingly complex chat templates required by these models. This led to the creation of model-specific binaries like `qwen2vl-cli`, `minicpmv-cli`, and `gemma3-cli`. While functional, this proliferation of command-line tools became confusing for users.
+- [#12849](https://github.com/ggml-org/llama.cpp/pull/12849): `libmtmd` was introduced as a replacement for `llava.cpp`. Its goals include providing a single, unified command-line interface, improving the user/developer experience (UX/DX), and supporting both audio and image inputs.
+- [#13012](https://github.com/ggml-org/llama.cpp/pull/13012): `mtmd-cli` was added, consolidating the various model-specific CLIs into a single tool powered by `libmtmd`.
+
+## Pre-quantized models
+
+See the list of pre-quantized models [here](../../docs/multimodal.md)
+
+## How it works and what is `mmproj`?
+
+Multimodal support in `llama.cpp` works by encoding images into embeddings using a separate model component, and then feeding these embeddings into the language model.
+
+This approach keeps the multimodal components distinct from the core `libllama` library. Separating these allows for faster, independent development cycles. While many modern vision models are based on Vision Transformers (ViTs), their specific pre-processing and projection steps can vary significantly. Integrating this diverse complexity directly into `libllama` is currently challenging.
+
+Consequently, running a multimodal model typically requires two GGUF files:
+1. The standard language model file.
+2. A corresponding **multimodal projector (`mmproj`)** file, which handles the image encoding and projection.
+
+## What is `libmtmd`?
+
+As outlined in the history, `libmtmd` is the modern library designed to replace the original `llava.cpp` implementation for handling multimodal inputs.
+
+Built upon `clip.cpp` (similar to `llava.cpp`), `libmtmd` offers several advantages:
+- **Unified Interface:** Aims to consolidate interaction for various multimodal models.
+- **Improved UX/DX:** Features a more intuitive API, inspired by the `Processor` class in the Hugging Face `transformers` library.
+- **Flexibility:** Designed to support multiple input types (text, audio, images) while respecting the wide variety of chat templates used by different models.
+
+## How to obtain `mmproj`
+
+Multimodal projector (`mmproj`) files are specific to each model architecture.
+
+For the following models, you can use `convert_hf_to_gguf.py` with `--mmproj` flag to get the `mmproj` file:
+- [Gemma 3](https://huggingface.co/collections/google/gemma-3-release-67c6c6f89c4f76621268bb6d) ; See the guide [here](../../docs/multimodal/gemma3.md) - Note: 1B variant does not have vision support
+- SmolVLM (from [HuggingFaceTB](https://huggingface.co/HuggingFaceTB))
+- SmolVLM2 (from [HuggingFaceTB](https://huggingface.co/HuggingFaceTB))
+- [Pixtral 12B](https://huggingface.co/mistral-community/pixtral-12b) - only works with `transformers`-compatible checkpoint
+- Qwen 2 VL and Qwen 2.5 VL (from [Qwen](https://huggingface.co/Qwen))
+- [Mistral Small 3.1 24B](https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503)
+- InternVL 2.5 and InternVL 3 from [OpenGVLab](https://huggingface.co/OpenGVLab) (note: we don't support conversion of `InternVL3-*-hf` model, only non-HF version is supported ; `InternLM2Model` **text** model is not supported)
+
+For older models, please refer to the relevant guide for instructions on how to obtain or create them:
+
+NOTE: conversion scripts are located under `tools/mtmd/legacy-models`
+
+- [LLaVA](../../docs/multimodal/llava.md)
+- [MobileVLM](../../docs/multimodal/MobileVLM.md)
+- [GLM-Edge](../../docs/multimodal/glmedge.md)
+- [MiniCPM-V 2.5](../../docs/multimodal/minicpmv2.5.md)
+- [MiniCPM-V 2.6](../../docs/multimodal/minicpmv2.6.md)
+- [MiniCPM-o 2.6](../../docs/multimodal/minicpmo2.6.md)
+- [IBM Granite Vision](../../docs/multimodal/granitevision.md)
diff --git a/llama.cpp/tools/mtmd/clip-graph.h b/llama.cpp/tools/mtmd/clip-graph.h
new file mode 100644
index 0000000..4c7f750
--- /dev/null
+++ b/llama.cpp/tools/mtmd/clip-graph.h
@@ -0,0 +1,117 @@
+#pragma once
+
+#include "ggml.h"
+#include "ggml-cpp.h"
+#include "clip.h"
+#include "clip-impl.h"
+#include "clip-model.h"
+
+#include <vector>
+#include <functional>
+
+#define DEFAULT_INTERPOLATION_MODE (GGML_SCALE_MODE_BILINEAR | GGML_SCALE_FLAG_ANTIALIAS)
+
+struct clip_graph {
+ const clip_model & model;
+ const clip_hparams & hparams;
+ projector_type proj_type;
+
+ // we only support single image per batch
+ const clip_image_f32 & img;
+
+ const int patch_size;
+ const int n_patches_x;
+ const int n_patches_y;
+ const int n_patches;
+ const int n_embd;
+ const int n_head;
+ const int d_head;
+ const int n_layer;
+ const int n_mmproj_embd;
+ const float eps;
+ const float kq_scale;
+ const clip_flash_attn_type flash_attn_type;
+
+ ggml_context_ptr ctx0_ptr;
+ ggml_context * ctx0;
+ ggml_cgraph * gf;
+
+ clip_graph(clip_ctx * ctx, const clip_image_f32 & img);
+
+ virtual ~clip_graph() = default;
+ virtual ggml_cgraph * build() = 0;
+
+ //
+ // utility functions
+ //
+ void cb(ggml_tensor * cur0, const char * name, int il) const;
+
+ // siglip2 naflex
+ ggml_tensor * resize_position_embeddings(uint32_t interpolation_mode = DEFAULT_INTERPOLATION_MODE);
+
+ // build vision transformer (ViT) cgraph
+ // this function should cover most of the models
+ // if your model has specific features, you should probably duplicate this function
+ ggml_tensor * build_vit(
+ ggml_tensor * inp,
+ int64_t n_pos,
+ norm_type norm_t,
+ ffn_op_type ffn_t,
+ ggml_tensor * learned_pos_embd,
+ std::function<ggml_tensor *(ggml_tensor *, const clip_layer &)> add_pos);
+
+ // build the input after conv2d (inp_raw --> patches)
+ // returns tensor with shape [n_embd, n_patches]
+ ggml_tensor * build_inp();
+
+ ggml_tensor * build_inp_raw(int channels = 3);
+
+ ggml_tensor * build_norm(
+ ggml_tensor * cur,
+ ggml_tensor * mw,
+ ggml_tensor * mb,
+ norm_type type,
+ float norm_eps,
+ int il) const;
+
+ ggml_tensor * build_ffn(
+ ggml_tensor * cur,
+ ggml_tensor * up,
+ ggml_tensor * up_b,
+ ggml_tensor * gate,
+ ggml_tensor * gate_b,
+ ggml_tensor * down,
+ ggml_tensor * down_b,
+ ffn_op_type type_op,
+ int il) const;
+
+ ggml_tensor * build_attn(
+ ggml_tensor * wo,
+ ggml_tensor * wo_b,
+ ggml_tensor * q_cur,
+ ggml_tensor * k_cur,
+ ggml_tensor * v_cur,
+ ggml_tensor * kq_mask,
+ float kq_scale,
+ int il) const;
+
+ // implementation of the 2D RoPE without adding a new op in ggml
+ // this is not efficient (use double the memory), but works on all backends
+    // TODO: there was a more efficient implementation which relies on ggml_view and ggml_rope_ext_inplace, but the rope inplace does not work well with non-contiguous tensors; we should fix that and revert back to the original implementation in https://github.com/ggml-org/llama.cpp/pull/13065
+ ggml_tensor * build_rope_2d(
+ ggml_context * ctx0,
+ ggml_tensor * cur,
+ ggml_tensor * pos_a, // first half
+ ggml_tensor * pos_b, // second half
+ const float freq_base,
+ const bool interleave_freq
+ );
+
+ // aka pixel_shuffle / pixel_unshuffle / patch_merger (Kimi-VL)
+ // support dynamic resolution
+ ggml_tensor * build_patch_merge_permute(ggml_tensor * cur, int scale_factor);
+
+ // Generic function to stack frames for audio processing
+ // Abstracts out the StackAudioFrames logic used by ultravox
+ ggml_tensor * build_stack(ggml_tensor * cur, int32_t stack_factor, int32_t n_embed);
+};
diff --git a/llama.cpp/tools/mtmd/clip-impl.h b/llama.cpp/tools/mtmd/clip-impl.h
new file mode 100644
index 0000000..3bc93ea
--- /dev/null
+++ b/llama.cpp/tools/mtmd/clip-impl.h
@@ -0,0 +1,582 @@
+#pragma once
+
+#include "ggml.h"
+#include "gguf.h"
+#include "clip.h"
+
+#include <climits>
+#include <cstdarg>
+#include <cinttypes>
+#include <string>
+#include <map>
+#include <sstream>
+#include <vector>
+#include <memory>
+
+// Internal header for clip.cpp
+
+#define MTMD_INTERNAL_HEADER
+
+#define KEY_FTYPE "general.file_type"
+#define KEY_NAME "general.name"
+#define KEY_DESCRIPTION "general.description"
+#define KEY_PROJ_TYPE "clip.projector_type"
+#define KEY_HAS_AUDIO_ENC "clip.has_audio_encoder"
+#define KEY_HAS_VISION_ENC "clip.has_vision_encoder"
+#define KEY_USE_GELU "clip.use_gelu"
+#define KEY_USE_SILU "clip.use_silu"
+
+#define KEY_N_EMBD "clip.%s.embedding_length"
+#define KEY_N_FF "clip.%s.feed_forward_length"
+#define KEY_N_BLOCK "clip.%s.block_count"
+#define KEY_PROJ_DIM "clip.%s.projection_dim"
+#define KEY_N_HEAD "clip.%s.attention.head_count"
+#define KEY_LAYER_NORM_EPS "clip.%s.attention.layer_norm_epsilon"
+
+// vision-specific
+#define KEY_VISION_PROJ_TYPE "clip.vision.projector_type" // for models with mixed modalities
+#define KEY_IMAGE_SIZE "clip.vision.image_size"
+#define KEY_IMAGE_MIN_PIXELS "clip.vision.image_min_pixels"
+#define KEY_IMAGE_MAX_PIXELS "clip.vision.image_max_pixels"
+#define KEY_PREPROC_IMAGE_SIZE "clip.vision.preproc_image_size"
+#define KEY_PATCH_SIZE "clip.vision.patch_size"
+#define KEY_IMAGE_MEAN "clip.vision.image_mean"
+#define KEY_IMAGE_STD "clip.vision.image_std"
+#define KEY_FEATURE_LAYER "clip.vision.feature_layer"
+#define KEY_PROJ_SCALE_FACTOR "clip.vision.projector.scale_factor"
+#define KEY_SPATIAL_MERGE_SIZE "clip.vision.spatial_merge_size"
+#define KEY_IS_DEEPSTACK_LAYERS "clip.vision.is_deepstack_layers"
+
+#define KEY_MM_PATCH_MERGE_TYPE "clip.vision.mm_patch_merge_type"
+#define KEY_IMAGE_GRID_PINPOINTS "clip.vision.image_grid_pinpoints"
+#define KEY_IMAGE_CROP_RESOLUTION "clip.vision.image_crop_resolution"
+#define KEY_WIN_ATTN_PATTERN "clip.vision.n_wa_pattern"
+#define KEY_WIN_ATTN_LAYER_INDEXES "clip.vision.wa_layer_indexes"
+#define KEY_ATTN_WINDOW_SIZE "clip.vision.window_size"
+#define KEY_MINICPMV_VERSION "clip.minicpmv_version"
+#define KEY_MINICPMV_QUERY_NUM "clip.minicpmv_query_num"
+
+// audio-specific
+#define KEY_AUDIO_PROJ_TYPE "clip.audio.projector_type" // for models with mixed modalities
+#define KEY_A_NUM_MEL_BINS "clip.audio.num_mel_bins"
+#define KEY_A_PROJ_STACK_FACTOR "clip.audio.projector.stack_factor"
+
+
+//
+// tensor name constants
+//
+
+#define TN_POS_EMBD "%s.position_embd.weight"
+#define TN_CLASS_EMBD "v.class_embd"
+#define TN_PATCH_EMBD "v.patch_embd.weight" // do not rename tensor with ".0" postfix, for backward compat
+#define TN_PATCH_EMBD_1 "v.patch_embd.weight.1"
+#define TN_PATCH_BIAS "v.patch_embd.bias"
+#define TN_NORM_EMBD "v.norm_embd.%s"
+#define TN_ATTN_QKV "%s.blk.%d.attn_qkv.%s"
+#define TN_ATTN_K "%s.blk.%d.attn_k.%s"
+#define TN_ATTN_Q "%s.blk.%d.attn_q.%s"
+#define TN_ATTN_V "%s.blk.%d.attn_v.%s"
+#define TN_ATTN_OUTPUT "%s.blk.%d.attn_out.%s"
+#define TN_ATTN_K_NORM "%s.blk.%d.attn_k_norm.%s"
+#define TN_ATTN_Q_NORM "%s.blk.%d.attn_q_norm.%s"
+#define TN_FFN_DOWN "%s.blk.%d.ffn_down.%s"
+#define TN_FFN_GATE "%s.blk.%d.ffn_gate.%s"
+#define TN_FFN_UP "%s.blk.%d.ffn_up.%s"
+#define TN_FFN_GATE "%s.blk.%d.ffn_gate.%s"
+#define TN_LN_1 "%s.blk.%d.ln1.%s" // layer norm
+#define TN_LN_2 "%s.blk.%d.ln2.%s" // layer norm
+#define TN_LS_1 "%s.blk.%d.ls1.%s" // layer scale
+#define TN_LS_2 "%s.blk.%d.ls2.%s" // layer scale
+#define TN_LN_PRE "%s.pre_ln.%s"
+#define TN_LN_POST "%s.post_ln.%s"
+#define TN_LLAVA_PROJ "mm.%d.%s"
+#define TN_MM_UP "mm.up.%s"
+#define TN_MM_GATE "mm.gate.%s"
+#define TN_MM_DOWN "mm.down.%s"
+#define TN_MM_POST_NORM "mm.post_norm.%s"
+#define TN_MVLM_PROJ_MLP "mm.model.mlp.%d.%s"
+#define TN_MVLM_PROJ_BLOCK "mm.model.mb_block.%d.block.%d.%s"
+#define TN_MVLM_PROJ_PEG "mm.model.peg.%d.%s"
+#define TN_IMAGE_NEWLINE "model.image_newline"
+#define TN_MM_INP_NORM "mm.input_norm.weight"
+#define TN_MM_INP_NORM_B "mm.input_norm.bias"
+#define TN_MM_INP_PROJ "mm.input_projection.weight" // gemma3
+#define TN_MM_SOFT_EMB_N "mm.soft_emb_norm.weight" // gemma3
+#define TN_MM_PROJECTOR "mm.model.fc.weight" // idefics3
+#define TN_MM_PATCH_MERGER "mm.patch_merger.%s" // mistral small 3.1, glm4v
+#define TN_TOK_IMG_BREAK "v.token_embd.img_break" // pixtral
+#define TN_TOK_GLM_BOI "adapter.boi" // glm-edge (these embeddings are not in text model)
+#define TN_TOK_GLM_EOI "adapter.eoi" // glm-edge (these embeddings are not in text model)
+#define TN_DEEPSTACK_NORM "v.deepstack.%d.norm.%s" // qwen3vl deepstack
+#define TN_DEEPSTACK_FC1 "v.deepstack.%d.fc1.%s" // qwen3vl deepstack
+#define TN_DEEPSTACK_FC2 "v.deepstack.%d.fc2.%s" // qwen3vl deepstack
+
+// minicpmv
+#define TN_MINICPMV_POS_EMBD_K "resampler.pos_embed_k"
+#define TN_MINICPMV_QUERY "resampler.query"
+#define TN_MINICPMV_PROJ "resampler.proj.weight"
+#define TN_MINICPMV_KV_PROJ "resampler.kv.weight"
+#define TN_MINICPMV_ATTN "resampler.attn.%s.%s"
+#define TN_MINICPMV_LN "resampler.ln_%s.%s"
+
+#define TN_GLM_ADAPER_CONV "adapter.conv.%s"
+#define TN_GLM_ADAPTER_LINEAR "adapter.linear.linear.%s"
+#define TN_GLM_ADAPTER_NORM_1 "adapter.linear.norm1.%s"
+#define TN_GLM_ADAPTER_D_H_2_4H "adapter.linear.dense_h_to_4h.%s"
+#define TN_GLM_ADAPTER_GATE "adapter.linear.gate.%s"
+#define TN_GLM_ADAPTER_D_4H_2_H "adapter.linear.dense_4h_to_h.%s"
+
+// ultravox
+#define TN_CONV1D "a.conv1d.%d.%s"
+#define TN_MM_AUDIO_MLP "mm.a.mlp.%d.%s"
+#define TN_MM_AUDIO_FC "mm.a.fc.%s" // fully connected layer
+#define TN_MM_NORM_PRE "mm.a.norm_pre.%s"
+#define TN_MM_NORM_MID "mm.a.norm_mid.%s"
+
+// cogvlm
+#define TN_MM_POST_FC_NORM "mm.post_fc_norm.%s"
+#define TN_MM_H_TO_4H "mm.up.%s"
+#define TN_MM_GATE "mm.gate.%s"
+#define TN_MM_4H_TO_H "mm.down.%s"
+#define TN_TOK_BOI "v.boi"
+#define TN_TOK_EOI "v.eoi"
+
+// (conformer) lfm2
+#define TN_PRE_ENCODE_OUT "a.pre_encode.out.%s"
+#define TN_FFN_NORM "%s.blk.%d.ffn_norm.%s"
+#define TN_FFN_NORM_1 "%s.blk.%d.ffn_norm_1.%s"
+#define TN_FFN_UP_1 "%s.blk.%d.ffn_up_1.%s"
+#define TN_FFN_DOWN_1 "%s.blk.%d.ffn_down_1.%s"
+#define TN_POS_BIAS_U "%s.blk.%d.pos_bias_u"
+#define TN_POS_BIAS_V "%s.blk.%d.pos_bias_v"
+#define TN_NORM_CONV "%s.blk.%d.norm_conv.%s"
+#define TN_LINEAR_POS "%s.blk.%d.linear_pos.%s"
+#define TN_CONV_DW "%s.blk.%d.conv_dw.%s"
+#define TN_CONV_NORM "%s.blk.%d.conv_norm.%s"
+#define TN_CONV_PW1 "%s.blk.%d.conv_pw1.%s"
+#define TN_CONV_PW2 "%s.blk.%d.conv_pw2.%s"
+
+// mobilenetv5 (gemma3n) definitions
+#define TN_MNV5_STEM_CONV "v.conv_stem.conv.weight"
+#define TN_MNV5_STEM_BIAS "v.conv_stem.conv.bias"
+#define TN_MNV5_STEM_BN "v.conv_stem.bn.weight"
+
+// Stage 0 Block (Edge Residual)
+#define TN_MNV5_BLK_S0_EXP_W "v.blk.%d.%d.conv_exp.weight"
+#define TN_MNV5_BLK_S0_BN1_W "v.blk.%d.%d.bn1.weight"
+#define TN_MNV5_BLK_S0_PWL_W "v.blk.%d.%d.conv_pwl.weight"
+#define TN_MNV5_BLK_S0_BN2_W "v.blk.%d.%d.bn2.weight"
+
+// Stage 1+ Block (Universal Inverted Residual)
+#define TN_MNV5_BLK_DW_START_W "v.blk.%d.%d.dw_start.conv.weight"
+#define TN_MNV5_BLK_DW_START_BN "v.blk.%d.%d.dw_start.bn.weight"
+#define TN_MNV5_BLK_DW_MID_W "v.blk.%d.%d.dw_mid.conv.weight"
+#define TN_MNV5_BLK_DW_MID_BN "v.blk.%d.%d.dw_mid.bn.weight"
+#define TN_MNV5_BLK_PW_EXP_W "v.blk.%d.%d.pw_exp.conv.weight"
+#define TN_MNV5_BLK_PW_EXP_BN "v.blk.%d.%d.pw_exp.bn.weight"
+#define TN_MNV5_BLK_PW_PROJ_W "v.blk.%d.%d.pw_proj.conv.weight"
+#define TN_MNV5_BLK_PW_PROJ_BN "v.blk.%d.%d.pw_proj.bn.weight"
+#define TN_MNV5_BLK_LAYER_SCALE "v.blk.%d.%d.layer_scale.gamma"
+
+// Attention Components
+#define TN_MNV5_ATTN_Q_W "v.blk.%d.%d.attn.query.proj.weight"
+#define TN_MNV5_ATTN_K_W "v.blk.%d.%d.attn.key.proj.weight"
+#define TN_MNV5_ATTN_V_W "v.blk.%d.%d.attn.value.proj.weight"
+#define TN_MNV5_ATTN_O_W "v.blk.%d.%d.attn.output.proj.weight"
+#define TN_MNV5_ATTN_K_DW "v.blk.%d.%d.attn.key.down_conv.weight"
+#define TN_MNV5_ATTN_K_NORM "v.blk.%d.%d.attn.key.norm.weight"
+#define TN_MNV5_ATTN_V_DW "v.blk.%d.%d.attn.value.down_conv.weight"
+#define TN_MNV5_ATTN_V_NORM "v.blk.%d.%d.attn.value.norm.weight"
+#define TN_MNV5_ATTN_NORM "v.blk.%d.%d.norm.weight" // Block norm used in attn blocks
+
+// MSFA
+#define TN_MNV5_MSFA_FFN_EXP_W "v.msfa.ffn.pw_exp.conv.weight"
+#define TN_MNV5_MSFA_FFN_EXP_BN "v.msfa.ffn.pw_exp.bn.weight"
+#define TN_MNV5_MSFA_FFN_PROJ_W "v.msfa.ffn.pw_proj.conv.weight"
+#define TN_MNV5_MSFA_FFN_PROJ_BN "v.msfa.ffn.pw_proj.bn.weight"
+#define TN_MNV5_MSFA_NORM "v.msfa.norm.weight"
+
+
+// align x to upper multiple of n
+#define CLIP_ALIGN(x, n) ((((x) + (n) - 1) / (n)) * (n))
+
+// forward declaration
+// TODO: improve this later
+struct clip_ctx;
+
+enum projector_type {
+ PROJECTOR_TYPE_MLP,
+ PROJECTOR_TYPE_MLP_NORM,
+ PROJECTOR_TYPE_LDP,
+ PROJECTOR_TYPE_LDPV2,
+ PROJECTOR_TYPE_MINICPMV,
+ PROJECTOR_TYPE_GLM_EDGE,
+ PROJECTOR_TYPE_QWEN2VL,
+ PROJECTOR_TYPE_QWEN3VL,
+ PROJECTOR_TYPE_GEMMA3,
+ PROJECTOR_TYPE_GEMMA3NV,
+ PROJECTOR_TYPE_GEMMA3NA,
+ PROJECTOR_TYPE_IDEFICS3,
+ PROJECTOR_TYPE_PIXTRAL,
+ PROJECTOR_TYPE_QWEN25VL,
+ PROJECTOR_TYPE_ULTRAVOX,
+ PROJECTOR_TYPE_INTERNVL,
+ PROJECTOR_TYPE_LLAMA4,
+ PROJECTOR_TYPE_QWEN2A,
+ PROJECTOR_TYPE_GLMA,
+ PROJECTOR_TYPE_QWEN25O, // will be replaced by QWEN2A or QWEN25VL depending on clip_ctx
+ PROJECTOR_TYPE_VOXTRAL,
+ PROJECTOR_TYPE_MUSIC_FLAMINGO,
+ PROJECTOR_TYPE_LFM2,
+ PROJECTOR_TYPE_KIMIVL,
+ PROJECTOR_TYPE_LIGHTONOCR,
+ PROJECTOR_TYPE_COGVLM,
+ PROJECTOR_TYPE_JANUS_PRO,
+ PROJECTOR_TYPE_LFM2A,
+ PROJECTOR_TYPE_GLM4V,
+ PROJECTOR_TYPE_YOUTUVL,
+ PROJECTOR_TYPE_KIMIK25,
+ PROJECTOR_TYPE_UNKNOWN,
+};
+
+static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
+ { PROJECTOR_TYPE_MLP, "mlp" },
+ { PROJECTOR_TYPE_LDP, "ldp" },
+ { PROJECTOR_TYPE_LDPV2, "ldpv2"},
+ { PROJECTOR_TYPE_MINICPMV, "resampler"},
+ { PROJECTOR_TYPE_GLM_EDGE, "adapter"},
+ { PROJECTOR_TYPE_QWEN2VL, "qwen2vl_merger"},
+ { PROJECTOR_TYPE_QWEN25VL, "qwen2.5vl_merger"},
+ { PROJECTOR_TYPE_QWEN3VL, "qwen3vl_merger"},
+ { PROJECTOR_TYPE_GEMMA3, "gemma3"},
+ { PROJECTOR_TYPE_GEMMA3NV, "gemma3nv"},
+ { PROJECTOR_TYPE_GEMMA3NA, "gemma3na"},
+ { PROJECTOR_TYPE_IDEFICS3, "idefics3"},
+ { PROJECTOR_TYPE_PIXTRAL, "pixtral"},
+ { PROJECTOR_TYPE_ULTRAVOX, "ultravox"},
+ { PROJECTOR_TYPE_INTERNVL, "internvl"},
+ { PROJECTOR_TYPE_LLAMA4, "llama4"},
+ { PROJECTOR_TYPE_QWEN2A, "qwen2a"},
+ { PROJECTOR_TYPE_GLMA, "glma"},
+ { PROJECTOR_TYPE_QWEN25O, "qwen2.5o"},
+ { PROJECTOR_TYPE_VOXTRAL, "voxtral"},
+ { PROJECTOR_TYPE_MUSIC_FLAMINGO, "musicflamingo"},
+ { PROJECTOR_TYPE_LFM2, "lfm2"},
+ { PROJECTOR_TYPE_KIMIVL, "kimivl"},
+ { PROJECTOR_TYPE_LIGHTONOCR,"lightonocr"},
+ { PROJECTOR_TYPE_COGVLM, "cogvlm"},
+ { PROJECTOR_TYPE_JANUS_PRO, "janus_pro"},
+ { PROJECTOR_TYPE_LFM2A, "lfm2a"},
+ { PROJECTOR_TYPE_GLM4V, "glm4v"},
+ { PROJECTOR_TYPE_YOUTUVL, "youtuvl"},
+ { PROJECTOR_TYPE_KIMIK25, "kimik25"},
+};
+
+static projector_type clip_projector_type_from_string(const std::string & str) {
+ for (const auto & pair : PROJECTOR_TYPE_NAMES) {
+ if (pair.second == str) {
+ return pair.first;
+ }
+ }
+ return PROJECTOR_TYPE_UNKNOWN;
+}
+
+// RGB uint8 image
+struct clip_image_u8 {
+ int nx;
+ int ny;
+
+ std::vector<uint8_t> buf;
+};
+
+// For images, buf.size() == nx*ny*3
+// Memory layout: RGBRGBRGB...
+// For audio, only one channel is used, buf.size() == nx*ny
+// nx will be n_frames and ny will be n_mel
+struct clip_image_f32 {
+ int nx;
+ int ny;
+
+ std::vector<float> buf;
+};
+
+//
+// logging
+//
+
+static void clip_log_callback_default(enum ggml_log_level level, const char * text, void * user_data) {
+ (void) level;
+ (void) user_data;
+ fputs(text, stderr);
+ fflush(stderr);
+}
+
+struct clip_logger_state {
+ ggml_log_callback log_callback;
+ void * log_callback_user_data;
+};
+
+extern struct clip_logger_state g_logger_state;
+
+static void clip_log_internal_v(enum ggml_log_level level, const char * format, va_list args) {
+ if (format == NULL) {
+ return;
+ }
+ va_list args_copy;
+ va_copy(args_copy, args);
+ char buffer[128];
+ int len = vsnprintf(buffer, 128, format, args);
+ if (len < 128) {
+ g_logger_state.log_callback(level, buffer, g_logger_state.log_callback_user_data);
+ } else {
+ char * buffer2 = (char *) calloc(len + 1, sizeof(char));
+ vsnprintf(buffer2, len + 1, format, args_copy);
+ buffer2[len] = 0;
+ g_logger_state.log_callback(level, buffer2, g_logger_state.log_callback_user_data);
+ free(buffer2);
+ }
+ va_end(args_copy);
+}
+
+static void clip_log_internal(enum ggml_log_level level, const char * format, ...) {
+ va_list args;
+ va_start(args, format);
+ clip_log_internal_v(level, format, args);
+ va_end(args);
+}
+
+#define LOG_INF(...) clip_log_internal(GGML_LOG_LEVEL_INFO, __VA_ARGS__)
+#define LOG_WRN(...) clip_log_internal(GGML_LOG_LEVEL_WARN, __VA_ARGS__)
+#define LOG_ERR(...) clip_log_internal(GGML_LOG_LEVEL_ERROR, __VA_ARGS__)
+#define LOG_DBG(...) clip_log_internal(GGML_LOG_LEVEL_DEBUG, __VA_ARGS__)
+#define LOG_CNT(...) clip_log_internal(GGML_LOG_LEVEL_CONT, __VA_ARGS__)
+
+//
+// cpp wrappers
+//
+
+// wrapper for clip_image_size
+struct clip_image_size_deleter {
+ void operator()(clip_image_size * val) { clip_image_size_free(val); }
+};
+typedef std::unique_ptr<clip_image_size, clip_image_size_deleter> clip_image_size_ptr;
+
+// wrapper for clip_image_u8
+struct clip_image_u8_deleter {
+ void operator()(clip_image_u8 * val) { clip_image_u8_free(val); }
+};
+typedef std::unique_ptr<clip_image_u8, clip_image_u8_deleter> clip_image_u8_ptr;
+
+// wrapper for clip_image_f32
+struct clip_image_f32_deleter {
+ void operator()(clip_image_f32 * val) { clip_image_f32_free(val); }
+};
+typedef std::unique_ptr<clip_image_f32, clip_image_f32_deleter> clip_image_f32_ptr;
+
+struct clip_image_u8_batch {
+ std::vector<clip_image_u8_ptr> entries;
+};
+
+struct clip_image_f32_batch {
+ std::vector<clip_image_f32_ptr> entries;
+ bool is_audio = false;
+
+ // for llava-uhd style models, we need to know the grid size
+ // note: entries.size() == grid_x * grid_y + 1 (one overview image)
+ int grid_x = 0;
+ int grid_y = 0;
+
+ clip_image_f32_batch clone() const {
+ clip_image_f32_batch new_batch{
+ /* entries */ {},
+ /* is_audio */ is_audio,
+ /* grid_x */ grid_x,
+ /* grid_y */ grid_y,
+ };
+ new_batch.entries.reserve(entries.size());
+ for (const auto & entry : entries) {
+ new_batch.entries.emplace_back(new clip_image_f32(*entry));
+ }
+ return new_batch;
+ }
+};
+
+//
+// common utils
+//
+
+static std::string string_format(const char * fmt, ...) {
+ va_list ap;
+ va_list ap2;
+ va_start(ap, fmt);
+ va_copy(ap2, ap);
+ int size = vsnprintf(NULL, 0, fmt, ap);
+ GGML_ASSERT(size >= 0 && size < INT_MAX); // NOLINT
+ std::vector<char> buf(size + 1);
+ int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2);
+ GGML_ASSERT(size2 == size);
+ va_end(ap2);
+ va_end(ap);
+ return std::string(buf.data(), buf.size());
+}
+
+static void string_replace_all(std::string & s, const std::string & search, const std::string & replace) {
+ if (search.empty()) {
+ return;
+ }
+ std::string builder;
+ builder.reserve(s.length());
+ size_t pos = 0;
+ size_t last_pos = 0;
+ while ((pos = s.find(search, last_pos)) != std::string::npos) {
+ builder.append(s, last_pos, pos - last_pos);
+ builder.append(replace);
+ last_pos = pos + search.length();
+ }
+ builder.append(s, last_pos, std::string::npos);
+ s = std::move(builder);
+}
+
+// split string by a `std::string delim` instead of `char delim`
+static std::vector<std::string> string_split_str(std::string s, const std::string & delimiter) {
+ std::vector<std::string> tokens;
+ size_t pos = 0;
+ std::string token;
+ while ((pos = s.find(delimiter)) != std::string::npos) {
+ token = s.substr(0, pos);
+ tokens.push_back(token);
+ s.erase(0, pos + delimiter.length());
+ }
+ tokens.push_back(s);
+ return tokens;
+}
+
+//
+// gguf utils
+//
+
+static std::string gguf_data_to_str(enum gguf_type type, const void * data, int i) {
+ switch (type) {
+ case GGUF_TYPE_UINT8: return std::to_string(((const uint8_t *)data)[i]);
+ case GGUF_TYPE_INT8: return std::to_string(((const int8_t *)data)[i]);
+ case GGUF_TYPE_UINT16: return std::to_string(((const uint16_t *)data)[i]);
+ case GGUF_TYPE_INT16: return std::to_string(((const int16_t *)data)[i]);
+ case GGUF_TYPE_UINT32: return std::to_string(((const uint32_t *)data)[i]);
+ case GGUF_TYPE_INT32: return std::to_string(((const int32_t *)data)[i]);
+ case GGUF_TYPE_UINT64: return std::to_string(((const uint64_t *)data)[i]);
+ case GGUF_TYPE_INT64: return std::to_string(((const int64_t *)data)[i]);
+ case GGUF_TYPE_FLOAT32: return std::to_string(((const float *)data)[i]);
+ case GGUF_TYPE_FLOAT64: return std::to_string(((const double *)data)[i]);
+ case GGUF_TYPE_BOOL: return ((const bool *)data)[i] ? "true" : "false";
+ default: return string_format("unknown type %d", type);
+ }
+}
+
+static std::string gguf_kv_to_str(const struct gguf_context * ctx_gguf, int i) {
+ const enum gguf_type type = gguf_get_kv_type(ctx_gguf, i);
+
+ switch (type) {
+ case GGUF_TYPE_STRING:
+ return gguf_get_val_str(ctx_gguf, i);
+ case GGUF_TYPE_ARRAY:
+ {
+ const enum gguf_type arr_type = gguf_get_arr_type(ctx_gguf, i);
+ int arr_n = gguf_get_arr_n(ctx_gguf, i);
+ const void * data = arr_type == GGUF_TYPE_STRING ? nullptr : gguf_get_arr_data(ctx_gguf, i);
+ std::stringstream ss;
+ ss << "[";
+ for (int j = 0; j < arr_n; j++) {
+ if (arr_type == GGUF_TYPE_STRING) {
+ std::string val = gguf_get_arr_str(ctx_gguf, i, j);
+ // escape quotes
+ string_replace_all(val, "\\", "\\\\");
+ string_replace_all(val, "\"", "\\\"");
+ ss << '"' << val << '"';
+ } else if (arr_type == GGUF_TYPE_ARRAY) {
+ ss << "???";
+ } else {
+ ss << gguf_data_to_str(arr_type, data, j);
+ }
+ if (j < arr_n - 1) {
+ ss << ", ";
+ }
+ }
+ ss << "]";
+ return ss.str();
+ }
+ default:
+ return gguf_data_to_str(type, gguf_get_val_data(ctx_gguf, i), 0);
+ }
+}
+
+//
+// debugging
+//
+
+static void print_tensor_shape(ggml_tensor * t) {
+ printf("%s.shape = [", t->name);
+ for (int i = 0; i < ggml_n_dims(t); ++i) {
+ printf("%" PRId64, t->ne[i]);
+ if (i < ggml_n_dims(t) - 1) {
+ printf(", ");
+ }
+ }
+ printf("]\n");
+}
+
+static void print_tensor_data(ggml_tensor * t, uint8_t * data, int64_t n) {
+ ggml_type type = t->type;
+ int64_t * ne = t->ne;
+ size_t * nb = t->nb;
+ for (int64_t i3 = 0; i3 < ne[3]; i3++) {
+ printf("%s.data: [\n", t->name);
+ for (int64_t i2 = 0; i2 < ne[2]; i2++) {
+ if (i2 == n && ne[2] > 2*n) {
+ printf(" ..., \n");
+ i2 = ne[2] - n;
+ }
+ printf(" [\n");
+ for (int64_t i1 = 0; i1 < ne[1]; i1++) {
+ if (i1 == n && ne[1] > 2*n) {
+ printf(" ..., \n");
+ i1 = ne[1] - n;
+ }
+ printf(" [");
+ for (int64_t i0 = 0; i0 < ne[0]; i0++) {
+ if (i0 == n && ne[0] > 2*n) {
+ printf("..., ");
+ i0 = ne[0] - n;
+ }
+ size_t i = i3 * nb[3] + i2 * nb[2] + i1 * nb[1] + i0 * nb[0];
+ float v;
+ if (type == GGML_TYPE_F16) {
+ v = ggml_fp16_to_fp32(*(ggml_fp16_t *) &data[i]);
+ } else if (type == GGML_TYPE_F32) {
+ v = *(float *) &data[i];
+ } else if (type == GGML_TYPE_I32) {
+ v = (float) *(int32_t *) &data[i];
+ } else if (type == GGML_TYPE_I16) {
+ v = (float) *(int16_t *) &data[i];
+ } else if (type == GGML_TYPE_I8) {
+ v = (float) *(int8_t *) &data[i];
+ } else {
+ GGML_ABORT("fatal error");
+ }
+ printf("%8.4f", v);
+ if (i0 < ne[0] - 1) printf(", ");
+ }
+ printf("],\n");
+ }
+ printf(" ],\n");
+ }
+ printf(" ]\n");
+ }
+}
+
+void clip_debug_encode(clip_ctx * ctx, int h, int w, float fill_value);
+
+//
+// API used internally with mtmd
+//
+
+projector_type clip_get_projector_type(const struct clip_ctx * ctx);
diff --git a/llama.cpp/tools/mtmd/clip-model.h b/llama.cpp/tools/mtmd/clip-model.h
new file mode 100644
index 0000000..d4ff915
--- /dev/null
+++ b/llama.cpp/tools/mtmd/clip-model.h
@@ -0,0 +1,389 @@
+#pragma once
+
+#include "ggml.h"
+#include "clip.h"
+#include "clip-impl.h"
+
+#include <array>
+#include <vector>
+#include <unordered_set>
+#include <cstdint>
+#include <cmath>
+
+// activation used inside the encoder FFN blocks (see clip_graph::build_ffn)
+enum ffn_op_type {
+    FFN_GELU,
+    FFN_GELU_ERF,
+    FFN_SILU,
+    FFN_GELU_QUICK,
+};
+
+// normalization flavor used by build_norm: standard layernorm or RMS norm
+enum norm_type {
+    NORM_TYPE_NORMAL,
+    NORM_TYPE_RMS,
+};
+
+// how image patches are merged before projection (llava-style models)
+enum patch_merge_type {
+    PATCH_MERGE_FLAT,
+    PATCH_MERGE_SPATIAL_UNPAD,
+};
+
+// hyperparameters of the vision/audio encoder, populated from the GGUF metadata.
+// fix: image_crop_resolution, image_mean and image_std previously had no default
+// member initializers (every sibling scalar does) — reading them before they are
+// set was indeterminate; they are now zero-initialized for consistency.
+struct clip_hparams {
+    int32_t image_size = 0;
+    int32_t patch_size = 0;
+    int32_t n_embd = 0;
+    int32_t n_ff = 0;
+    int32_t projection_dim = 0;
+    int32_t n_head = 0;
+    int32_t n_layer = 0;
+    // idefics3
+    int32_t image_longest_edge = 0;
+    int32_t image_min_pixels = -1;
+    int32_t image_max_pixels = -1;
+    int32_t n_merge = 0; // number of patch merges **per-side**
+
+    // per-channel normalization applied during preprocessing
+    float image_mean[3] = {0.0f, 0.0f, 0.0f};
+    float image_std[3]  = {0.0f, 0.0f, 0.0f};
+
+    // for models using dynamic image size, we need to have a smaller image size to warmup
+    // otherwise, user will get OOM everytime they load the model
+    int32_t warmup_image_size = 0;
+    int32_t warmup_audio_size = 3000;
+
+    ffn_op_type ffn_op = FFN_GELU;
+
+    patch_merge_type mm_patch_merge_type = PATCH_MERGE_FLAT;
+
+    float eps = 1e-6;
+    float rope_theta = 0.0;
+
+    std::vector<clip_image_size> image_res_candidates; // for llava-uhd style models
+    int32_t image_crop_resolution = 0;
+    std::unordered_set<int32_t> vision_feature_layer;
+    int32_t attn_window_size = 0;
+    int32_t n_wa_pattern = 0;
+    std::unordered_set<int32_t> wa_layer_indexes; // explicit layer indexes that use full attention (for irregular patterns like YoutuVL)
+
+    // audio
+    int32_t n_mel_bins = 0; // whisper preprocessor
+    int32_t proj_stack_factor = 0; // ultravox
+
+    // audio-to-mel preprocessor params
+    int32_t audio_chunk_len = -1; // in seconds
+    int32_t audio_sample_rate = -1;
+    int32_t audio_n_fft = -1;
+    int32_t audio_window_len = -1;
+    int32_t audio_hop_len = -1;
+
+    // legacy
+    bool has_llava_projector = false;
+    int minicpmv_version = 0;
+    int32_t minicpmv_query_num = 0; // MiniCPM-V query number
+
+    // custom value provided by user, can be undefined if not set
+    int32_t custom_image_min_tokens = -1;
+    int32_t custom_image_max_tokens = -1;
+
+    // derive image_min_pixels/image_max_pixels from a token budget;
+    // user-provided custom_image_*_tokens take precedence over the defaults
+    void set_limit_image_tokens(int n_tokens_min, int n_tokens_max) {
+        const int cur_merge = n_merge == 0 ? 1 : n_merge;
+        // pixels covered by one output token (patch area, scaled by patch merging)
+        const int patch_area = patch_size * patch_size * cur_merge * cur_merge;
+        image_min_pixels = (custom_image_min_tokens > 0 ? custom_image_min_tokens : n_tokens_min) * patch_area;
+        image_max_pixels = (custom_image_max_tokens > 0 ? custom_image_max_tokens : n_tokens_max) * patch_area;
+        warmup_image_size = static_cast<int>(std::sqrt(image_max_pixels));
+    }
+
+    // set the warmup image size from a square token count (n_tokens must be n*n)
+    void set_warmup_n_tokens(int n_tokens) {
+        int n_tok_per_side = static_cast<int>(std::sqrt(n_tokens));
+        GGML_ASSERT(n_tok_per_side * n_tok_per_side == n_tokens && "n_tokens must be n*n");
+        const int cur_merge = n_merge == 0 ? 1 : n_merge;
+        warmup_image_size = n_tok_per_side * patch_size * cur_merge;
+        // TODO: support warmup size for custom token numbers
+    }
+};
+
+// weights of one encoder layer; tensors a given model does not use stay nullptr,
+// and the graph builders branch on null-ness (e.g. fused vs. separate qkv)
+struct clip_layer {
+    // attention
+    ggml_tensor * k_w = nullptr;
+    ggml_tensor * k_b = nullptr;
+    ggml_tensor * q_w = nullptr;
+    ggml_tensor * q_b = nullptr;
+    ggml_tensor * v_w = nullptr;
+    ggml_tensor * v_b = nullptr;
+    // fused qkv projection (mutually exclusive with the separate q/k/v above)
+    ggml_tensor * qkv_w = nullptr;
+    ggml_tensor * qkv_b = nullptr;
+
+    ggml_tensor * o_w = nullptr;
+    ggml_tensor * o_b = nullptr;
+
+    ggml_tensor * k_norm = nullptr;
+    ggml_tensor * q_norm = nullptr;
+
+    // layernorm 1
+    ggml_tensor * ln_1_w = nullptr;
+    ggml_tensor * ln_1_b = nullptr;
+
+    ggml_tensor * ff_up_w = nullptr;
+    ggml_tensor * ff_up_b = nullptr;
+    ggml_tensor * ff_gate_w = nullptr;
+    ggml_tensor * ff_gate_b = nullptr;
+    ggml_tensor * ff_down_w = nullptr;
+    ggml_tensor * ff_down_b = nullptr;
+
+    // layernorm 2
+    ggml_tensor * ln_2_w = nullptr;
+    ggml_tensor * ln_2_b = nullptr;
+
+    // layer scale (no bias)
+    ggml_tensor * ls_1_w = nullptr;
+    ggml_tensor * ls_2_w = nullptr;
+
+    // qwen3vl deepstack merger
+    ggml_tensor * deepstack_norm_w = nullptr;
+    ggml_tensor * deepstack_norm_b = nullptr;
+    ggml_tensor * deepstack_fc1_w = nullptr;
+    ggml_tensor * deepstack_fc1_b = nullptr;
+    ggml_tensor * deepstack_fc2_w = nullptr;
+    ggml_tensor * deepstack_fc2_b = nullptr;
+
+    // lfm2
+    ggml_tensor * ff_norm_w = nullptr;
+    ggml_tensor * ff_norm_b = nullptr;
+    ggml_tensor * ff_norm_1_w = nullptr;
+    ggml_tensor * ff_norm_1_b = nullptr;
+    ggml_tensor * ff_up_1_w = nullptr;
+    ggml_tensor * ff_up_1_b = nullptr;
+    ggml_tensor * ff_down_1_w = nullptr;
+    ggml_tensor * ff_down_1_b = nullptr;
+    ggml_tensor * pos_bias_u = nullptr;
+    ggml_tensor * pos_bias_v = nullptr;
+    ggml_tensor * norm_conv_w = nullptr;
+    ggml_tensor * norm_conv_b = nullptr;
+    ggml_tensor * linear_pos_w = nullptr;
+
+    ggml_tensor * conv_norm_w = nullptr;
+    ggml_tensor * conv_norm_b = nullptr;
+    ggml_tensor * conv_dw_w = nullptr;
+    ggml_tensor * conv_dw_b = nullptr;
+    ggml_tensor * conv_pw1_w = nullptr;
+    ggml_tensor * conv_pw1_b = nullptr;
+    ggml_tensor * conv_pw2_w = nullptr;
+    ggml_tensor * conv_pw2_b = nullptr;
+
+    // a layer participates in the deepstack path iff its merger fc1 was loaded
+    bool has_deepstack() const {
+        return deepstack_fc1_w != nullptr;
+    }
+};
+
+// Expanded MobileNetV5 block structure for Gemma3n vision encoder
+// unused tensors stay nullptr; which group is populated depends on the stage
+struct mobilenetv5_block {
+    // Stage 0 (Edge Residual)
+    ggml_tensor * s0_conv_exp_w = nullptr;
+    ggml_tensor * s0_bn1_w = nullptr;
+    ggml_tensor * s0_conv_pwl_w = nullptr;
+    ggml_tensor * s0_bn2_w = nullptr;
+
+    // Stage 1+ (Universal Inverted Residual)
+    ggml_tensor * dw_start_w = nullptr;
+    ggml_tensor * dw_start_bn_w = nullptr;
+
+    ggml_tensor * pw_exp_w = nullptr;
+    ggml_tensor * pw_exp_bn_w = nullptr;
+
+    ggml_tensor * dw_mid_w = nullptr;
+    ggml_tensor * dw_mid_bn_w = nullptr;
+
+    ggml_tensor * pw_proj_w = nullptr;
+    ggml_tensor * pw_proj_bn_w = nullptr;
+
+    ggml_tensor * layer_scale_w = nullptr;
+
+    // Attention (MQA) components
+    ggml_tensor * attn_q_w = nullptr;
+    ggml_tensor * attn_k_w = nullptr;
+    ggml_tensor * attn_v_w = nullptr;
+    ggml_tensor * attn_o_w = nullptr;
+
+    // Optional downsampling/norm in attention
+    ggml_tensor * attn_k_dw_w = nullptr;
+    ggml_tensor * attn_k_norm_w = nullptr;
+    ggml_tensor * attn_v_dw_w = nullptr;
+    ggml_tensor * attn_v_norm_w = nullptr;
+
+    // Block norm (often present in attention blocks)
+    ggml_tensor * attn_norm_w = nullptr;
+};
+
+// all weights of one loaded mmproj model; per-projector tensor groups that a
+// given model does not use stay nullptr.
+// fix: post_ln_w/b, projection and mm_fc_w/b previously had NO default member
+// initializer while every sibling pointer does — reading them before load was
+// undefined behavior; they are now nullptr-initialized like the rest.
+struct clip_model {
+    clip_modality modality = CLIP_MODALITY_VISION;
+    projector_type proj_type = PROJECTOR_TYPE_MLP;
+    clip_hparams hparams;
+
+    // embeddings
+    ggml_tensor * class_embedding = nullptr;
+    ggml_tensor * patch_embeddings_0 = nullptr;
+    ggml_tensor * patch_embeddings_1 = nullptr; // second Conv2D kernel when we decouple Conv3D along temproal dimension (Qwen2VL)
+    ggml_tensor * patch_bias = nullptr;
+    ggml_tensor * position_embeddings = nullptr;
+    ggml_tensor * norm_embd_w = nullptr;
+    ggml_tensor * norm_embd_b = nullptr;
+
+    ggml_tensor * pre_ln_w = nullptr;
+    ggml_tensor * pre_ln_b = nullptr;
+
+    std::vector<clip_layer> layers;
+
+    int32_t n_deepstack_layers = 0; // used by Qwen3-VL, calculated from clip_layer
+
+    ggml_tensor * post_ln_w = nullptr;
+    ggml_tensor * post_ln_b = nullptr;
+
+    ggml_tensor * projection = nullptr; // TODO: rename it to fc (fully connected layer)
+    ggml_tensor * mm_fc_w = nullptr;
+    ggml_tensor * mm_fc_b = nullptr;
+    ggml_tensor * mm_ffn_up_w = nullptr;
+    ggml_tensor * mm_ffn_up_b = nullptr;
+    ggml_tensor * mm_ffn_gate_w = nullptr;
+    ggml_tensor * mm_ffn_gate_b = nullptr;
+    ggml_tensor * mm_ffn_down_w = nullptr;
+    ggml_tensor * mm_ffn_down_b = nullptr;
+    ggml_tensor * mm_post_norm_w = nullptr;
+    ggml_tensor * mm_post_norm_b = nullptr;
+
+    // LLaVA projection
+    ggml_tensor * mm_input_norm_w = nullptr;
+    ggml_tensor * mm_input_norm_b = nullptr;
+    ggml_tensor * mm_0_w = nullptr;
+    ggml_tensor * mm_0_b = nullptr;
+    ggml_tensor * mm_2_w = nullptr;
+    ggml_tensor * mm_2_b = nullptr;
+
+    ggml_tensor * image_newline = nullptr;
+
+    // Yi type models with mlp+normalization projection
+    ggml_tensor * mm_1_w = nullptr; // Yi type models have 0, 1, 3, 4
+    ggml_tensor * mm_1_b = nullptr;
+    ggml_tensor * mm_3_w = nullptr;
+    ggml_tensor * mm_3_b = nullptr;
+    ggml_tensor * mm_4_w = nullptr;
+    ggml_tensor * mm_4_b = nullptr;
+
+    // GLMV-Edge projection
+    ggml_tensor * mm_model_adapter_conv_w = nullptr;
+    ggml_tensor * mm_model_adapter_conv_b = nullptr;
+
+    // MobileVLM projection
+    ggml_tensor * mm_model_mlp_1_w = nullptr;
+    ggml_tensor * mm_model_mlp_1_b = nullptr;
+    ggml_tensor * mm_model_mlp_3_w = nullptr;
+    ggml_tensor * mm_model_mlp_3_b = nullptr;
+    ggml_tensor * mm_model_block_1_block_0_0_w = nullptr;
+    ggml_tensor * mm_model_block_1_block_0_1_w = nullptr;
+    ggml_tensor * mm_model_block_1_block_0_1_b = nullptr;
+    ggml_tensor * mm_model_block_1_block_1_fc1_w = nullptr;
+    ggml_tensor * mm_model_block_1_block_1_fc1_b = nullptr;
+    ggml_tensor * mm_model_block_1_block_1_fc2_w = nullptr;
+    ggml_tensor * mm_model_block_1_block_1_fc2_b = nullptr;
+    ggml_tensor * mm_model_block_1_block_2_0_w = nullptr;
+    ggml_tensor * mm_model_block_1_block_2_1_w = nullptr;
+    ggml_tensor * mm_model_block_1_block_2_1_b = nullptr;
+    ggml_tensor * mm_model_block_2_block_0_0_w = nullptr;
+    ggml_tensor * mm_model_block_2_block_0_1_w = nullptr;
+    ggml_tensor * mm_model_block_2_block_0_1_b = nullptr;
+    ggml_tensor * mm_model_block_2_block_1_fc1_w = nullptr;
+    ggml_tensor * mm_model_block_2_block_1_fc1_b = nullptr;
+    ggml_tensor * mm_model_block_2_block_1_fc2_w = nullptr;
+    ggml_tensor * mm_model_block_2_block_1_fc2_b = nullptr;
+    ggml_tensor * mm_model_block_2_block_2_0_w = nullptr;
+    ggml_tensor * mm_model_block_2_block_2_1_w = nullptr;
+    ggml_tensor * mm_model_block_2_block_2_1_b = nullptr;
+
+    // MobileVLM_V2 projection
+    ggml_tensor * mm_model_mlp_0_w = nullptr;
+    ggml_tensor * mm_model_mlp_0_b = nullptr;
+    ggml_tensor * mm_model_mlp_2_w = nullptr;
+    ggml_tensor * mm_model_mlp_2_b = nullptr;
+    ggml_tensor * mm_model_peg_0_w = nullptr;
+    ggml_tensor * mm_model_peg_0_b = nullptr;
+
+    // MINICPMV projection
+    ggml_tensor * mm_model_pos_embed_k = nullptr;
+    ggml_tensor * mm_model_query = nullptr;
+    ggml_tensor * mm_model_proj = nullptr;
+    ggml_tensor * mm_model_kv_proj = nullptr;
+    ggml_tensor * mm_model_attn_q_w = nullptr;
+    ggml_tensor * mm_model_attn_q_b = nullptr;
+    ggml_tensor * mm_model_attn_k_w = nullptr;
+    ggml_tensor * mm_model_attn_k_b = nullptr;
+    ggml_tensor * mm_model_attn_v_w = nullptr;
+    ggml_tensor * mm_model_attn_v_b = nullptr;
+    ggml_tensor * mm_model_attn_o_w = nullptr;
+    ggml_tensor * mm_model_attn_o_b = nullptr;
+    ggml_tensor * mm_model_ln_q_w = nullptr;
+    ggml_tensor * mm_model_ln_q_b = nullptr;
+    ggml_tensor * mm_model_ln_kv_w = nullptr;
+    ggml_tensor * mm_model_ln_kv_b = nullptr;
+    ggml_tensor * mm_model_ln_post_w = nullptr;
+    ggml_tensor * mm_model_ln_post_b = nullptr;
+
+    // gemma3
+    ggml_tensor * mm_input_proj_w = nullptr;
+    ggml_tensor * mm_soft_emb_norm_w = nullptr;
+
+    // mobilenetv5 for gemma3n
+    std::vector<mobilenetv5_block> mobilenet_blocks;
+    std::vector<int> mobilenet_stage_ends;
+    ggml_tensor * mobilenet_stem_conv_w = nullptr;
+    ggml_tensor * mobilenet_stem_conv_b = nullptr;
+    ggml_tensor * mobilenet_stem_norm_w = nullptr;
+    ggml_tensor * mm_post_proj_norm_w = nullptr;
+
+    // Multi-Scale Fusion Adapter (MSFA) components
+    ggml_tensor * msfa_concat_conv_w = nullptr;
+    ggml_tensor * msfa_concat_norm_w = nullptr;
+    ggml_tensor * msfa_ffn_expand_w = nullptr;
+    ggml_tensor * msfa_ffn_project_w = nullptr;
+    ggml_tensor * msfa_ffn_expand_bn = nullptr;
+    ggml_tensor * msfa_ffn_project_bn = nullptr;
+
+
+    // pixtral, glm4v
+    ggml_tensor * token_embd_img_break = nullptr;
+    ggml_tensor * mm_patch_merger_w = nullptr;
+    ggml_tensor * mm_patch_merger_b = nullptr;
+
+    // ultravox / whisper encoder
+    ggml_tensor * conv1d_1_w = nullptr;
+    ggml_tensor * conv1d_1_b = nullptr;
+    ggml_tensor * conv1d_2_w = nullptr;
+    ggml_tensor * conv1d_2_b = nullptr;
+    ggml_tensor * mm_norm_pre_w = nullptr;
+    ggml_tensor * mm_norm_pre_b = nullptr;
+    ggml_tensor * mm_norm_mid_w = nullptr;
+
+    // cogvlm
+    ggml_tensor * mm_post_fc_norm_w = nullptr;
+    ggml_tensor * mm_post_fc_norm_b = nullptr;
+    ggml_tensor * mm_h_to_4h_w = nullptr;
+    ggml_tensor * mm_gate_w = nullptr;
+    ggml_tensor * mm_4h_to_h_w = nullptr;
+    ggml_tensor * mm_boi = nullptr;
+    ggml_tensor * mm_eoi = nullptr;
+
+    // lfm2 audio
+    std::array<ggml_tensor *, 7> pre_encode_conv_X_w = {nullptr};
+    std::array<ggml_tensor *, 7> pre_encode_conv_X_b = {nullptr};
+    ggml_tensor * pre_encode_out_w = nullptr;
+    ggml_tensor * pre_encode_out_b = nullptr;
+
+    // whether the audio encoder output goes through a stride-2 average pool
+    bool audio_has_avgpool() const {
+        return proj_type == PROJECTOR_TYPE_QWEN2A
+            || proj_type == PROJECTOR_TYPE_VOXTRAL
+            || proj_type == PROJECTOR_TYPE_MUSIC_FLAMINGO;
+    }
+
+    // whether audio frames are stacked (ultravox StackAudioFrames style)
+    bool audio_has_stack_frames() const {
+        return proj_type == PROJECTOR_TYPE_ULTRAVOX
+            || proj_type == PROJECTOR_TYPE_VOXTRAL;
+    }
+};
+
+// accessor for the loaded hyperparameters of a clip context
+const clip_hparams * clip_get_hparams(const struct clip_ctx * ctx);
diff --git a/llama.cpp/tools/mtmd/clip.cpp b/llama.cpp/tools/mtmd/clip.cpp
new file mode 100644
index 0000000..eeccb4c
--- /dev/null
+++ b/llama.cpp/tools/mtmd/clip.cpp
@@ -0,0 +1,4080 @@
+#include "clip.h"
+#include "clip-impl.h"
+#include "clip-model.h"
+#include "clip-graph.h"
+#include "models/models.h"
+
+#include "ggml.h"
+#include "ggml-cpp.h"
+#include "ggml-alloc.h"
+#include "ggml-backend.h"
+#include "gguf.h"
+
+#include <algorithm>
+#include <cassert>
+#include <cmath>
+#include <cstdlib>
+#include <cstring>
+#include <fstream>
+#include <map>
+#include <stdexcept>
+#include <unordered_set>
+#include <vector>
+#include <cinttypes>
+#include <limits>
+#include <array>
+#include <functional>
+
+// global logger state shared by all clip contexts: {callback, user data}
+struct clip_logger_state g_logger_state = {clip_log_callback_default, NULL};
+
+// uncomment to enable the PPM/BMP image-dump helpers below
+//#define CLIP_DEBUG_FUNCTIONS
+
+#ifdef CLIP_DEBUG_FUNCTIONS
+// dump an RGB image as a binary PPM (P6) file; logs and returns on open failure
+static void clip_image_write_image_to_ppm(const clip_image_u8& img, const std::string& filename) {
+    std::ofstream out(filename, std::ios::binary);
+    if (!out.is_open()) {
+        LOG_ERR("Failed to open file for writing: %s\n", filename.c_str());
+        return;
+    }
+
+    // P6 header: magic, dimensions, max channel value
+    out << "P6\n" << img.nx << " " << img.ny << "\n255\n";
+
+    // the buffer is already packed RGB, which is exactly what P6 expects
+    for (size_t i = 0; i < img.buf.size(); i += 3) {
+        out.write(reinterpret_cast<const char*>(&img.buf[i]), 3);
+    }
+
+    out.close();
+}
+
+// dump an RGB image as an uncompressed 24-bit BMP; rows are written
+// bottom-to-top in BGR order and padded to a 4-byte boundary
+static void clip_image_save_to_bmp(const clip_image_u8& img, const std::string& filename) {
+    std::ofstream file(filename, std::ios::binary);
+    if (!file.is_open()) {
+        LOG_ERR("Failed to open file for writing: %s\n", filename.c_str());
+        return;
+    }
+
+    int fileSize = 54 + 3 * img.nx * img.ny; // File header + info header + pixel data
+    int bytesPerPixel = 3;
+    int widthInBytes = img.nx * bytesPerPixel;
+    int paddingAmount = (4 - (widthInBytes % 4)) % 4;
+    int stride = widthInBytes + paddingAmount;
+
+    // Bitmap file header
+    unsigned char fileHeader[14] = {
+        'B','M',      // Signature
+        0,0,0,0,      // Image file size in bytes
+        0,0,0,0,      // Reserved
+        54,0,0,0      // Start of pixel array
+    };
+
+    // Total file size (recomputed including row padding, little-endian)
+    fileSize = 54 + (stride * img.ny);
+    fileHeader[2] = (unsigned char)(fileSize);
+    fileHeader[3] = (unsigned char)(fileSize >> 8);
+    fileHeader[4] = (unsigned char)(fileSize >> 16);
+    fileHeader[5] = (unsigned char)(fileSize >> 24);
+
+    // Bitmap information header (BITMAPINFOHEADER)
+    unsigned char infoHeader[40] = {
+        40,0,0,0,     // Size of this header (40 bytes)
+        0,0,0,0,      // Image width
+        0,0,0,0,      // Image height
+        1,0,          // Number of color planes
+        24,0,         // Bits per pixel
+        0,0,0,0,      // No compression
+        0,0,0,0,      // Image size (can be 0 for no compression)
+        0,0,0,0,      // X pixels per meter (not specified)
+        0,0,0,0,      // Y pixels per meter (not specified)
+        0,0,0,0,      // Total colors (color table not used)
+        0,0,0,0       // Important colors (all are important)
+    };
+
+    // Width and height in the information header (little-endian int32)
+    infoHeader[4] = (unsigned char)(img.nx);
+    infoHeader[5] = (unsigned char)(img.nx >> 8);
+    infoHeader[6] = (unsigned char)(img.nx >> 16);
+    infoHeader[7] = (unsigned char)(img.nx >> 24);
+    infoHeader[8] = (unsigned char)(img.ny);
+    infoHeader[9] = (unsigned char)(img.ny >> 8);
+    infoHeader[10] = (unsigned char)(img.ny >> 16);
+    infoHeader[11] = (unsigned char)(img.ny >> 24);
+
+    // Write file headers
+    file.write(reinterpret_cast<char*>(fileHeader), sizeof(fileHeader));
+    file.write(reinterpret_cast<char*>(infoHeader), sizeof(infoHeader));
+
+    // Pixel data
+    std::vector<unsigned char> padding(3, 0); // Max padding size to be added to each row
+    for (int y = img.ny - 1; y >= 0; --y) { // BMP files are stored bottom-to-top
+        for (int x = 0; x < img.nx; ++x) {
+            // Each pixel
+            size_t pixelIndex = (y * img.nx + x) * 3;
+            unsigned char pixel[3] = {
+                img.buf[pixelIndex + 2], // BMP stores pixels in BGR format
+                img.buf[pixelIndex + 1],
+                img.buf[pixelIndex]
+            };
+            file.write(reinterpret_cast<char*>(pixel), 3);
+        }
+        // Write padding for the row
+        file.write(reinterpret_cast<char*>(padding.data()), paddingAmount);
+    }
+
+    file.close();
+}
+
+// debug function to convert f32 to u8
+// maps each float channel value v to round-toward-zero(v * 255), clamped to [0, 255]
+static void clip_image_convert_f32_to_u8(const clip_image_f32& src, clip_image_u8& dst) {
+    dst.nx = src.nx;
+    dst.ny = src.ny;
+    dst.buf.resize(3 * src.nx * src.ny);
+    for (size_t i = 0; i < src.buf.size(); ++i) {
+        const int scaled = static_cast<int>(src.buf[i] * 255.0f);
+        dst.buf[i] = static_cast<uint8_t>(std::clamp(scaled, 0, 255));
+    }
+}
+#endif
+
+
+// per-model inference context: loaded weights, backends and scheduler state
+struct clip_ctx {
+    clip_model model;
+
+    gguf_context_ptr ctx_gguf;
+    ggml_context_ptr ctx_data;
+
+    // scratch memory used as the no_alloc ggml context for graph building
+    std::vector<uint8_t> buf_compute_meta;
+
+    // backends/buffer types passed to the scheduler (GPU first if present, CPU last)
+    std::vector<ggml_backend_t> backend_ptrs;
+    std::vector<ggml_backend_buffer_type_t> backend_buft;
+
+    ggml_backend_t backend = nullptr;       // primary backend; aliases backend_cpu when no GPU
+    ggml_backend_t backend_cpu = nullptr;   // always initialized
+    ggml_backend_buffer_ptr buf;
+
+
+    int max_nodes = 8192;
+    ggml_backend_sched_ptr sched;
+    clip_flash_attn_type flash_attn_type = CLIP_FLASH_ATTN_TYPE_AUTO;
+    bool is_allocated = false;
+
+    // initializes CPU (and optionally GPU) backends and the backend scheduler;
+    // throws std::runtime_error if even the CPU backend cannot be created
+    clip_ctx(clip_context_params & ctx_params) {
+        flash_attn_type = ctx_params.flash_attn_type;
+        backend_cpu = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr);
+        if (!backend_cpu) {
+            throw std::runtime_error("failed to initialize CPU backend");
+        }
+        if (ctx_params.use_gpu) {
+            // MTMD_BACKEND_DEVICE env var can force a specific backend by name
+            auto backend_name = std::getenv("MTMD_BACKEND_DEVICE");
+            if (backend_name != nullptr) {
+                backend = ggml_backend_init_by_name(backend_name, nullptr);
+                if (!backend) {
+                    LOG_WRN("%s: Warning: Failed to initialize \"%s\" backend, falling back to default GPU backend\n", __func__, backend_name);
+                }
+            }
+            if (!backend) {
+                // try discrete GPU first, then integrated GPU
+                backend = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_GPU, nullptr);
+                backend = backend ? backend : ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_IGPU, nullptr);
+            }
+        }
+
+        if (backend) {
+            LOG_INF("%s: CLIP using %s backend\n", __func__, ggml_backend_name(backend));
+            backend_ptrs.push_back(backend);
+            backend_buft.push_back(ggml_backend_get_default_buffer_type(backend));
+        } else {
+            // no GPU available/requested: primary backend aliases the CPU backend
+            backend = backend_cpu;
+            LOG_INF("%s: CLIP using CPU backend\n", __func__);
+        }
+
+        // user-provided token limits override the model defaults (see clip_hparams)
+        if (ctx_params.image_min_tokens > 0) {
+            model.hparams.custom_image_min_tokens = ctx_params.image_min_tokens;
+        }
+        if (ctx_params.image_max_tokens > 0) {
+            model.hparams.custom_image_max_tokens = ctx_params.image_max_tokens;
+        }
+
+        backend_ptrs.push_back(backend_cpu);
+        backend_buft.push_back(ggml_backend_get_default_buffer_type(backend_cpu));
+
+        sched.reset(
+            ggml_backend_sched_new(backend_ptrs.data(), backend_buft.data(), backend_ptrs.size(), 8192, false, true)
+        );
+
+        if (ctx_params.cb_eval != nullptr) {
+            ggml_backend_sched_set_eval_callback(sched.get(), ctx_params.cb_eval, ctx_params.cb_eval_user_data);
+        }
+    }
+
+    // frees the primary backend, and the CPU backend only when it is distinct
+    // (they alias each other in the CPU-only case — avoid a double free)
+    ~clip_ctx() {
+        ggml_backend_free(backend);
+        if (backend != backend_cpu) {
+            ggml_backend_free(backend_cpu);
+        }
+    }
+
+    // this function is added so that we don't change too much of the existing code
+    projector_type proj_type() const {
+        return model.proj_type;
+    }
+};
+
+//
+// clip_graph
+//
+
+// caches per-image geometry (patch grid, head dims) and creates a no_alloc
+// ggml context over ctx->buf_compute_meta plus an empty graph of max_nodes
+clip_graph::clip_graph(clip_ctx * ctx, const clip_image_f32 & img) :
+        model(ctx->model),
+        hparams(model.hparams),
+        proj_type(ctx->proj_type()),
+        img(img),
+        patch_size(hparams.patch_size),
+        n_patches_x(img.nx / patch_size),
+        n_patches_y(img.ny / patch_size),
+        n_patches(n_patches_x * n_patches_y),
+        n_embd(hparams.n_embd),
+        n_head(hparams.n_head),
+        d_head(n_embd / n_head),
+        n_layer(hparams.n_layer),
+        n_mmproj_embd(clip_n_mmproj_embd(ctx)),
+        eps(hparams.eps),
+        kq_scale(1.0f / sqrtf((float)d_head)),
+        flash_attn_type(ctx->flash_attn_type) {
+    // no_alloc: tensors created in ctx0 are metadata only; the scheduler
+    // allocates the actual buffers later
+    struct ggml_init_params params = {
+        /*.mem_size   =*/ ctx->buf_compute_meta.size(),
+        /*.mem_buffer =*/ ctx->buf_compute_meta.data(),
+        /*.no_alloc   =*/ true,
+    };
+    ctx0_ptr.reset(ggml_init(params));
+    ctx0 = ctx0_ptr.get();
+    gf = ggml_new_graph_custom(ctx0, ctx->max_nodes, false);
+}
+
+// debug-name a graph tensor; layer-scoped tensors (il >= 0) get a "-<il>" suffix
+void clip_graph::cb(ggml_tensor * cur, const char * name, int il) const {
+    if (il < 0) {
+        ggml_set_name(cur, name);
+    } else {
+        ggml_format_name(cur, "%s-%d", name, il);
+    }
+}
+
+// siglip2 naflex
+// resize the learned (square-grid) positional embeddings to the current image's
+// patch grid via 2D interpolation; returns the original tensor when the grid
+// already matches.
+// fix: GGML_ASSERT(pos_embd) previously ran AFTER pos_embd->ne[1] was
+// dereferenced, making the guard useless — it now runs first.
+ggml_tensor * clip_graph::resize_position_embeddings(uint32_t interpolation_mode) {
+    ggml_tensor * pos_embd = model.position_embeddings;
+    GGML_ASSERT(pos_embd); // must be checked before dereferencing ne[] below
+
+    const int height = img.ny / patch_size;
+    const int width  = img.nx / patch_size;
+    const uint32_t mode = interpolation_mode;
+    // embeddings are stored for a square grid of n_per_side x n_per_side patches
+    const int n_per_side = (int)std::sqrt(pos_embd->ne[1]);
+
+    if (height == n_per_side && width == n_per_side) {
+        return pos_embd; // grid already matches, no resize needed
+    }
+
+    pos_embd = ggml_reshape_3d(ctx0, pos_embd, n_embd, n_per_side, n_per_side);  // -> (n_embd, n_per_side, n_per_side)
+    pos_embd = ggml_permute(ctx0, pos_embd, 2, 0, 1, 3);                         // -> (n_per_side, n_per_side, n_embd)
+    pos_embd = ggml_interpolate(ctx0, pos_embd, width, height, n_embd, 1, mode); // -> (width, height, n_embd)
+    pos_embd = ggml_permute(ctx0, pos_embd, 1, 2, 0, 3);                         // -> (n_embd, width, height)
+    pos_embd = ggml_cont_2d(ctx0, pos_embd, n_embd, width * height);             // -> (n_embd, width * height)
+
+    return pos_embd;
+}
+
+// build vision transformer (ViT) cgraph
+// this function should cover most of the models
+// if your model has specific features, you should probably duplicate this function
+//
+// inp:              patch embeddings [n_embd, n_pos]
+// learned_pos_embd: optional absolute positional embeddings added to inp
+// add_pos:          optional per-layer callback applying RoPE-style positions to Q/K
+ggml_tensor * clip_graph::build_vit(
+        ggml_tensor * inp,
+        int64_t n_pos,
+        norm_type norm_t,
+        ffn_op_type ffn_t,
+        ggml_tensor * learned_pos_embd,
+        std::function<ggml_tensor *(ggml_tensor *, const clip_layer &)> add_pos
+        ) {
+    if (learned_pos_embd) {
+        inp = ggml_add(ctx0, inp, learned_pos_embd);
+        cb(inp, "pos_embed", -1);
+    }
+
+    ggml_tensor * inpL = inp;
+
+    // pre-layernorm
+    if (model.pre_ln_w) {
+        inpL = build_norm(inpL, model.pre_ln_w, model.pre_ln_b, norm_t, eps, -1);
+        cb(inpL, "pre_ln", -1);
+    }
+
+    // loop over layers
+    for (int il = 0; il < n_layer; il++) {
+        auto & layer = model.layers[il];
+        ggml_tensor * cur = inpL; // inpL = residual, cur = hidden_states
+
+        // layernorm1
+        cur = build_norm(cur, layer.ln_1_w, layer.ln_1_b, norm_t, eps, il);
+        cb(cur, "layer_inp_normed", il);
+
+        // self-attention
+        {
+            ggml_tensor * Qcur = nullptr;
+            ggml_tensor * Kcur = nullptr;
+            ggml_tensor * Vcur = nullptr;
+            if (layer.qkv_w != nullptr) {
+                // fused qkv
+                cur = ggml_mul_mat(ctx0, layer.qkv_w, cur);
+                if (layer.qkv_b != nullptr) {
+                    cur = ggml_add(ctx0, cur, layer.qkv_b);
+                }
+
+                // slice the fused projection into Q/K/V views at offsets
+                // 0, n_embd and 2*n_embd along the row
+                Qcur = ggml_view_3d(ctx0, cur, d_head, n_head, n_pos,
+                    /* nb1    */ ggml_row_size(cur->type, d_head),
+                    /* nb2    */ cur->nb[1],
+                    /* offset */ 0);
+
+                Kcur = ggml_view_3d(ctx0, cur, d_head, n_head, n_pos,
+                    /* nb1    */ ggml_row_size(cur->type, d_head),
+                    /* nb2    */ cur->nb[1],
+                    /* offset */ ggml_row_size(cur->type, n_embd));
+
+                Vcur = ggml_view_3d(ctx0, cur, d_head, n_head, n_pos,
+                    /* nb1    */ ggml_row_size(cur->type, d_head),
+                    /* nb2    */ cur->nb[1],
+                    /* offset */ ggml_row_size(cur->type, 2 * n_embd));
+
+                // TODO: q/k norm requires row size == n_embd, while here it's d_head
+                // we can add support in the future if needed
+                GGML_ASSERT(layer.q_norm == nullptr && layer.k_norm == nullptr);
+
+            } else {
+                // separate q, k, v
+                Qcur = ggml_mul_mat(ctx0, layer.q_w, cur);
+                if (layer.q_b) {
+                    Qcur = ggml_add(ctx0, Qcur, layer.q_b);
+                }
+
+                Kcur = ggml_mul_mat(ctx0, layer.k_w, cur);
+                if (layer.k_b) {
+                    Kcur = ggml_add(ctx0, Kcur, layer.k_b);
+                }
+
+                Vcur = ggml_mul_mat(ctx0, layer.v_w, cur);
+                if (layer.v_b) {
+                    Vcur = ggml_add(ctx0, Vcur, layer.v_b);
+                }
+
+                // optional q/k norm, applied before splitting into heads
+                if (layer.q_norm) {
+                    Qcur = build_norm(Qcur, layer.q_norm, NULL, norm_t, eps, il);
+                    cb(Qcur, "Qcur_norm", il);
+                }
+
+                if (layer.k_norm) {
+                    Kcur = build_norm(Kcur, layer.k_norm, NULL, norm_t, eps, il);
+                    cb(Kcur, "Kcur_norm", il);
+                }
+
+                Qcur = ggml_reshape_3d(ctx0, Qcur, d_head, n_head, n_pos);
+                Kcur = ggml_reshape_3d(ctx0, Kcur, d_head, n_head, n_pos);
+                Vcur = ggml_reshape_3d(ctx0, Vcur, d_head, n_head, n_pos);
+            }
+
+            cb(Qcur, "Qcur", il);
+            cb(Kcur, "Kcur", il);
+            cb(Vcur, "Vcur", il);
+
+            // positional rotation (e.g. 2D RoPE) applied to Q and K only
+            if (add_pos) {
+                Qcur = add_pos(Qcur, layer);
+                Kcur = add_pos(Kcur, layer);
+                cb(Qcur, "Qcur_pos", il);
+                cb(Kcur, "Kcur_pos", il);
+            }
+
+            cur = build_attn(layer.o_w, layer.o_b,
+                Qcur, Kcur, Vcur, nullptr, kq_scale, il);
+            cb(cur, "attn_out", il);
+        }
+
+        // optional layer scale on the attention branch
+        if (layer.ls_1_w) {
+            cur = ggml_mul(ctx0, cur, layer.ls_1_w);
+            cb(cur, "attn_out_scaled", il);
+        }
+
+        // re-add the layer input, e.g., residual
+        cur = ggml_add(ctx0, cur, inpL);
+
+        inpL = cur; // inpL = residual, cur = hidden_states
+
+        cb(cur, "ffn_inp", il);
+
+        // layernorm2
+        cur = build_norm(cur, layer.ln_2_w, layer.ln_2_b, norm_t, eps, il);
+        cb(cur, "ffn_inp_normed", il);
+
+        // ffn
+        cur = build_ffn(cur,
+            layer.ff_up_w, layer.ff_up_b,
+            layer.ff_gate_w, layer.ff_gate_b,
+            layer.ff_down_w, layer.ff_down_b,
+            ffn_t, il);
+
+        cb(cur, "ffn_out", il);
+
+        // optional layer scale on the FFN branch
+        if (layer.ls_2_w) {
+            cur = ggml_mul(ctx0, cur, layer.ls_2_w);
+            cb(cur, "ffn_out_scaled", il);
+        }
+
+        // residual 2
+        cur = ggml_add(ctx0, inpL, cur);
+        cb(cur, "layer_out", il);
+
+        inpL = cur;
+    }
+
+    // some audio projectors average-pool pairs of frames along the time axis
+    // (pool_1d works on dim 0, hence the transpose round-trip)
+    if (model.audio_has_avgpool()) {
+        ggml_tensor * cur = inpL;
+        cur = ggml_transpose(ctx0, cur);
+        cur = ggml_cont(ctx0, cur);
+        cur = ggml_pool_1d(ctx0, cur, GGML_OP_POOL_AVG, 2, 2, 0);
+        cur = ggml_transpose(ctx0, cur);
+        cur = ggml_cont(ctx0, cur);
+        inpL = cur;
+    }
+
+    // post-layernorm
+    if (model.post_ln_w) {
+        inpL = build_norm(inpL, model.post_ln_w, model.post_ln_b, norm_t, eps, -1);
+    }
+    return inpL;
+}
+
+// build the input after conv2d (inp_raw --> patches)
+// returns tensor with shape [n_embd, n_patches]
+ggml_tensor * clip_graph::build_inp() {
+    ggml_tensor * raw = build_inp_raw();
+    // patchify with a strided conv2d, then lay the patches out as columns
+    ggml_tensor * patches = ggml_conv_2d(ctx0, model.patch_embeddings_0, raw, patch_size, patch_size, 0, 0, 1, 1);
+    patches = ggml_reshape_2d(ctx0, patches, n_patches, n_embd);
+    patches = ggml_cont(ctx0, ggml_transpose(ctx0, patches));
+    // optional per-patch bias
+    if (model.patch_bias) {
+        patches = ggml_add(ctx0, patches, model.patch_bias);
+        cb(patches, "patch_bias", -1);
+    }
+    return patches;
+}
+
+// declare the raw image input tensor [nx, ny, channels] (F32),
+// named "inp_raw" so the caller can locate and fill it at eval time
+ggml_tensor * clip_graph::build_inp_raw(int channels) {
+    ggml_tensor * raw = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, img.nx, img.ny, channels);
+    ggml_set_name(raw, "inp_raw");
+    ggml_set_input(raw);
+    return raw;
+}
+
+// apply layernorm or RMS norm, then the optional elementwise
+// scale (mw) and shift (mb) weights
+ggml_tensor * clip_graph::build_norm(
+        ggml_tensor * cur,
+        ggml_tensor * mw,
+        ggml_tensor * mb,
+        norm_type type,
+        float norm_eps,
+        int il) const {
+
+    if (type == NORM_TYPE_RMS) {
+        cur = ggml_rms_norm(ctx0, cur, norm_eps);
+    } else {
+        cur = ggml_norm(ctx0, cur, norm_eps);
+    }
+
+    if (mw) {
+        cur = ggml_mul(ctx0, cur, mw);
+        cb(cur, "norm_w", il);
+    }
+
+    if (mb) {
+        cur = ggml_add(ctx0, cur, mb);
+        cb(cur, "norm_b", il);
+    }
+
+    return cur;
+}
+
+// feed-forward block: up (+bias), optional gate (+bias), activation, down (+bias).
+// with a gate tensor the activation becomes the *glu split variant (parallel ffn).
+// fix: the two consecutive `if (down_b)` checks at the end were redundant
+// duplicates — merged into a single conditional (same order, same behavior).
+ggml_tensor * clip_graph::build_ffn(
+        ggml_tensor * cur,
+        ggml_tensor * up,
+        ggml_tensor * up_b,
+        ggml_tensor * gate,
+        ggml_tensor * gate_b,
+        ggml_tensor * down,
+        ggml_tensor * down_b,
+        ffn_op_type type_op,
+        int il) const {
+
+    ggml_tensor * tmp = up ? ggml_mul_mat(ctx0, up, cur) : cur;
+    cb(tmp, "ffn_up", il);
+
+    if (up_b) {
+        tmp = ggml_add(ctx0, tmp, up_b);
+        cb(tmp, "ffn_up_b", il);
+    }
+
+    if (gate) {
+        cur = ggml_mul_mat(ctx0, gate, cur);
+        cb(cur, "ffn_gate", il);
+
+        if (gate_b) {
+            cur = ggml_add(ctx0, cur, gate_b);
+            cb(cur, "ffn_gate_b", il);
+        }
+    } else {
+        cur = tmp;
+    }
+
+    // we only support parallel ffn for now
+    switch (type_op) {
+        case FFN_SILU:
+            if (gate) {
+                cur = ggml_swiglu_split(ctx0, cur, tmp);
+                cb(cur, "ffn_swiglu", il);
+            } else {
+                cur = ggml_silu(ctx0, cur);
+                cb(cur, "ffn_silu", il);
+            } break;
+        case FFN_GELU:
+            if (gate) {
+                cur = ggml_geglu_split(ctx0, cur, tmp);
+                cb(cur, "ffn_geglu", il);
+            } else {
+                cur = ggml_gelu(ctx0, cur);
+                cb(cur, "ffn_gelu", il);
+            } break;
+        case FFN_GELU_ERF:
+            if (gate) {
+                cur = ggml_geglu_erf_split(ctx0, cur, tmp);
+                cb(cur, "ffn_geglu_erf", il);
+            } else {
+                cur = ggml_gelu_erf(ctx0, cur);
+                cb(cur, "ffn_gelu_erf", il);
+            } break;
+        case FFN_GELU_QUICK:
+            if (gate) {
+                cur = ggml_geglu_quick_split(ctx0, cur, tmp);
+                cb(cur, "ffn_geglu_quick", il);
+            } else {
+                cur = ggml_gelu_quick(ctx0, cur);
+                cb(cur, "ffn_gelu_quick", il);
+            } break;
+    }
+
+    if (down) {
+        cur = ggml_mul_mat(ctx0, down, cur);
+    }
+
+    if (down_b) {
+        cb(cur, "ffn_down", il);
+        cur = ggml_add(ctx0, cur, down_b);
+    }
+
+    return cur;
+}
+
+// multi-head attention over q/k/v of shape [d_head, n_head, n_pos], followed by
+// the optional output projection wo (+bias). two execution paths: fused flash
+// attention, or explicit softmax(QK^T * scale + mask) V
+ggml_tensor * clip_graph::build_attn(
+        ggml_tensor * wo,
+        ggml_tensor * wo_b,
+        ggml_tensor * q_cur,
+        ggml_tensor * k_cur,
+        ggml_tensor * v_cur,
+        ggml_tensor * kq_mask,
+        float kq_scale,
+        int il) const {
+    // these nodes are added to the graph together so that they are not reordered
+    // by doing so, the number of splits in the graph is reduced
+    ggml_build_forward_expand(gf, q_cur);
+    ggml_build_forward_expand(gf, k_cur);
+    ggml_build_forward_expand(gf, v_cur);
+
+    // bring the head dimension forward: [d_head, n_pos, n_head]
+    ggml_tensor * q = ggml_permute(ctx0, q_cur, 0, 2, 1, 3);
+    //cb(q, "q", il);
+
+    ggml_tensor * k = ggml_permute(ctx0, k_cur, 0, 2, 1, 3);
+    //cb(k, "k", il);
+
+    ggml_tensor * cur;
+
+    if (flash_attn_type == CLIP_FLASH_ATTN_TYPE_ENABLED) {
+        ggml_tensor * v = ggml_permute(ctx0, v_cur, 0, 2, 1, 3);
+
+        // flash attention kernels expect F16 K/V
+        k = ggml_cast(ctx0, k, GGML_TYPE_F16);
+        v = ggml_cast(ctx0, v, GGML_TYPE_F16);
+
+        cur = ggml_flash_attn_ext(ctx0, q, k, v, kq_mask, kq_scale, 0.0f, 0.0f);
+        ggml_flash_attn_ext_set_prec(cur, GGML_PREC_F32);
+
+        // merge heads back: [d_head * n_head, n_pos]
+        cur = ggml_reshape_2d(ctx0, cur, cur->ne[0]*cur->ne[1], cur->ne[2]*cur->ne[3]);
+
+    } else {
+        ggml_tensor * v = ggml_permute(ctx0, v_cur, 1, 2, 0, 3);
+        v = ggml_cont(ctx0, v);
+
+        const auto n_tokens = q->ne[1];
+        const auto n_head   = q->ne[2];
+
+        ggml_tensor * kq = ggml_mul_mat(ctx0, k, q);
+        // F32 may not needed for vision encoders?
+        // ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
+
+        kq = ggml_soft_max_ext(ctx0, kq, kq_mask, kq_scale, 0.0f);
+
+        ggml_tensor * kqv = ggml_mul_mat(ctx0, v, kq);
+        cur = ggml_permute(ctx0, kqv, 0, 2, 1, 3);
+        cur = ggml_cont_2d(ctx0, cur, cur->ne[0]*n_head, n_tokens);
+    }
+
+    cb(cur, "kqv_out", il);
+
+    // output projection (optional)
+    if (wo) {
+        cur = ggml_mul_mat(ctx0, wo, cur);
+    }
+
+    if (wo_b) {
+        cur = ggml_add(ctx0, cur, wo_b);
+    }
+
+    return cur;
+}
+
+// implementation of the 2D RoPE without adding a new op in ggml
+// this is not efficient (use double the memory), but works on all backends
+// TODO: there was a more efficient which relies on ggml_view and ggml_rope_ext_inplace, but the rope inplace does not work well with non-contiguous tensors ; we should fix that and revert back to the original implementation in https://github.com/ggml-org/llama.cpp/pull/13065
+//
+// cur: [n_dim, n_head, n_pos]; the first n_dim/2 rows are rotated by pos_a,
+// the second n_dim/2 by pos_b, and the halves are re-concatenated
+ggml_tensor * clip_graph::build_rope_2d(
+    ggml_context * ctx0,
+    ggml_tensor * cur,
+    ggml_tensor * pos_a, // first half
+    ggml_tensor * pos_b, // second half
+    const float freq_base,
+    const bool interleave_freq
+) {
+    const int64_t n_dim  = cur->ne[0];
+    const int64_t n_head = cur->ne[1];
+    const int64_t n_pos  = cur->ne[2];
+
+    // for example, if we have cur tensor of shape (n_dim=8, n_head, n_pos)
+    // we will have a list of 4 inv_freq: 1e-0, 1e-1, 1e-2, 1e-3
+    // first half of cur will use 1e-0, 1e-2 (even)
+    // second half of cur will use 1e-1, 1e-3 (odd)
+    // the trick here is to rotate just half of n_dim, so inv_freq will automatically be even
+    //   ^ don't ask me why, it's math! -2(2i) / n_dim == -2i / (n_dim/2)
+    // then for the second half, we use freq_scale to shift the inv_freq
+    //   ^ why? replace (2i) with (2i+1) in the above equation
+    const float freq_scale_odd = interleave_freq
+                                    ? std::pow(freq_base, (float)-2/n_dim)
+                                    : 1.0;
+
+    // first half
+    ggml_tensor * first;
+    {
+        first = ggml_view_3d(ctx0, cur,
+            n_dim/2, n_head, n_pos,
+            cur->nb[1],
+            cur->nb[2],
+            0);
+        first = ggml_rope_ext(
+            ctx0,
+            first,
+            pos_a,      // positions
+            nullptr,    // freq factors
+            n_dim/2,    // n_dims
+            0, 0, freq_base,
+            1.0f, 0.0f, 1.0f, 0.0f, 0.0f
+        );
+    }
+
+    // second half
+    ggml_tensor * second;
+    {
+        // view starting at byte offset n_dim/2 elements into each row
+        second = ggml_view_3d(ctx0, cur,
+            n_dim/2, n_head, n_pos,
+            cur->nb[1],
+            cur->nb[2],
+            n_dim/2 * ggml_element_size(cur));
+        second = ggml_rope_ext(
+            ctx0,
+            second,
+            pos_b,      // positions
+            nullptr,    // freq factors
+            n_dim/2,    // n_dims
+            0, 0, freq_base,
+            freq_scale_odd,
+            0.0f, 1.0f, 0.0f, 0.0f
+        );
+    }
+
+    cur = ggml_concat(ctx0, first, second, 0);
+    return cur;
+}
+
+// Generic function to stack frames for audio processing
+// Abstracts out the StackAudioFrames logic used by ultravox
+ggml_tensor * clip_graph::build_stack(ggml_tensor * cur, int32_t stack_factor, int32_t n_embed) {
+ // nothing to stack
+ if (stack_factor <= 1) {
+ return cur;
+ }
+
+ const int64_t n_elems = ggml_nelements(cur);
+ const int64_t frame_size = n_embed * stack_factor; // elements per stacked row
+
+ // length rounded up to a whole number of frames
+ const int64_t len_padded = GGML_PAD(n_elems, frame_size);
+ const int64_t n_pad = len_padded - n_elems;
+
+ if (n_pad > 0) {
+ // flatten, then zero-pad the tail so the total is divisible by frame_size
+ cur = ggml_view_1d(ctx0, cur, n_elems, 0);
+ cur = ggml_pad(ctx0, cur, n_pad, 0, 0, 0);
+ }
+
+ // view as [frame_size, len_padded / frame_size]
+ cur = ggml_view_2d(ctx0, cur, frame_size, len_padded / frame_size,
+ ggml_row_size(cur->type, frame_size), 0);
+ return cur;
+}
+
+// aka pixel_shuffle / pixel_unshuffle / patch_merger (Kimi-VL)
+// support dynamic resolution
+ggml_tensor * clip_graph::build_patch_merge_permute(ggml_tensor * cur, int scale_factor) {
+ GGML_ASSERT(scale_factor > 1);
+
+ // NOTE(review): cur->ne[0] is int64_t; assumes the embedding dim fits in int
+ const int n_embd = cur->ne[0];
+ int width = img.nx / patch_size;
+ int height = img.ny / patch_size;
+
+ // pad width and height to factor
+ const int64_t pad_width = CLIP_ALIGN(width, scale_factor) - width;
+ const int64_t pad_height = CLIP_ALIGN(height, scale_factor) - height;
+ cur = ggml_reshape_3d(ctx0, cur, n_embd, width, height);
+ if (pad_width || pad_height) {
+ // zero-pad so both spatial dims become divisible by scale_factor
+ cur = ggml_pad(ctx0, cur, 0, pad_width, pad_height, 0);
+ width += pad_width;
+ height += pad_height;
+ }
+
+ // unshuffle h
+ // fold scale_factor adjacent columns into the channel dim, then swap the spatial axes
+ cur = ggml_reshape_3d(ctx0, cur, n_embd * scale_factor, width / scale_factor, height);
+ cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);
+
+ // unshuffle w
+ // fold scale_factor adjacent rows as well: channel dim becomes n_embd * scale_factor^2
+ cur = ggml_cont_3d(ctx0, cur, n_embd * scale_factor * scale_factor, height / scale_factor, width / scale_factor);
+ cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);
+
+ // flatten the two spatial dims into a single token dim
+ cur = ggml_cont_2d(ctx0, cur, cur->ne[0], cur->ne[1] * cur->ne[2]);
+ cb(cur, "pixel_shuffle", -1);
+
+ return cur;
+}
+
+// Build the ggml compute graph for a single pre-processed input (image or
+// audio mel spectrogram), dispatching to the graph builder that matches the
+// model's projector type. Aborts on an unknown projector; only batch size 1
+// is supported.
+static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32_batch & imgs) {
+ GGML_ASSERT(imgs.entries.size() == 1 && "n_batch > 1 is not supported");
+
+ const clip_image_f32 & img = *imgs.entries[0];
+
+ std::unique_ptr<clip_graph> builder;
+ switch (ctx->proj_type()) {
+ case PROJECTOR_TYPE_GEMMA3:
+ case PROJECTOR_TYPE_IDEFICS3:
+ case PROJECTOR_TYPE_LFM2:
+ case PROJECTOR_TYPE_JANUS_PRO:
+ builder = std::make_unique<clip_graph_siglip>(ctx, img);
+ break;
+ case PROJECTOR_TYPE_GEMMA3NV:
+ builder = std::make_unique<clip_graph_mobilenetv5>(ctx, img);
+ break;
+ case PROJECTOR_TYPE_PIXTRAL:
+ case PROJECTOR_TYPE_LIGHTONOCR:
+ builder = std::make_unique<clip_graph_pixtral>(ctx, img);
+ break;
+ case PROJECTOR_TYPE_QWEN2VL:
+ case PROJECTOR_TYPE_QWEN25VL:
+ builder = std::make_unique<clip_graph_qwen2vl>(ctx, img);
+ break;
+ case PROJECTOR_TYPE_QWEN3VL:
+ builder = std::make_unique<clip_graph_qwen3vl>(ctx, img);
+ break;
+ case PROJECTOR_TYPE_MINICPMV:
+ builder = std::make_unique<clip_graph_minicpmv>(ctx, img);
+ break;
+ case PROJECTOR_TYPE_INTERNVL:
+ builder = std::make_unique<clip_graph_internvl>(ctx, img);
+ break;
+ case PROJECTOR_TYPE_LLAMA4:
+ builder = std::make_unique<clip_graph_llama4>(ctx, img);
+ break;
+ // audio encoders sharing the whisper-style graph
+ case PROJECTOR_TYPE_ULTRAVOX:
+ case PROJECTOR_TYPE_VOXTRAL:
+ case PROJECTOR_TYPE_QWEN2A:
+ case PROJECTOR_TYPE_GLMA:
+ case PROJECTOR_TYPE_MUSIC_FLAMINGO:
+ builder = std::make_unique<clip_graph_whisper_enc>(ctx, img);
+ break;
+ case PROJECTOR_TYPE_KIMIVL:
+ builder = std::make_unique<clip_graph_kimivl>(ctx, img);
+ break;
+ case PROJECTOR_TYPE_KIMIK25:
+ builder = std::make_unique<clip_graph_kimik25>(ctx, img);
+ break;
+ case PROJECTOR_TYPE_COGVLM:
+ builder = std::make_unique<clip_graph_cogvlm>(ctx, img);
+ break;
+ // llava-family projectors
+ case PROJECTOR_TYPE_MLP:
+ case PROJECTOR_TYPE_MLP_NORM:
+ case PROJECTOR_TYPE_LDP:
+ case PROJECTOR_TYPE_LDPV2:
+ case PROJECTOR_TYPE_GLM_EDGE:
+ builder = std::make_unique<clip_graph_llava>(ctx, img);
+ break;
+ case PROJECTOR_TYPE_LFM2A:
+ builder = std::make_unique<clip_graph_conformer>(ctx, img);
+ break;
+ case PROJECTOR_TYPE_GLM4V:
+ builder = std::make_unique<clip_graph_glm4v>(ctx, img);
+ break;
+ case PROJECTOR_TYPE_YOUTUVL:
+ builder = std::make_unique<clip_graph_youtuvl>(ctx, img);
+ break;
+ default:
+ GGML_ABORT("missing cgraph builder");
+ }
+
+ return builder->build();
+}
+
+//
+// clip_model_loader
+//
+
+struct clip_model_loader {
+ ggml_context_ptr ctx_meta; // tensor metadata context (no_alloc, no data)
+ gguf_context_ptr ctx_gguf; // parsed GGUF file handle
+
+ std::string fname; // path of the GGUF file being loaded
+
+ size_t model_size = 0; // in bytes
+
+ bool has_vision = false; // file declares a vision encoder
+ bool has_audio = false; // file declares an audio encoder
+
+ // TODO @ngxson : we should not pass clip_ctx here, it should be clip_model
+ // Opens the GGUF file, logs basic metadata, reads the modality flags and
+ // sums up the size of all tensors. Throws std::runtime_error when the file
+ // cannot be parsed.
+ clip_model_loader(const char * fname) : fname(fname) {
+ struct ggml_context * meta = nullptr;
+
+ struct gguf_init_params params = {
+ /*.no_alloc = */ true,
+ /*.ctx = */ &meta,
+ };
+
+ ctx_gguf = gguf_context_ptr(gguf_init_from_file(fname, params));
+ if (!ctx_gguf.get()) {
+ throw std::runtime_error(string_format("%s: failed to load CLIP model from %s. Does this file exist?\n", __func__, fname));
+ }
+
+ ctx_meta.reset(meta);
+
+ const int n_tensors = gguf_get_n_tensors(ctx_gguf.get());
+
+ // print gguf info
+ {
+ std::string name;
+ get_string(KEY_NAME, name, false);
+ std::string description;
+ get_string(KEY_DESCRIPTION, description, false);
+ LOG_INF("%s: model name: %s\n", __func__, name.c_str());
+ LOG_INF("%s: description: %s\n", __func__, description.c_str());
+ LOG_INF("%s: GGUF version: %d\n", __func__, gguf_get_version(ctx_gguf.get()));
+ LOG_INF("%s: alignment: %zu\n", __func__, gguf_get_alignment(ctx_gguf.get()));
+ LOG_INF("%s: n_tensors: %d\n", __func__, n_tensors);
+ LOG_INF("%s: n_kv: %d\n", __func__, (int)gguf_get_n_kv(ctx_gguf.get()));
+ LOG_INF("\n");
+ }
+
+ // modalities
+ {
+ get_bool(KEY_HAS_VISION_ENC, has_vision, false);
+ get_bool(KEY_HAS_AUDIO_ENC, has_audio, false);
+
+ if (has_vision) {
+ LOG_INF("%s: has vision encoder\n", __func__);
+ }
+ if (has_audio) {
+ LOG_INF("%s: has audio encoder\n", __func__);
+ }
+ }
+
+ // tensors
+ {
+ // accumulate model_size from the metadata only; no tensor data is read here
+ for (int i = 0; i < n_tensors; ++i) {
+ const char * name = gguf_get_tensor_name(ctx_gguf.get(), i);
+ const size_t offset = gguf_get_tensor_offset(ctx_gguf.get(), i);
+ enum ggml_type type = gguf_get_tensor_type(ctx_gguf.get(), i);
+ ggml_tensor * cur = ggml_get_tensor(meta, name);
+ size_t tensor_size = ggml_nbytes(cur);
+ model_size += tensor_size;
+ LOG_DBG("%s: tensor[%d]: n_dims = %d, name = %s, tensor_size=%zu, offset=%zu, shape:[%" PRIu64 ", %" PRIu64 ", %" PRIu64 ", %" PRIu64 "], type = %s\n",
+ __func__, i, ggml_n_dims(cur), cur->name, tensor_size, offset, cur->ne[0], cur->ne[1], cur->ne[2], cur->ne[3], ggml_type_name(type));
+ }
+ }
+ }
+
+ // Read all hyperparameters for the requested modality from the GGUF
+ // metadata into model.hparams, apply projector-specific defaults and
+ // overrides, validate them, and log a summary. Throws std::runtime_error
+ // on unknown projector types or inconsistent pixel limits.
+ void load_hparams(clip_model & model, clip_modality modality) {
+ auto & hparams = model.hparams;
+ std::string log_ffn_op; // for logging
+
+ // sanity check
+ if (modality == CLIP_MODALITY_VISION) {
+ GGML_ASSERT(has_vision);
+ } else if (modality == CLIP_MODALITY_AUDIO) {
+ GGML_ASSERT(has_audio);
+ }
+ model.modality = modality;
+
+
+ // projector type
+ std::string proj_type;
+ {
+ // default key
+ get_string(KEY_PROJ_TYPE, proj_type, false);
+
+ // for models with mixed modalities
+ if (proj_type.empty()) {
+ if (modality == CLIP_MODALITY_VISION) {
+ get_string(KEY_VISION_PROJ_TYPE, proj_type, false);
+ } else if (modality == CLIP_MODALITY_AUDIO) {
+ get_string(KEY_AUDIO_PROJ_TYPE, proj_type, false);
+ } else {
+ GGML_ABORT("unknown modality");
+ }
+ }
+
+ model.proj_type = clip_projector_type_from_string(proj_type);
+
+ if (model.proj_type == PROJECTOR_TYPE_UNKNOWN) {
+ throw std::runtime_error(string_format("%s: unknown projector type: %s\n", __func__, proj_type.c_str()));
+ }
+
+ // correct arch for multimodal models (legacy method)
+ if (model.proj_type == PROJECTOR_TYPE_QWEN25O) {
+ model.proj_type = modality == CLIP_MODALITY_VISION
+ ? PROJECTOR_TYPE_QWEN25VL
+ : PROJECTOR_TYPE_QWEN2A;
+ }
+ }
+
+ const bool is_vision = model.modality == CLIP_MODALITY_VISION;
+ const bool is_audio = model.modality == CLIP_MODALITY_AUDIO;
+
+ // other hparams
+ {
+ // keys are namespaced by modality, e.g. "vision.embedding_length"
+ const char * prefix = is_vision ? "vision" : "audio";
+ get_u32(string_format(KEY_N_EMBD, prefix), hparams.n_embd);
+ get_u32(string_format(KEY_N_HEAD, prefix), hparams.n_head);
+ get_u32(string_format(KEY_N_FF, prefix), hparams.n_ff);
+ get_u32(string_format(KEY_N_BLOCK, prefix), hparams.n_layer);
+ get_u32(string_format(KEY_PROJ_DIM, prefix), hparams.projection_dim);
+ get_f32(string_format(KEY_LAYER_NORM_EPS, prefix), hparams.eps);
+
+ if (is_vision) {
+ get_u32(KEY_IMAGE_SIZE, hparams.image_size);
+ get_u32(KEY_PATCH_SIZE, hparams.patch_size);
+ get_u32(KEY_IMAGE_CROP_RESOLUTION, hparams.image_crop_resolution, false);
+ get_i32(KEY_MINICPMV_VERSION, hparams.minicpmv_version, false); // legacy
+ get_u32(KEY_MINICPMV_QUERY_NUM, hparams.minicpmv_query_num, false);
+ if (hparams.minicpmv_query_num == 0) {
+ // Fallback to hardcoded values for legacy models
+ if (hparams.minicpmv_version == 3) {
+ hparams.minicpmv_query_num = 64;
+ } else if (hparams.minicpmv_version == 4) {
+ hparams.minicpmv_query_num = 64;
+ } else if (hparams.minicpmv_version == 5) {
+ hparams.minicpmv_query_num = 64;
+ } else if (hparams.minicpmv_version == 6) {
+ hparams.minicpmv_query_num = 64;
+ } else if (hparams.minicpmv_version == 100045) {
+ hparams.minicpmv_query_num = 64;
+ } else {
+ hparams.minicpmv_query_num = 96;
+ }
+ }
+ } else if (is_audio) {
+ get_u32(KEY_A_NUM_MEL_BINS, hparams.n_mel_bins);
+ // some hparams are unused, but still need to set to avoid issues
+ hparams.image_size = 0;
+ hparams.patch_size = 1;
+
+ } else {
+ GGML_ASSERT(false && "unknown modality");
+ }
+
+ // for pinpoints, we need to convert it into a list of resolution candidates
+ {
+ // pinpoints are stored as a flat list of (width, height) pairs
+ std::vector<int> pinpoints;
+ get_arr_int(KEY_IMAGE_GRID_PINPOINTS, pinpoints, false);
+ if (!pinpoints.empty()) {
+ for (size_t i = 0; i < pinpoints.size(); i += 2) {
+ hparams.image_res_candidates.push_back({
+ pinpoints[i],
+ pinpoints[i+1],
+ });
+ }
+ }
+ }
+
+ // default warmup value
+ hparams.warmup_image_size = hparams.image_size;
+
+ hparams.has_llava_projector = model.proj_type == PROJECTOR_TYPE_MLP
+ || model.proj_type == PROJECTOR_TYPE_MLP_NORM
+ || model.proj_type == PROJECTOR_TYPE_LDP
+ || model.proj_type == PROJECTOR_TYPE_LDPV2;
+
+ {
+ // FFN activation: gelu and silu are mutually exclusive; gelu_quick is the default
+ bool use_gelu = false;
+ bool use_silu = false;
+ get_bool(KEY_USE_GELU, use_gelu, false);
+ get_bool(KEY_USE_SILU, use_silu, false);
+ if (use_gelu && use_silu) {
+ throw std::runtime_error(string_format("%s: both use_gelu and use_silu are set to true\n", __func__));
+ }
+ if (use_gelu) {
+ hparams.ffn_op = FFN_GELU;
+ log_ffn_op = "gelu";
+ } else if (use_silu) {
+ hparams.ffn_op = FFN_SILU;
+ log_ffn_op = "silu";
+ } else {
+ hparams.ffn_op = FFN_GELU_QUICK;
+ log_ffn_op = "gelu_quick";
+ }
+ }
+
+ {
+ std::string mm_patch_merge_type;
+ get_string(KEY_MM_PATCH_MERGE_TYPE, mm_patch_merge_type, false);
+ if (mm_patch_merge_type == "spatial_unpad") {
+ hparams.mm_patch_merge_type = PATCH_MERGE_SPATIAL_UNPAD;
+ }
+ }
+
+ if (is_vision) {
+ // per-channel normalization constants are mandatory for vision models
+ int idx_mean = gguf_find_key(ctx_gguf.get(), KEY_IMAGE_MEAN);
+ int idx_std = gguf_find_key(ctx_gguf.get(), KEY_IMAGE_STD);
+ GGML_ASSERT(idx_mean >= 0 && "image_mean not found");
+ GGML_ASSERT(idx_std >= 0 && "image_std not found");
+ const float * mean_data = (const float *) gguf_get_arr_data(ctx_gguf.get(), idx_mean);
+ const float * std_data = (const float *) gguf_get_arr_data(ctx_gguf.get(), idx_std);
+ for (int i = 0; i < 3; ++i) {
+ hparams.image_mean[i] = mean_data[i];
+ hparams.image_std[i] = std_data[i];
+ }
+ }
+
+ // Load the vision feature layer indices if they are explicitly provided;
+ // if multiple vision feature layers are present, the values will be concatenated
+ // to form the final visual features.
+ // NOTE: gguf conversions should standardize the values of the vision feature layer to
+ // be non-negative, since we use -1 to mark values as unset here.
+ std::vector<int> vision_feature_layer;
+ get_arr_int(KEY_FEATURE_LAYER, vision_feature_layer, false);
+ // convert std::vector to std::unordered_set
+ for (auto & layer : vision_feature_layer) {
+ hparams.vision_feature_layer.insert(layer);
+ }
+
+ // model-specific params
+ switch (model.proj_type) {
+ case PROJECTOR_TYPE_MINICPMV:
+ {
+ if (hparams.minicpmv_version == 0) {
+ hparams.minicpmv_version = 2; // default to 2 if not set
+ }
+ } break;
+ case PROJECTOR_TYPE_INTERNVL:
+ {
+ get_u32(KEY_PROJ_SCALE_FACTOR, hparams.n_merge, false);
+ } break;
+ case PROJECTOR_TYPE_IDEFICS3:
+ {
+ get_u32(KEY_PROJ_SCALE_FACTOR, hparams.n_merge, false);
+ get_u32(KEY_PREPROC_IMAGE_SIZE, hparams.image_longest_edge, false);
+ } break;
+ case PROJECTOR_TYPE_LFM2:
+ {
+ get_u32(KEY_PROJ_SCALE_FACTOR, hparams.n_merge, false);
+ // ref: https://huggingface.co/LiquidAI/LFM2.5-VL-1.6B/blob/main/processor_config.json
+ hparams.set_limit_image_tokens(64, 256);
+ } break;
+ case PROJECTOR_TYPE_PIXTRAL:
+ case PROJECTOR_TYPE_LIGHTONOCR:
+ {
+ // ref: https://huggingface.co/mistral-community/pixtral-12b/blob/main/preprocessor_config.json
+ // TODO: verify the image_min_tokens
+ hparams.n_merge = 1; // the original pixtral does not use patch merging
+ hparams.rope_theta = 10000.0f;
+ get_u32(KEY_SPATIAL_MERGE_SIZE, hparams.n_merge, false);
+ hparams.set_limit_image_tokens(8, 1024);
+ hparams.set_warmup_n_tokens(256); // avoid OOM on warmup
+ } break;
+ case PROJECTOR_TYPE_KIMIVL:
+ {
+ hparams.rope_theta = 10000.0f;
+ get_u32(KEY_PROJ_SCALE_FACTOR, hparams.n_merge, false);
+ // TODO: check kimivl preprocessor for exact values
+ hparams.set_limit_image_tokens(8, 1024);
+ hparams.set_warmup_n_tokens(256); // avoid OOM on warmup
+ } break;
+ case PROJECTOR_TYPE_KIMIK25:
+ {
+ hparams.rope_theta = 10000.0f;
+ get_u32(KEY_PROJ_SCALE_FACTOR, hparams.n_merge, false);
+
+ // explicit pixel limits take precedence over the token-based defaults
+ int min_pixels = 0, max_pixels = 0;
+ get_u32(KEY_IMAGE_MIN_PIXELS, min_pixels, false);
+ get_u32(KEY_IMAGE_MAX_PIXELS, max_pixels, false);
+ if (min_pixels > 0 && max_pixels > 0) {
+ hparams.image_min_pixels = min_pixels;
+ hparams.image_max_pixels = max_pixels;
+ hparams.warmup_image_size = static_cast<int>(std::sqrt(max_pixels));
+ } else {
+ hparams.set_limit_image_tokens(2, 4096);
+ }
+ } break;
+ case PROJECTOR_TYPE_GEMMA3:
+ {
+ // default value (used by all model sizes in gemma 3 family)
+ // number of patches for each **side** is reduced by a factor of 4
+ hparams.n_merge = 4;
+ // test model (tinygemma3) has a different value, we optionally read it
+ get_u32(KEY_PROJ_SCALE_FACTOR, hparams.n_merge, false);
+ } break;
+
+ case PROJECTOR_TYPE_GEMMA3NV:
+ {
+ // Gemma3n uses MobileNetV5 which produces 256 tokens (16x16)
+ // Similar configuration to Gemma3
+ hparams.n_merge = 1; // MobileNetV5 handles resizing internally
+ get_u32(KEY_PROJ_SCALE_FACTOR, hparams.n_merge, false);
+ } break;
+ case PROJECTOR_TYPE_QWEN2VL:
+ case PROJECTOR_TYPE_QWEN25VL:
+ case PROJECTOR_TYPE_QWEN3VL:
+ {
+ hparams.n_merge = 2; // default value for Qwen 2 and 2.5
+ get_u32(KEY_SPATIAL_MERGE_SIZE, hparams.n_merge, false);
+ get_u32(KEY_WIN_ATTN_PATTERN, hparams.n_wa_pattern, model.proj_type == PROJECTOR_TYPE_QWEN25VL); // only 2.5 requires it
+ // ref: https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct/blob/main/preprocessor_config.json
+ hparams.set_limit_image_tokens(8, 4096);
+ hparams.set_warmup_n_tokens(46*46); // avoid OOM on warmup
+ const int warn_min_pixels = 1024 * hparams.n_merge * hparams.n_merge * hparams.patch_size * hparams.patch_size;
+ if (hparams.image_min_pixels < warn_min_pixels) {
+ LOG_WRN("%s: Qwen-VL models require at minimum 1024 image tokens to function correctly on grounding tasks\n", __func__);
+ LOG_WRN("%s: if you encounter problems with accuracy, try adding --image-min-tokens 1024\n", __func__);
+ LOG_WRN("%s: more info: https://github.com/ggml-org/llama.cpp/issues/16842\n\n", __func__);
+ }
+ } break;
+ case PROJECTOR_TYPE_YOUTUVL:
+ {
+ hparams.n_merge = 2;
+ get_u32(KEY_SPATIAL_MERGE_SIZE, hparams.n_merge, false);
+ get_u32(KEY_ATTN_WINDOW_SIZE, hparams.attn_window_size, true);
+ std::vector<int> wa_layer_indexes_vec;
+ get_arr_int(KEY_WIN_ATTN_LAYER_INDEXES, wa_layer_indexes_vec, true);
+ for (auto & layer : wa_layer_indexes_vec) {
+ hparams.wa_layer_indexes.insert(layer);
+ }
+ // support max_height * max_width = 8000 * 8000. 8000/16/2 = 250 image tokens
+ hparams.set_limit_image_tokens(1, 62500);
+ hparams.set_warmup_n_tokens(16*16); // avoid OOM on warmup
+ } break;
+ case PROJECTOR_TYPE_GLM4V:
+ {
+ hparams.rope_theta = 10000.0f;
+ hparams.n_merge = 2; // default value for GLM4-V
+ get_u32(KEY_SPATIAL_MERGE_SIZE, hparams.n_merge, false);
+ hparams.set_limit_image_tokens(8, 4096);
+ hparams.set_warmup_n_tokens(46*46); // avoid OOM on warmup
+ } break;
+ case PROJECTOR_TYPE_LLAMA4:
+ {
+ hparams.rope_theta = 10000.0f;
+ get_u32(KEY_PROJ_SCALE_FACTOR, hparams.n_merge, false);
+ set_llava_uhd_res_candidates(model, 3);
+ } break;
+ case PROJECTOR_TYPE_ULTRAVOX:
+ case PROJECTOR_TYPE_QWEN2A:
+ case PROJECTOR_TYPE_GLMA:
+ case PROJECTOR_TYPE_VOXTRAL:
+ case PROJECTOR_TYPE_MUSIC_FLAMINGO:
+ {
+ // stack factor is mandatory only for projectors that stack audio frames
+ bool require_stack = model.proj_type == PROJECTOR_TYPE_ULTRAVOX ||
+ model.proj_type == PROJECTOR_TYPE_VOXTRAL ||
+ model.proj_type == PROJECTOR_TYPE_GLMA;
+ get_u32(KEY_A_PROJ_STACK_FACTOR, hparams.proj_stack_factor, require_stack);
+ hparams.ffn_op = FFN_GELU_ERF;
+ log_ffn_op = "gelu_erf"; // temporary solution for logging
+
+ // audio preprocessing params
+ hparams.audio_chunk_len = 30; // in seconds
+ hparams.audio_sample_rate = 16000;
+ hparams.audio_n_fft = 400;
+ hparams.audio_window_len = 400;
+ hparams.audio_hop_len = 160;
+ } break;
+ case PROJECTOR_TYPE_LFM2A:
+ {
+ // audio preprocessing params
+ hparams.audio_chunk_len = 1; // in seconds
+ hparams.audio_sample_rate = 16000;
+ hparams.audio_n_fft = 512;
+ hparams.audio_window_len = 400;
+ hparams.audio_hop_len = 160;
+ } break;
+ default:
+ break;
+ }
+
+ // sanity check
+ {
+ if (hparams.image_max_pixels < hparams.image_min_pixels) {
+ throw std::runtime_error(string_format("%s: image_max_pixels (%d) is less than image_min_pixels (%d)\n", __func__, hparams.image_max_pixels, hparams.image_min_pixels));
+ }
+ }
+
+ LOG_INF("%s: projector: %s\n", __func__, proj_type.c_str());
+ LOG_INF("%s: n_embd: %d\n", __func__, hparams.n_embd);
+ LOG_INF("%s: n_head: %d\n", __func__, hparams.n_head);
+ LOG_INF("%s: n_ff: %d\n", __func__, hparams.n_ff);
+ LOG_INF("%s: n_layer: %d\n", __func__, hparams.n_layer);
+ LOG_INF("%s: ffn_op: %s\n", __func__, log_ffn_op.c_str());
+ LOG_INF("%s: projection_dim: %d\n", __func__, hparams.projection_dim);
+ if (is_vision) {
+ LOG_INF("\n--- vision hparams ---\n");
+ LOG_INF("%s: image_size: %d\n", __func__, hparams.image_size);
+ LOG_INF("%s: patch_size: %d\n", __func__, hparams.patch_size);
+ LOG_INF("%s: has_llava_proj: %d\n", __func__, hparams.has_llava_projector);
+ LOG_INF("%s: minicpmv_version: %d\n", __func__, hparams.minicpmv_version);
+ LOG_INF("%s: n_merge: %d\n", __func__, hparams.n_merge);
+ LOG_INF("%s: n_wa_pattern: %d\n", __func__, hparams.n_wa_pattern);
+ if (!hparams.wa_layer_indexes.empty()) {
+ LOG_INF("%s: wa_layer_indexes: ", __func__);
+ for (auto & layer : hparams.wa_layer_indexes) {
+ LOG_INF("%d ", layer);
+ }
+ LOG_INF("\n");
+ }
+ if (hparams.image_min_pixels > 0) {
+ LOG_INF("%s: image_min_pixels: %d%s\n", __func__, hparams.image_min_pixels, hparams.custom_image_min_tokens > 0 ? " (custom value)" : "");
+ }
+ if (hparams.image_max_pixels > 0) {
+ LOG_INF("%s: image_max_pixels: %d%s\n", __func__, hparams.image_max_pixels, hparams.custom_image_max_tokens > 0 ? " (custom value)" : "");
+ }
+ } else if (is_audio) {
+ LOG_INF("\n--- audio hparams ---\n");
+ LOG_INF("%s: n_mel_bins: %d\n", __func__, hparams.n_mel_bins);
+ LOG_INF("%s: proj_stack_factor: %d\n", __func__, hparams.proj_stack_factor);
+ LOG_INF("%s: audio_chunk_len: %d\n", __func__, hparams.audio_chunk_len);
+ LOG_INF("%s: audio_sample_rate: %d\n", __func__, hparams.audio_sample_rate);
+ LOG_INF("%s: audio_n_fft: %d\n", __func__, hparams.audio_n_fft);
+ LOG_INF("%s: audio_window_len: %d\n", __func__, hparams.audio_window_len);
+ LOG_INF("%s: audio_hop_len: %d\n", __func__, hparams.audio_hop_len);
+ }
+ LOG_INF("\n");
+ LOG_INF("%s: model size: %.2f MiB\n", __func__, model_size / 1024.0 / 1024.0);
+ LOG_INF("%s: metadata size: %.2f MiB\n", __func__, ggml_get_mem_size(ctx_meta.get()) / 1024.0 / 1024.0);
+ }
+ }
+
+ void load_tensors(clip_ctx & ctx_clip) {
+ auto & model = ctx_clip.model;
+ auto & hparams = model.hparams;
+ std::map<std::string, size_t> tensor_offset;
+ std::vector<ggml_tensor *> tensors_to_load;
+
+ // TODO @ngxson : support both audio and video in the future
+ const char * prefix = model.modality == CLIP_MODALITY_AUDIO ? "a" : "v";
+
+ // get offsets
+ for (int64_t i = 0; i < gguf_get_n_tensors(ctx_gguf.get()); ++i) {
+ const char * name = gguf_get_tensor_name(ctx_gguf.get(), i);
+ tensor_offset[name] = gguf_get_data_offset(ctx_gguf.get()) + gguf_get_tensor_offset(ctx_gguf.get(), i);
+ }
+
+ // create data context
+ struct ggml_init_params params = {
+ /*.mem_size =*/ static_cast<size_t>(gguf_get_n_tensors(ctx_gguf.get()) + 1) * ggml_tensor_overhead(),
+ /*.mem_buffer =*/ NULL,
+ /*.no_alloc =*/ true,
+ };
+ ctx_clip.ctx_data.reset(ggml_init(params));
+ if (!ctx_clip.ctx_data) {
+ throw std::runtime_error(string_format("%s: failed to init ggml context\n", __func__));
+ }
+
+ // helper function
+ auto get_tensor = [&](const std::string & name, bool required = true) {
+ ggml_tensor * cur = ggml_get_tensor(ctx_meta.get(), name.c_str());
+ if (!cur && required) {
+ throw std::runtime_error(string_format("%s: unable to find tensor %s\n", __func__, name.c_str()));
+ }
+ if (cur) {
+ tensors_to_load.push_back(cur);
+ // add tensors to context
+ ggml_tensor * data_tensor = ggml_dup_tensor(ctx_clip.ctx_data.get(), cur);
+ ggml_set_name(data_tensor, cur->name);
+ cur = data_tensor;
+ }
+ return cur;
+ };
+
+ model.class_embedding = get_tensor(TN_CLASS_EMBD, false);
+
+ model.pre_ln_w = get_tensor(string_format(TN_LN_PRE, prefix, "weight"), false);
+ model.pre_ln_b = get_tensor(string_format(TN_LN_PRE, prefix, "bias"), false);
+
+ model.post_ln_w = get_tensor(string_format(TN_LN_POST, prefix, "weight"), false);
+ model.post_ln_b = get_tensor(string_format(TN_LN_POST, prefix, "bias"), false);
+
+ model.patch_bias = get_tensor(TN_PATCH_BIAS, false);
+ model.patch_embeddings_0 = get_tensor(TN_PATCH_EMBD, false);
+ model.patch_embeddings_1 = get_tensor(TN_PATCH_EMBD_1, false);
+
+ model.norm_embd_w = get_tensor(string_format(TN_NORM_EMBD, "weight"), false);
+ model.norm_embd_b = get_tensor(string_format(TN_NORM_EMBD, "bias"), false);
+
+ model.position_embeddings = get_tensor(string_format(TN_POS_EMBD, prefix), false);
+
+ if (model.proj_type == PROJECTOR_TYPE_GEMMA3NV) {
+ hparams.n_layer = 0; // gemma3n does not use normal layer structure
+ }
+
+ // layers
+ model.layers.resize(hparams.n_layer);
+ for (int il = 0; il < hparams.n_layer; ++il) {
+ auto & layer = model.layers[il];
+ layer.k_w = get_tensor(string_format(TN_ATTN_K, prefix, il, "weight"), false);
+ layer.q_w = get_tensor(string_format(TN_ATTN_Q, prefix, il, "weight"), false);
+ layer.v_w = get_tensor(string_format(TN_ATTN_V, prefix, il, "weight"), false);
+ layer.o_w = get_tensor(string_format(TN_ATTN_OUTPUT, prefix, il, "weight"));
+ layer.qkv_w = get_tensor(string_format(TN_ATTN_QKV, prefix, il, "weight"), false);
+ layer.k_norm = get_tensor(string_format(TN_ATTN_K_NORM, prefix, il, "weight"), false);
+ layer.q_norm = get_tensor(string_format(TN_ATTN_Q_NORM, prefix, il, "weight"), false);
+ layer.ln_1_w = get_tensor(string_format(TN_LN_1, prefix, il, "weight"), false);
+ layer.ln_2_w = get_tensor(string_format(TN_LN_2, prefix, il, "weight"), false);
+ layer.ls_1_w = get_tensor(string_format(TN_LS_1, prefix, il, "weight"), false); // no bias
+ layer.ls_2_w = get_tensor(string_format(TN_LS_2, prefix, il, "weight"), false); // no bias
+
+ layer.k_b = get_tensor(string_format(TN_ATTN_K, prefix, il, "bias"), false);
+ layer.q_b = get_tensor(string_format(TN_ATTN_Q, prefix, il, "bias"), false);
+ layer.v_b = get_tensor(string_format(TN_ATTN_V, prefix, il, "bias"), false);
+ layer.o_b = get_tensor(string_format(TN_ATTN_OUTPUT, prefix, il, "bias"), false);
+ layer.qkv_b = get_tensor(string_format(TN_ATTN_QKV, prefix, il, "bias"), false);
+ layer.ln_1_b = get_tensor(string_format(TN_LN_1, prefix, il, "bias"), false);
+ layer.ln_2_b = get_tensor(string_format(TN_LN_2, prefix, il, "bias"), false);
+
+ // ffn
+ layer.ff_up_w = get_tensor(string_format(TN_FFN_UP, prefix, il, "weight"));
+ layer.ff_up_b = get_tensor(string_format(TN_FFN_UP, prefix, il, "bias"), false);
+ layer.ff_gate_w = get_tensor(string_format(TN_FFN_GATE, prefix, il, "weight"), false);
+ layer.ff_gate_b = get_tensor(string_format(TN_FFN_GATE, prefix, il, "bias"), false);
+ layer.ff_down_w = get_tensor(string_format(TN_FFN_DOWN, prefix, il, "weight"));
+ layer.ff_down_b = get_tensor(string_format(TN_FFN_DOWN, prefix, il, "bias"), false);
+
+
+ // qwen3vl deepstack layer
+ layer.deepstack_norm_w = get_tensor(string_format(TN_DEEPSTACK_NORM, il, "weight"), false);
+ layer.deepstack_norm_b = get_tensor(string_format(TN_DEEPSTACK_NORM, il, "bias"), false);
+ layer.deepstack_fc1_w = get_tensor(string_format(TN_DEEPSTACK_FC1, il, "weight"), false);
+ layer.deepstack_fc1_b = get_tensor(string_format(TN_DEEPSTACK_FC1, il, "bias"), false);
+ layer.deepstack_fc2_w = get_tensor(string_format(TN_DEEPSTACK_FC2, il, "weight"), false);
+ layer.deepstack_fc2_b = get_tensor(string_format(TN_DEEPSTACK_FC2, il, "bias"), false);
+ if (layer.has_deepstack()) {
+ model.n_deepstack_layers++;
+ }
+
+ // some models already exported with legacy (incorrect) naming which is quite messy, let's fix it here
+ // note: Qwen model converted from the old surgery script has n_ff = 0, so we cannot use n_ff to check!
+ bool is_ffn_swapped = (
+ // only old models need this fix
+ model.proj_type == PROJECTOR_TYPE_MLP
+ || model.proj_type == PROJECTOR_TYPE_MLP_NORM
+ || model.proj_type == PROJECTOR_TYPE_LDP
+ || model.proj_type == PROJECTOR_TYPE_LDPV2
+ || model.proj_type == PROJECTOR_TYPE_QWEN2VL
+ || model.proj_type == PROJECTOR_TYPE_QWEN25VL
+ || model.proj_type == PROJECTOR_TYPE_GLM_EDGE
+ || model.proj_type == PROJECTOR_TYPE_GEMMA3
+ || model.proj_type == PROJECTOR_TYPE_IDEFICS3
+ || model.proj_type == PROJECTOR_TYPE_MINICPMV
+ ) && layer.ff_up_w && layer.ff_down_w && layer.ff_down_w->ne[0] == hparams.n_embd;
+ if (is_ffn_swapped) {
+ // swap up and down weights
+ ggml_tensor * tmp = layer.ff_up_w;
+ layer.ff_up_w = layer.ff_down_w;
+ layer.ff_down_w = tmp;
+ // swap up and down biases
+ tmp = layer.ff_up_b;
+ layer.ff_up_b = layer.ff_down_b;
+ layer.ff_down_b = tmp;
+ if (il == 0) {
+ LOG_WRN("%s: ffn up/down are swapped\n", __func__);
+ }
+ }
+ }
+
+
+ switch (model.proj_type) {
+ case PROJECTOR_TYPE_MLP:
+ case PROJECTOR_TYPE_MLP_NORM:
+ {
+ // LLaVA projection
+ model.mm_0_w = get_tensor(string_format(TN_LLAVA_PROJ, 0, "weight"), false);
+ model.mm_0_b = get_tensor(string_format(TN_LLAVA_PROJ, 0, "bias"), false);
+ // Yi-type llava
+ model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 1, "weight"), false);
+ model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 1, "bias"), false);
+ // missing in Yi-type llava
+ model.mm_2_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight"), false);
+ model.mm_2_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias"), false);
+ // Yi-type llava
+ model.mm_3_w = get_tensor(string_format(TN_LLAVA_PROJ, 3, "weight"), false);
+ model.mm_3_b = get_tensor(string_format(TN_LLAVA_PROJ, 3, "bias"), false);
+ model.mm_4_w = get_tensor(string_format(TN_LLAVA_PROJ, 4, "weight"), false);
+ model.mm_4_b = get_tensor(string_format(TN_LLAVA_PROJ, 4, "bias"), false);
+ if (model.mm_3_w) {
+ // TODO: this is a hack to support Yi-type llava
+ model.proj_type = PROJECTOR_TYPE_MLP_NORM;
+ }
+ model.image_newline = get_tensor(TN_IMAGE_NEWLINE, false);
+ } break;
+ case PROJECTOR_TYPE_LDP:
+ {
+ // MobileVLM projection
+ model.mm_model_mlp_1_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 1, "weight"));
+ model.mm_model_mlp_1_b = get_tensor(string_format(TN_MVLM_PROJ_MLP, 1, "bias"));
+ model.mm_model_mlp_3_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 3, "weight"));
+ model.mm_model_mlp_3_b = get_tensor(string_format(TN_MVLM_PROJ_MLP, 3, "bias"));
+ model.mm_model_block_1_block_0_0_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 1, 0, "0.weight"));
+ model.mm_model_block_1_block_0_1_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 1, 0, "1.weight"));
+ model.mm_model_block_1_block_0_1_b = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 1, 0, "1.bias"));
+ model.mm_model_block_1_block_1_fc1_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 1, 1, "fc1.weight"));
+ model.mm_model_block_1_block_1_fc1_b = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 1, 1, "fc1.bias"));
+ model.mm_model_block_1_block_1_fc2_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 1, 1, "fc2.weight"));
+ model.mm_model_block_1_block_1_fc2_b = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 1, 1, "fc2.bias"));
+ model.mm_model_block_1_block_2_0_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 1, 2, "0.weight"));
+ model.mm_model_block_1_block_2_1_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 1, 2, "1.weight"));
+ model.mm_model_block_1_block_2_1_b = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 1, 2, "1.bias"));
+ model.mm_model_block_2_block_0_0_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 2, 0, "0.weight"));
+ model.mm_model_block_2_block_0_1_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 2, 0, "1.weight"));
+ model.mm_model_block_2_block_0_1_b = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 2, 0, "1.bias"));
+ model.mm_model_block_2_block_1_fc1_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 2, 1, "fc1.weight"));
+ model.mm_model_block_2_block_1_fc1_b = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 2, 1, "fc1.bias"));
+ model.mm_model_block_2_block_1_fc2_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 2, 1, "fc2.weight"));
+ model.mm_model_block_2_block_1_fc2_b = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 2, 1, "fc2.bias"));
+ model.mm_model_block_2_block_2_0_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 2, 2, "0.weight"));
+ model.mm_model_block_2_block_2_1_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 2, 2, "1.weight"));
+ model.mm_model_block_2_block_2_1_b = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 2, 2, "1.bias"));
+ } break;
+ case PROJECTOR_TYPE_LDPV2:
+ {
+ // MobilVLM_V2 projection
+ model.mm_model_mlp_0_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 0, "weight"));
+ model.mm_model_mlp_0_b = get_tensor(string_format(TN_MVLM_PROJ_MLP, 0, "bias"));
+ model.mm_model_mlp_2_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 2, "weight"));
+ model.mm_model_mlp_2_b = get_tensor(string_format(TN_MVLM_PROJ_MLP, 2, "bias"));
+ model.mm_model_peg_0_w = get_tensor(string_format(TN_MVLM_PROJ_PEG, 0, "weight"));
+ model.mm_model_peg_0_b = get_tensor(string_format(TN_MVLM_PROJ_PEG, 0, "bias"));
+ } break;
+ case PROJECTOR_TYPE_MINICPMV:
+ {
+ // model.mm_model_pos_embed = get_tensor(new_clip->ctx_data, TN_MINICPMV_POS_EMBD);
+ model.mm_model_pos_embed_k = get_tensor(TN_MINICPMV_POS_EMBD_K);
+ model.mm_model_query = get_tensor(TN_MINICPMV_QUERY);
+ model.mm_model_proj = get_tensor(TN_MINICPMV_PROJ);
+ model.mm_model_kv_proj = get_tensor(TN_MINICPMV_KV_PROJ);
+ model.mm_model_attn_q_w = get_tensor(string_format(TN_MINICPMV_ATTN, "q", "weight"));
+ model.mm_model_attn_k_w = get_tensor(string_format(TN_MINICPMV_ATTN, "k", "weight"));
+ model.mm_model_attn_v_w = get_tensor(string_format(TN_MINICPMV_ATTN, "v", "weight"));
+ model.mm_model_attn_q_b = get_tensor(string_format(TN_MINICPMV_ATTN, "q", "bias"));
+ model.mm_model_attn_k_b = get_tensor(string_format(TN_MINICPMV_ATTN, "k", "bias"));
+ model.mm_model_attn_v_b = get_tensor(string_format(TN_MINICPMV_ATTN, "v", "bias"));
+ model.mm_model_attn_o_w = get_tensor(string_format(TN_MINICPMV_ATTN, "out", "weight"));
+ model.mm_model_attn_o_b = get_tensor(string_format(TN_MINICPMV_ATTN, "out", "bias"));
+ model.mm_model_ln_q_w = get_tensor(string_format(TN_MINICPMV_LN, "q", "weight"));
+ model.mm_model_ln_q_b = get_tensor(string_format(TN_MINICPMV_LN, "q", "bias"));
+ model.mm_model_ln_kv_w = get_tensor(string_format(TN_MINICPMV_LN, "kv", "weight"));
+ model.mm_model_ln_kv_b = get_tensor(string_format(TN_MINICPMV_LN, "kv", "bias"));
+ model.mm_model_ln_post_w = get_tensor(string_format(TN_MINICPMV_LN, "post", "weight"));
+ model.mm_model_ln_post_b = get_tensor(string_format(TN_MINICPMV_LN, "post", "bias"));
+ } break;
+ case PROJECTOR_TYPE_GLM_EDGE:
+ {
+ model.mm_model_adapter_conv_w = get_tensor(string_format(TN_GLM_ADAPER_CONV, "weight"));
+ model.mm_model_adapter_conv_b = get_tensor(string_format(TN_GLM_ADAPER_CONV, "bias"));
+ model.mm_model_mlp_0_w = get_tensor(string_format(TN_GLM_ADAPTER_LINEAR, "weight"));
+ model.mm_model_ln_q_w = get_tensor(string_format(TN_GLM_ADAPTER_NORM_1, "weight"));
+ model.mm_model_ln_q_b = get_tensor(string_format(TN_GLM_ADAPTER_NORM_1, "bias"));
+ model.mm_model_mlp_1_w = get_tensor(string_format(TN_GLM_ADAPTER_D_H_2_4H, "weight"));
+ model.mm_model_mlp_2_w = get_tensor(string_format(TN_GLM_ADAPTER_GATE, "weight"));
+ model.mm_model_mlp_3_w = get_tensor(string_format(TN_GLM_ADAPTER_D_4H_2_H, "weight"));
+ model.mm_boi = get_tensor(string_format(TN_TOK_GLM_BOI));
+ model.mm_eoi = get_tensor(string_format(TN_TOK_GLM_EOI));
+ } break;
+ case PROJECTOR_TYPE_QWEN2VL:
+ case PROJECTOR_TYPE_QWEN25VL:
+ {
+ model.mm_0_w = get_tensor(string_format(TN_LLAVA_PROJ, 0, "weight"));
+ model.mm_0_b = get_tensor(string_format(TN_LLAVA_PROJ, 0, "bias"));
+ model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight"));
+ model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias"));
+ } break;
+ case PROJECTOR_TYPE_QWEN3VL:
+ {
+ model.mm_0_w = get_tensor(string_format(TN_LLAVA_PROJ, 0, "weight"));
+ model.mm_0_b = get_tensor(string_format(TN_LLAVA_PROJ, 0, "bias"));
+ model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight"));
+ model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias"));
+ } break;
+ case PROJECTOR_TYPE_YOUTUVL:
+ {
+ model.mm_input_norm_w = get_tensor(TN_MM_INP_NORM); // merger.ln_q (RMS norm)
+ model.mm_0_w = get_tensor(string_format(TN_LLAVA_PROJ, 0, "weight")); // merger.mlp.0
+ model.mm_0_b = get_tensor(string_format(TN_LLAVA_PROJ, 0, "bias"));
+ model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight")); // merger.mlp.2
+ model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias"));
+ } break;
+ case PROJECTOR_TYPE_GLM4V:
+ {
+ model.projection = get_tensor(TN_MM_PROJECTOR);
+ model.mm_ffn_up_w = get_tensor(string_format(TN_MM_UP, "weight"));
+ model.mm_ffn_up_b = get_tensor(string_format(TN_MM_UP, "bias"), false);
+ model.mm_ffn_gate_w = get_tensor(string_format(TN_MM_GATE, "weight"));
+ model.mm_ffn_gate_b = get_tensor(string_format(TN_MM_GATE, "bias"), false);
+ model.mm_ffn_down_w = get_tensor(string_format(TN_MM_DOWN, "weight"));
+ model.mm_ffn_down_b = get_tensor(string_format(TN_MM_DOWN, "bias"), false);
+ model.mm_post_norm_w = get_tensor(string_format(TN_MM_POST_NORM, "weight"));
+ model.mm_post_norm_b = get_tensor(string_format(TN_MM_POST_NORM, "bias"), false);
+ model.mm_patch_merger_w = get_tensor(string_format(TN_MM_PATCH_MERGER, "weight"));
+ model.mm_patch_merger_b = get_tensor(string_format(TN_MM_PATCH_MERGER, "bias"));
+ } break;
+ case PROJECTOR_TYPE_GEMMA3:
+ {
+ model.mm_input_proj_w = get_tensor(TN_MM_INP_PROJ);
+ model.mm_soft_emb_norm_w = get_tensor(TN_MM_SOFT_EMB_N);
+ } break;
+ case PROJECTOR_TYPE_GEMMA3NV:
+ {
+ model.mobilenet_stem_conv_w = get_tensor(TN_MNV5_STEM_CONV, false);
+ model.mobilenet_stem_conv_b = get_tensor(TN_MNV5_STEM_BIAS, false);
+ model.mobilenet_stem_norm_w = get_tensor(TN_MNV5_STEM_BN, false);
+
+ model.msfa_ffn_expand_w = get_tensor(TN_MNV5_MSFA_FFN_EXP_W, false);
+ model.msfa_ffn_expand_bn = get_tensor(TN_MNV5_MSFA_FFN_EXP_BN, false); // Consume BN if present but likely folded
+ model.msfa_ffn_project_w = get_tensor(TN_MNV5_MSFA_FFN_PROJ_W, false);
+ model.msfa_ffn_project_bn = get_tensor(TN_MNV5_MSFA_FFN_PROJ_BN, false);
+
+ model.msfa_concat_norm_w = get_tensor(TN_MNV5_MSFA_NORM, false);
+
+ // Dynamically load blocks stage by stage
+ for (int stage = 0; stage < 4; ++stage) {
+ int blocks_found_in_stage = 0;
+
+ for (int blk_idx = 0; ; ++blk_idx) {
+ bool found_block = false;
+ mobilenetv5_block block;
+
+ // 1. Check for Edge Residual (S0)
+ block.s0_conv_exp_w = get_tensor(string_format(TN_MNV5_BLK_S0_EXP_W, stage, blk_idx), false);
+ if (block.s0_conv_exp_w) {
+ found_block = true;
+ block.s0_bn1_w = get_tensor(string_format(TN_MNV5_BLK_S0_BN1_W, stage, blk_idx), false);
+ block.s0_conv_pwl_w = get_tensor(string_format(TN_MNV5_BLK_S0_PWL_W, stage, blk_idx), false);
+ block.s0_bn2_w = get_tensor(string_format(TN_MNV5_BLK_S0_BN2_W, stage, blk_idx), false);
+ }
+ // 2. Check for UIR (Universal Inverted Residual)
+ else {
+ // Check for dw_start OR pw_exp (some UIR blocks skip dw_start)
+ block.dw_start_w = get_tensor(string_format(TN_MNV5_BLK_DW_START_W, stage, blk_idx), false);
+ block.pw_exp_w = get_tensor(string_format(TN_MNV5_BLK_PW_EXP_W, stage, blk_idx), false);
+
+ if (block.dw_start_w || block.pw_exp_w) {
+ found_block = true;
+ if (block.dw_start_w) {
+ block.dw_start_bn_w = get_tensor(string_format(TN_MNV5_BLK_DW_START_BN, stage, blk_idx), false);
+ }
+ if (block.pw_exp_w) {
+ block.pw_exp_bn_w = get_tensor(string_format(TN_MNV5_BLK_PW_EXP_BN, stage, blk_idx), false);
+ }
+ block.dw_mid_w = get_tensor(string_format(TN_MNV5_BLK_DW_MID_W, stage, blk_idx), false);
+ if (block.dw_mid_w) {
+ block.dw_mid_bn_w = get_tensor(string_format(TN_MNV5_BLK_DW_MID_BN, stage, blk_idx), false);
+ }
+ block.pw_proj_w = get_tensor(string_format(TN_MNV5_BLK_PW_PROJ_W, stage, blk_idx), false);
+ if (block.pw_proj_w) {
+ block.pw_proj_bn_w = get_tensor(string_format(TN_MNV5_BLK_PW_PROJ_BN, stage, blk_idx), false);
+ }
+ block.layer_scale_w = get_tensor(string_format(TN_MNV5_BLK_LAYER_SCALE, stage, blk_idx), false);
+ }
+ }
+
+ // 3. Check for Attention (MQA)
+ // Even if UIR/Edge check failed, this might be a pure attention block
+ ggml_tensor* attn_q_check = get_tensor(string_format(TN_MNV5_ATTN_Q_W, stage, blk_idx), false);
+ if (attn_q_check) {
+ found_block = true;
+ block.attn_q_w = attn_q_check;
+ block.attn_k_w = get_tensor(string_format(TN_MNV5_ATTN_K_W, stage, blk_idx), false);
+ block.attn_v_w = get_tensor(string_format(TN_MNV5_ATTN_V_W, stage, blk_idx), false);
+ block.attn_o_w = get_tensor(string_format(TN_MNV5_ATTN_O_W, stage, blk_idx), false);
+ block.attn_k_dw_w = get_tensor(string_format(TN_MNV5_ATTN_K_DW, stage, blk_idx), false);
+ block.attn_k_norm_w = get_tensor(string_format(TN_MNV5_ATTN_K_NORM, stage, blk_idx), false);
+ block.attn_v_dw_w = get_tensor(string_format(TN_MNV5_ATTN_V_DW, stage, blk_idx), false);
+ block.attn_v_norm_w = get_tensor(string_format(TN_MNV5_ATTN_V_NORM, stage, blk_idx), false);
+ block.attn_norm_w = get_tensor(string_format(TN_MNV5_ATTN_NORM, stage, blk_idx), false);
+ // Note: Attention blocks also have layer_scale, load it if not already loaded by UIR check
+ if (!block.layer_scale_w) {
+ block.layer_scale_w = get_tensor(string_format(TN_MNV5_BLK_LAYER_SCALE, stage, blk_idx), false);
+ }
+ }
+
+ if (found_block) {
+ model.mobilenet_blocks.push_back(block);
+ blocks_found_in_stage++;
+ } else {
+ // End of blocks for this stage
+ break;
+ }
+ }
+
+ // Track where this stage ends in the flat vector
+ if (blocks_found_in_stage > 0) {
+ model.mobilenet_stage_ends.push_back(model.mobilenet_blocks.size() - 1);
+ LOG_INF("%s: Stage %d ended at global block index %zu\n", __func__, stage, model.mobilenet_blocks.size() - 1);
+ }
+ }
+ model.mm_input_proj_w = get_tensor(TN_MM_INP_PROJ);
+ model.mm_soft_emb_norm_w = get_tensor(TN_MM_SOFT_EMB_N);
+ } break;
+ case PROJECTOR_TYPE_IDEFICS3:
+ {
+ model.projection = get_tensor(TN_MM_PROJECTOR);
+ } break;
+ case PROJECTOR_TYPE_LFM2:
+ {
+ model.mm_input_norm_w = get_tensor(TN_MM_INP_NORM, false);
+ model.mm_input_norm_b = get_tensor(TN_MM_INP_NORM_B, false);
+ model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 1, "weight"));
+ model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 1, "bias"));
+ model.mm_2_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight"));
+ model.mm_2_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias"));
+ } break;
+ case PROJECTOR_TYPE_KIMIVL:
+ case PROJECTOR_TYPE_KIMIK25:
+ {
+ model.mm_input_norm_w = get_tensor(TN_MM_INP_NORM);
+ model.mm_input_norm_b = get_tensor(TN_MM_INP_NORM_B);
+ model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 1, "weight"));
+ model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 1, "bias"));
+ model.mm_2_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight"));
+ model.mm_2_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias"));
+ } break;
+ case PROJECTOR_TYPE_PIXTRAL:
+ {
+ model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 1, "weight"));
+ model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 1, "bias"), false);
+ model.mm_2_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight"));
+ model.mm_2_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias"), false);
+ // [IMG_BREAK] token embedding
+ model.token_embd_img_break = get_tensor(TN_TOK_IMG_BREAK);
+ // for mistral small 3.1
+ model.mm_input_norm_w = get_tensor(TN_MM_INP_NORM, false);
+ model.mm_patch_merger_w = get_tensor(string_format(TN_MM_PATCH_MERGER, "weight"), false);
+ } break;
+ case PROJECTOR_TYPE_LIGHTONOCR:
+ {
+ model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 1, "weight"));
+ model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 1, "bias"), false);
+ model.mm_2_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight"));
+ model.mm_2_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias"), false);
+ model.mm_input_norm_w = get_tensor(TN_MM_INP_NORM, false);
+ model.mm_patch_merger_w = get_tensor(string_format(TN_MM_PATCH_MERGER, "weight"), false);
+ } break;
+ case PROJECTOR_TYPE_ULTRAVOX:
+ {
+ model.conv1d_1_w = get_tensor(string_format(TN_CONV1D, 1, "weight"));
+ model.conv1d_1_b = get_tensor(string_format(TN_CONV1D, 1, "bias"));
+ model.conv1d_2_w = get_tensor(string_format(TN_CONV1D, 2, "weight"));
+ model.conv1d_2_b = get_tensor(string_format(TN_CONV1D, 2, "bias"));
+ model.mm_1_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 1, "weight"));
+ model.mm_2_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 2, "weight"));
+ model.mm_norm_pre_w = get_tensor(string_format(TN_MM_NORM_PRE, "weight"));
+ model.mm_norm_mid_w = get_tensor(string_format(TN_MM_NORM_MID, "weight"));
+ } break;
+ case PROJECTOR_TYPE_QWEN2A:
+ {
+ model.conv1d_1_w = get_tensor(string_format(TN_CONV1D, 1, "weight"));
+ model.conv1d_1_b = get_tensor(string_format(TN_CONV1D, 1, "bias"));
+ model.conv1d_2_w = get_tensor(string_format(TN_CONV1D, 2, "weight"));
+ model.conv1d_2_b = get_tensor(string_format(TN_CONV1D, 2, "bias"));
+ model.mm_fc_w = get_tensor(string_format(TN_MM_AUDIO_FC, "weight"));
+ model.mm_fc_b = get_tensor(string_format(TN_MM_AUDIO_FC, "bias"));
+ } break;
+ case PROJECTOR_TYPE_VOXTRAL:
+ {
+ model.conv1d_1_w = get_tensor(string_format(TN_CONV1D, 1, "weight"));
+ model.conv1d_1_b = get_tensor(string_format(TN_CONV1D, 1, "bias"));
+ model.conv1d_2_w = get_tensor(string_format(TN_CONV1D, 2, "weight"));
+ model.conv1d_2_b = get_tensor(string_format(TN_CONV1D, 2, "bias"));
+ model.mm_1_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 1, "weight"));
+ model.mm_2_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 2, "weight"));
+ } break;
+ case PROJECTOR_TYPE_MUSIC_FLAMINGO:
+ {
+ model.conv1d_1_w = get_tensor(string_format(TN_CONV1D, 1, "weight"));
+ model.conv1d_1_b = get_tensor(string_format(TN_CONV1D, 1, "bias"));
+ model.conv1d_2_w = get_tensor(string_format(TN_CONV1D, 2, "weight"));
+ model.conv1d_2_b = get_tensor(string_format(TN_CONV1D, 2, "bias"));
+ model.mm_1_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 1, "weight"));
+ model.mm_1_b = get_tensor(string_format(TN_MM_AUDIO_MLP, 1, "bias"));
+ model.mm_2_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 2, "weight"));
+ model.mm_2_b = get_tensor(string_format(TN_MM_AUDIO_MLP, 2, "bias"));
+ } break;
+ case PROJECTOR_TYPE_INTERNVL:
+ {
+ model.mm_0_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 0, "weight"));
+ model.mm_0_b = get_tensor(string_format(TN_MVLM_PROJ_MLP, 0, "bias"));
+ model.mm_1_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 1, "weight"));
+ model.mm_1_b = get_tensor(string_format(TN_MVLM_PROJ_MLP, 1, "bias"));
+ model.mm_3_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 3, "weight"));
+ model.mm_3_b = get_tensor(string_format(TN_MVLM_PROJ_MLP, 3, "bias"));
+ } break;
+ case PROJECTOR_TYPE_GLMA:
+ {
+ model.conv1d_1_w = get_tensor(string_format(TN_CONV1D, 1, "weight"));
+ model.conv1d_1_b = get_tensor(string_format(TN_CONV1D, 1, "bias"));
+ model.conv1d_2_w = get_tensor(string_format(TN_CONV1D, 2, "weight"));
+ model.conv1d_2_b = get_tensor(string_format(TN_CONV1D, 2, "bias"));
+ model.mm_1_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 1, "weight"));
+ model.mm_1_b = get_tensor(string_format(TN_MM_AUDIO_MLP, 1, "bias"));
+ model.mm_2_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 2, "weight"));
+ model.mm_2_b = get_tensor(string_format(TN_MM_AUDIO_MLP, 2, "bias"));
+ model.mm_norm_pre_w = get_tensor(string_format(TN_MM_NORM_PRE, "weight"));
+ model.mm_norm_pre_b = get_tensor(string_format(TN_MM_NORM_PRE, "bias"));
+ model.mm_boi = get_tensor(string_format(TN_TOK_BOI));
+ model.mm_eoi = get_tensor(string_format(TN_TOK_EOI));
+ } break;
+ case PROJECTOR_TYPE_LLAMA4:
+ {
+ model.mm_model_proj = get_tensor(TN_MM_PROJECTOR);
+ model.mm_model_mlp_1_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 1, "weight"));
+ model.mm_model_mlp_2_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 2, "weight"));
+ } break;
+ case PROJECTOR_TYPE_COGVLM:
+ {
+ model.mm_model_proj = get_tensor(TN_MM_PROJECTOR);
+ model.mm_post_fc_norm_w = get_tensor(string_format(TN_MM_POST_FC_NORM, "weight"));
+ model.mm_post_fc_norm_b = get_tensor(string_format(TN_MM_POST_FC_NORM, "bias"));
+ model.mm_h_to_4h_w = get_tensor(string_format(TN_MM_H_TO_4H, "weight"));
+ model.mm_gate_w = get_tensor(string_format(TN_MM_GATE, "weight"));
+ model.mm_4h_to_h_w = get_tensor(string_format(TN_MM_4H_TO_H, "weight"));
+ model.mm_boi = get_tensor(TN_TOK_BOI);
+ model.mm_eoi = get_tensor(TN_TOK_EOI);
+ } break;
+ case PROJECTOR_TYPE_JANUS_PRO:
+ {
+ model.mm_0_w = get_tensor(string_format(TN_LLAVA_PROJ, 0, "weight"));
+ model.mm_0_b = get_tensor(string_format(TN_LLAVA_PROJ, 0, "bias"));
+ model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 1, "weight"));
+ model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 1, "bias"));
+ } break;
+ case PROJECTOR_TYPE_LFM2A:
+ {
+ for (int i : {0, 2, 3, 5, 6}) {
+ model.pre_encode_conv_X_w[i] = get_tensor(string_format(TN_CONV1D, i, "weight"));
+ model.pre_encode_conv_X_b[i] = get_tensor(string_format(TN_CONV1D, i, "bias"));
+ }
+ model.pre_encode_out_w = get_tensor(string_format(TN_PRE_ENCODE_OUT, "weight"));
+ model.pre_encode_out_b = get_tensor(string_format(TN_PRE_ENCODE_OUT, "bias"));
+
+ model.mm_0_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 0, "weight"));
+ model.mm_0_b = get_tensor(string_format(TN_MM_AUDIO_MLP, 0, "bias"));
+ model.mm_1_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 1, "weight"));
+ model.mm_1_b = get_tensor(string_format(TN_MM_AUDIO_MLP, 1, "bias"));
+ model.mm_3_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 3, "weight"));
+ model.mm_3_b = get_tensor(string_format(TN_MM_AUDIO_MLP, 3, "bias"));
+
+ for (int il = 0; il < hparams.n_layer; ++il) {
+ auto & layer = model.layers[il];
+
+ layer.ff_norm_w = get_tensor(string_format(TN_FFN_NORM, prefix, il, "weight"));
+ layer.ff_norm_b = get_tensor(string_format(TN_FFN_NORM, prefix, il, "bias"));
+ layer.ff_norm_1_w = get_tensor(string_format(TN_FFN_NORM_1, prefix, il, "weight"));
+ layer.ff_norm_1_b = get_tensor(string_format(TN_FFN_NORM_1, prefix, il, "bias"));
+ layer.ff_up_1_w = get_tensor(string_format(TN_FFN_UP_1, prefix, il, "weight"));
+ layer.ff_up_1_b = get_tensor(string_format(TN_FFN_UP_1, prefix, il, "bias"));
+ layer.ff_down_1_w = get_tensor(string_format(TN_FFN_DOWN_1, prefix, il, "weight"));
+ layer.ff_down_1_b = get_tensor(string_format(TN_FFN_DOWN_1, prefix, il, "bias"));
+
+ layer.pos_bias_u = get_tensor(string_format(TN_POS_BIAS_U, prefix, il));
+ layer.pos_bias_v = get_tensor(string_format(TN_POS_BIAS_V, prefix, il));
+
+ layer.norm_conv_w = get_tensor(string_format(TN_NORM_CONV, prefix, il, "weight"));
+ layer.norm_conv_b = get_tensor(string_format(TN_NORM_CONV, prefix, il, "bias"));
+
+ layer.linear_pos_w = get_tensor(string_format(TN_LINEAR_POS, prefix, il, "weight"));
+
+ layer.conv_norm_w = get_tensor(string_format(TN_CONV_NORM, prefix, il, "weight"));
+ layer.conv_norm_b = get_tensor(string_format(TN_CONV_NORM, prefix, il, "bias"));
+ layer.conv_dw_w = get_tensor(string_format(TN_CONV_DW, prefix, il, "weight"));
+ layer.conv_dw_b = get_tensor(string_format(TN_CONV_DW, prefix, il, "bias"));
+ layer.conv_pw1_w = get_tensor(string_format(TN_CONV_PW1, prefix, il, "weight"));
+ layer.conv_pw1_b = get_tensor(string_format(TN_CONV_PW1, prefix, il, "bias"));
+ layer.conv_pw2_w = get_tensor(string_format(TN_CONV_PW2, prefix, il, "weight"));
+ layer.conv_pw2_b = get_tensor(string_format(TN_CONV_PW2, prefix, il, "bias"));
+ }
+ } break;
+ default:
+ GGML_ASSERT(false && "unknown projector type");
+ }
+
+ // load data
+ {
+ std::vector<uint8_t> read_buf;
+
+ auto fin = std::ifstream(fname, std::ios::binary);
+ if (!fin) {
+ throw std::runtime_error(string_format("%s: failed to open %s\n", __func__, fname.c_str()));
+ }
+
+ // alloc memory and offload data
+ ggml_backend_buffer_type_t buft = ggml_backend_get_default_buffer_type(ctx_clip.backend);
+ ctx_clip.buf.reset(ggml_backend_alloc_ctx_tensors_from_buft(ctx_clip.ctx_data.get(), buft));
+ ggml_backend_buffer_set_usage(ctx_clip.buf.get(), GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
+ for (auto & t : tensors_to_load) {
+ ggml_tensor * cur = ggml_get_tensor(ctx_clip.ctx_data.get(), t->name);
+ const size_t offset = tensor_offset[t->name];
+ fin.seekg(offset, std::ios::beg);
+ if (!fin) {
+ throw std::runtime_error(string_format("%s: failed to seek for tensor %s\n", __func__, t->name));
+ }
+ size_t num_bytes = ggml_nbytes(cur);
+ if (ggml_backend_buft_is_host(buft)) {
+ // for the CPU and Metal backend, we can read directly into the tensor
+ fin.read(reinterpret_cast<char *>(cur->data), num_bytes);
+ } else {
+ // read into a temporary buffer first, then copy to device memory
+ read_buf.resize(num_bytes);
+ fin.read(reinterpret_cast<char *>(read_buf.data()), num_bytes);
+ ggml_backend_tensor_set(cur, read_buf.data(), 0, num_bytes);
+ }
+ }
+ fin.close();
+
+ LOG_DBG("%s: loaded %zu tensors from %s\n", __func__, tensors_to_load.size(), fname.c_str());
+ }
+ }
+
+    // per-node record produced while probing backend support for a compute graph
+    struct support_info_op {
+        ggml_tensor * op; // the graph node this record describes
+
+        // true if the op runs on the accelerated ctx_clip.backend
+        bool is_accel = true;
+    };
+
+    // aggregate result of checking a whole graph against the backend
+    struct support_info_graph {
+        // whether the clip_ctx.backend supports flash attention
+        bool fattn = true;
+        ggml_tensor * fattn_op = nullptr; // first unsupported FLASH_ATTN_EXT node, for debugging
+
+        // one entry per graph node, in graph order
+        std::vector<support_info_op> ops;
+    };
+
+    // run a warmup pass with a synthetic single-entry batch sized from the model hparams
+    static void warmup(clip_ctx & ctx_clip) {
+        const auto & hparams = ctx_clip.model.hparams;
+        const bool is_vision = ctx_clip.model.modality == CLIP_MODALITY_VISION;
+
+        clip_image_f32_ptr dummy(clip_image_f32_init());
+        if (is_vision) {
+            dummy->nx = hparams.warmup_image_size;
+            dummy->ny = hparams.warmup_image_size;
+            LOG_INF("%s: warmup with image size = %d x %d\n", __func__, dummy->nx, dummy->ny);
+        } else {
+            // for audio, nx holds the warmup length and ny the number of mel bins
+            dummy->nx = hparams.warmup_audio_size;
+            dummy->ny = hparams.n_mel_bins;
+            LOG_INF("%s: warmup with audio size = %d\n", __func__, dummy->nx);
+        }
+
+        clip_image_f32_batch dummy_batch;
+        dummy_batch.entries.push_back(std::move(dummy));
+        warmup(ctx_clip, dummy_batch);
+    }
+
+    // allocate compute buffers for the given batch, resolving the flash-attention
+    // setting (AUTO probes backend support and falls back to DISABLED) and logging
+    // any graph ops the accelerated backend cannot run
+    static void warmup(clip_ctx & ctx_clip, const clip_image_f32_batch & batch) {
+        support_info_graph info;
+
+        if (ctx_clip.flash_attn_type == CLIP_FLASH_ATTN_TYPE_AUTO) {
+            // try to enable flash attention to see if it's supported
+            ctx_clip.flash_attn_type = CLIP_FLASH_ATTN_TYPE_ENABLED;
+            info = alloc_compute_meta(ctx_clip, batch);
+            if (!info.fattn && info.fattn_op) {
+                auto op = info.fattn_op;
+                LOG_WRN("%s: *****************************************************************\n", __func__);
+                LOG_WRN("%s: WARNING: flash attention not supported by %s, memory usage will increase\n", __func__, ggml_backend_name(ctx_clip.backend));
+                LOG_WRN("%s: op params: \n", __func__);
+                // fixed: ne[] is int64_t and nb[] is size_t; passing them to %d is
+                // undefined behavior (and prints garbage on 32-bit / LLP64 targets)
+                static auto print_shape = [](const char * fn, const char * name, ggml_tensor * t) {
+                    LOG_WRN("%s: %s: type = %s, ne = [%lld %lld %lld %lld], nb = [%zu %zu %zu %zu]\n", fn,
+                            name, ggml_type_name(t->type),
+                            (long long) t->ne[0], (long long) t->ne[1], (long long) t->ne[2], (long long) t->ne[3],
+                            (size_t) t->nb[0], (size_t) t->nb[1], (size_t) t->nb[2], (size_t) t->nb[3]);
+                };
+                print_shape(__func__, " dst", op);
+                print_shape(__func__, "src0", op->src[0]);
+                print_shape(__func__, "src1", op->src[1]);
+                print_shape(__func__, "src2", op->src[2]);
+                LOG_WRN("%s: please report this on github as an issue\n", __func__);
+                LOG_WRN("%s: *****************************************************************\n", __func__);
+                // fall back and re-allocate with flash attention disabled
+                ctx_clip.flash_attn_type = CLIP_FLASH_ATTN_TYPE_DISABLED;
+                alloc_compute_meta(ctx_clip, batch);
+            }
+        } else {
+            info = alloc_compute_meta(ctx_clip, batch);
+            if (!info.fattn && ctx_clip.flash_attn_type == CLIP_FLASH_ATTN_TYPE_ENABLED) {
+                LOG_WRN("%s: flash attention is not supported by the current backend; falling back to CPU (performance will be degraded)\n", __func__);
+            }
+        }
+
+        ctx_clip.is_allocated = true; // mark buffers as allocated
+
+        LOG_INF("%s: flash attention is %s\n", __func__,
+                (ctx_clip.flash_attn_type == CLIP_FLASH_ATTN_TYPE_ENABLED) ? "enabled" : "disabled");
+
+        // print ops that are not supported by the GPU backend (if there is one)
+        if (ctx_clip.backend && ctx_clip.backend != ctx_clip.backend_cpu) {
+            std::vector<support_info_op> unsupported_ops;
+            for (const auto & op : info.ops) {
+                if (!op.is_accel) {
+                    unsupported_ops.push_back(op);
+                }
+            }
+            if (!unsupported_ops.empty()) {
+                LOG_WRN("%s: *****************************************************************\n", __func__);
+                LOG_WRN("%s: WARNING: the CLIP graph uses unsupported operators by the backend\n", __func__);
+                LOG_WRN("%s: the performance will be suboptimal \n", __func__);
+                LOG_WRN("%s: list of unsupported ops (backend=%s):\n", __func__, ggml_backend_name(ctx_clip.backend));
+                for (const auto & op : unsupported_ops) {
+                    // fixed: ne[] is int64_t — cast explicitly for the variadic call
+                    LOG_WRN("%s: %16s: type = %s, ne = [%lld %lld %lld %lld]\n", __func__,
+                            ggml_op_name(op.op->op),
+                            ggml_type_name(op.op->type),
+                            (long long) op.op->ne[0], (long long) op.op->ne[1], (long long) op.op->ne[2], (long long) op.op->ne[3]);
+                }
+                LOG_WRN("%s: flash attention is %s\n", __func__,
+                        (ctx_clip.flash_attn_type == CLIP_FLASH_ATTN_TYPE_ENABLED) ? "enabled" : "disabled");
+                LOG_WRN("%s: please report this on github as an issue\n", __func__);
+                LOG_WRN("%s: ref: https://github.com/ggml-org/llama.cpp/pull/16837#issuecomment-3461676118\n", __func__);
+                LOG_WRN("%s: *****************************************************************\n", __func__);
+            }
+        }
+    }
+
+    // build the compute graph for `batch`, reserve scheduler buffers for it,
+    // and report which nodes the accelerated backend supports
+    static support_info_graph alloc_compute_meta(clip_ctx & ctx_clip, const clip_image_f32_batch & batch) {
+        // reserve scratch space for graph metadata before building the graph
+        ctx_clip.buf_compute_meta.resize(ctx_clip.max_nodes * ggml_tensor_overhead() + ggml_graph_overhead());
+
+        ggml_cgraph * gf = clip_image_build_graph(&ctx_clip, batch);
+        ggml_backend_sched_reserve(ctx_clip.sched.get(), gf);
+
+        // log the compute buffer size reserved on each backend
+        const size_t n_backends = ctx_clip.backend_ptrs.size();
+        for (size_t i = 0; i < n_backends; ++i) {
+            ggml_backend_t backend          = ctx_clip.backend_ptrs[i];
+            ggml_backend_buffer_type_t buft = ctx_clip.backend_buft[i];
+            const size_t buf_size = ggml_backend_sched_get_buffer_size(ctx_clip.sched.get(), backend);
+            if (buf_size > 1) {
+                LOG_INF("%s: %10s compute buffer size = %8.2f MiB\n", __func__,
+                        ggml_backend_buft_name(buft),
+                        buf_size / 1024.0 / 1024.0);
+            }
+        }
+
+        const int n_splits = ggml_backend_sched_get_n_splits(ctx_clip.sched.get());
+        const int n_nodes  = ggml_graph_n_nodes(gf);
+
+        LOG_INF("%s: graph splits = %d, nodes = %d\n", __func__, n_splits, n_nodes);
+
+        support_info_graph res {
+            /*.fattn    = */ true,
+            /*.fattn_op = */ nullptr,
+            /*.ops      = */ {},
+        };
+
+        // record per-node backend support; flash attention is tracked separately
+        // so the caller can decide whether to disable it
+        for (int i = 0; i < n_nodes; i++) {
+            ggml_tensor * node = ggml_graph_node(gf, i);
+            const bool is_accel = ggml_backend_supports_op(ctx_clip.backend, node);
+            res.ops.push_back({node, is_accel});
+            if (!is_accel && node->op == GGML_OP_FLASH_ATTN_EXT) {
+                res.fattn    = false;
+                res.fattn_op = node;
+            }
+        }
+
+        return res;
+    }
+
+    // read a bool from GGUF metadata; when the key is absent, `output` is left
+    // untouched unless `required`, in which case we throw
+    void get_bool(const std::string & key, bool & output, bool required = true) const {
+        const int idx = gguf_find_key(ctx_gguf.get(), key.c_str());
+        if (idx >= 0) {
+            output = gguf_get_val_bool(ctx_gguf.get(), idx);
+            return;
+        }
+        if (required) {
+            throw std::runtime_error("Key not found: " + key);
+        }
+    }
+
+    // read an i32 from GGUF metadata; missing + required => throw,
+    // missing + optional => `output` unchanged
+    void get_i32(const std::string & key, int & output, bool required = true) const {
+        const int idx = gguf_find_key(ctx_gguf.get(), key.c_str());
+        if (idx >= 0) {
+            output = gguf_get_val_i32(ctx_gguf.get(), idx);
+            return;
+        }
+        if (required) {
+            throw std::runtime_error("Key not found: " + key);
+        }
+    }
+
+    // read a u32 from GGUF metadata into a signed int
+    // NOTE(review): values above INT_MAX would wrap on the implicit conversion —
+    // presumably all keys read this way are small; confirm against the writers
+    void get_u32(const std::string & key, int & output, bool required = true) const {
+        const int idx = gguf_find_key(ctx_gguf.get(), key.c_str());
+        if (idx >= 0) {
+            output = gguf_get_val_u32(ctx_gguf.get(), idx);
+            return;
+        }
+        if (required) {
+            throw std::runtime_error("Key not found: " + key);
+        }
+    }
+
+    // read an f32 from GGUF metadata; missing + required => throw,
+    // missing + optional => `output` unchanged
+    void get_f32(const std::string & key, float & output, bool required = true) const {
+        const int idx = gguf_find_key(ctx_gguf.get(), key.c_str());
+        if (idx >= 0) {
+            output = gguf_get_val_f32(ctx_gguf.get(), idx);
+            return;
+        }
+        if (required) {
+            throw std::runtime_error("Key not found: " + key);
+        }
+    }
+
+    // read a string from GGUF metadata; missing + required => throw,
+    // missing + optional => `output` unchanged
+    void get_string(const std::string & key, std::string & output, bool required = true) const {
+        const int idx = gguf_find_key(ctx_gguf.get(), key.c_str());
+        if (idx >= 0) {
+            output = std::string(gguf_get_val_str(ctx_gguf.get(), idx));
+            return;
+        }
+        if (required) {
+            throw std::runtime_error("Key not found: " + key);
+        }
+    }
+
+    // read an int32 array from GGUF metadata into `output`; when the key is
+    // absent, `output` is left untouched unless `required`, in which case we throw
+    void get_arr_int(const std::string & key, std::vector<int> & output, bool required = true) const {
+        const int idx = gguf_find_key(ctx_gguf.get(), key.c_str());
+        if (idx < 0) {
+            if (required) {
+                throw std::runtime_error("Key not found: " + key);
+            }
+            return;
+        }
+        const int n = gguf_get_arr_n(ctx_gguf.get(), idx);
+        output.resize(n);
+        // NOTE(review): assumes the stored array element type is int32 — confirm
+        // that every writer of these keys emits GGUF_TYPE_INT32
+        const int32_t * values = (const int32_t *)gguf_get_arr_data(ctx_gguf.get(), idx);
+        // fixed: the original inner loop reused `i`, shadowing the key index above
+        for (int j = 0; j < n; ++j) {
+            output[j] = values[j];
+        }
+    }
+
+    // populate hparams.image_res_candidates with every (x, y) grid of base-size
+    // tiles up to max_patches_per_side per dimension, excluding the 1x1 base size
+    static void set_llava_uhd_res_candidates(clip_model & model, const int max_patches_per_side) {
+        auto & hparams = model.hparams;
+        const int base = hparams.image_size; // side length of one tile, invariant in the loop
+        for (int x = 1; x <= max_patches_per_side; x++) {
+            for (int y = 1; y <= max_patches_per_side; y++) {
+                if (x == 1 && y == 1) {
+                    continue; // the base resolution itself is not a candidate
+                }
+                hparams.image_res_candidates.push_back(clip_image_size{x*base, y*base});
+            }
+        }
+    }
+};
+
+// load a multimodal projector file: returns a vision context and/or an audio
+// context depending on which modalities the GGUF contains; both are null on failure
+struct clip_init_result clip_init(const char * fname, struct clip_context_params ctx_params) {
+    clip_ctx * ctx_vision = nullptr;
+    clip_ctx * ctx_audio  = nullptr;
+
+    try {
+        clip_model_loader loader(fname);
+
+        if (loader.has_vision) {
+            ctx_vision = new clip_ctx(ctx_params);
+            loader.load_hparams(ctx_vision->model, CLIP_MODALITY_VISION);
+            loader.load_tensors(*ctx_vision);
+            if (ctx_params.warmup) {
+                loader.warmup(*ctx_vision);
+            }
+        }
+
+        // TODO: we don't support audio for Gemma 3N, but GGUF contains audio tensors
+        // we can remove this check when we implement audio support for Gemma 3N
+        const bool skip_audio = ctx_vision != nullptr
+                             && ctx_vision->model.proj_type == PROJECTOR_TYPE_GEMMA3NV;
+
+        if (loader.has_audio && !skip_audio) {
+            ctx_audio = new clip_ctx(ctx_params);
+            loader.load_hparams(ctx_audio->model, CLIP_MODALITY_AUDIO);
+            loader.load_tensors(*ctx_audio);
+            if (ctx_params.warmup) {
+                loader.warmup(*ctx_audio);
+            }
+        }
+    } catch (const std::exception & e) {
+        LOG_ERR("%s: failed to load model '%s': %s\n", __func__, fname, e.what());
+
+        // release whatever was partially constructed before the failure
+        delete ctx_vision;
+        delete ctx_audio;
+
+        return {nullptr, nullptr};
+    }
+
+    return {ctx_vision, ctx_audio};
+}
+
+// C-API constructor; defaults to 448x448 — caller owns the result
+struct clip_image_size * clip_image_size_init() {
+    auto * sz = new clip_image_size();
+    sz->width  = 448;
+    sz->height = 448;
+    return sz;
+}
+
+// C-API constructors for the opaque image/batch types; callers own the result
+// (paired with the clip_image_*_free() helpers defined below)
+struct clip_image_u8 * clip_image_u8_init() {
+    return new clip_image_u8();
+}
+
+struct clip_image_f32 * clip_image_f32_init() {
+    return new clip_image_f32();
+}
+
+struct clip_image_f32_batch * clip_image_f32_batch_init() {
+    return new clip_image_f32_batch();
+}
+
+// expose the raw pixel buffer of an image; dimensions are reported through the
+// optional out-params (either may be null)
+unsigned char * clip_image_u8_get_data(struct clip_image_u8 * img, uint32_t * nx, uint32_t * ny) {
+    if (nx != nullptr) {
+        *nx = img->nx;
+    }
+    if (ny != nullptr) {
+        *ny = img->ny;
+    }
+    return img->buf.data();
+}
+
+// C-API destructor; safe to call with nullptr
+// (the explicit null check was redundant: `delete nullptr` is a no-op in C++)
+void clip_image_size_free(struct clip_image_size * load_image_size) {
+    delete load_image_size;
+}
+// C-API destructors for the opaque image/batch types (delete on nullptr is a no-op)
+void clip_image_u8_free(struct clip_image_u8 * img) { delete img; }
+void clip_image_f32_free(struct clip_image_f32 * img) { delete img; }
+void clip_image_u8_batch_free(struct clip_image_u8_batch * batch) { delete batch; }
+void clip_image_f32_batch_free(struct clip_image_f32_batch * batch) { delete batch; }
+
+// number of images currently stored in the batch
+size_t clip_image_f32_batch_n_images(const struct clip_image_f32_batch * batch) {
+    return batch->entries.size();
+}
+
+// width of the idx-th batch entry; logs and returns 0 when idx is out of range
+size_t clip_image_f32_batch_nx(const struct clip_image_f32_batch * batch, int idx) {
+    const bool in_range = idx >= 0 && idx < (int)batch->entries.size();
+    if (!in_range) {
+        LOG_ERR("%s: invalid index %d\n", __func__, idx);
+        return 0;
+    }
+    return batch->entries[idx]->nx;
+}
+
+// height of the idx-th batch entry; logs and returns 0 when idx is out of range
+size_t clip_image_f32_batch_ny(const struct clip_image_f32_batch * batch, int idx) {
+    const bool in_range = idx >= 0 && idx < (int)batch->entries.size();
+    if (!in_range) {
+        LOG_ERR("%s: invalid index %d\n", __func__, idx);
+        return 0;
+    }
+    return batch->entries[idx]->ny;
+}
+
+// borrow a pointer to the idx-th batch entry (batch retains ownership);
+// logs and returns nullptr when idx is out of range
+clip_image_f32 * clip_image_f32_get_img(const struct clip_image_f32_batch * batch, int idx) {
+    const bool in_range = idx >= 0 && idx < (int)batch->entries.size();
+    if (!in_range) {
+        LOG_ERR("%s: invalid index %d\n", __func__, idx);
+        return nullptr;
+    }
+    return batch->entries[idx].get();
+}
+
+// fill `img` from a packed RGB buffer (3 bytes per pixel), replacing its contents
+// fixed: the byte count was previously computed as `3 * nx * ny` in int, which
+// overflows for very large images — widen to size_t before multiplying
+void clip_build_img_from_pixels(const unsigned char * rgb_pixels, int nx, int ny, clip_image_u8 * img) {
+    img->nx = nx;
+    img->ny = ny;
+    const size_t n_bytes = (size_t)3 * nx * ny;
+    // assign() sizes the buffer and copies in one step (replaces resize + memcpy)
+    img->buf.assign(rgb_pixels, rgb_pixels + n_bytes);
+}
+
+// Normalize image to float32 - careful with pytorch .to(model.device, dtype=torch.float16) - this sometimes reduces precision (32>16>32), sometimes not
+static void normalize_image_u8_to_f32(const clip_image_u8 & src, clip_image_f32 & dst, const float mean[3], const float std[3]) {
+    dst.nx = src.nx;
+    dst.ny = src.ny;
+    dst.buf.resize(src.buf.size());
+
+    // TODO @ngxson : seems like this could be done more efficiently on cgraph
+    const size_t n = src.buf.size();
+    for (size_t i = 0; i < n; ++i) {
+        const int c = i % 3; // channel index within the interleaved rgb layout
+        dst.buf[i] = (static_cast<float>(src.buf[i]) / 255.0f - mean[c]) / std[c];
+    }
+}
+
// set of tools to manipulate images
// in the future, we can add HW acceleration by allowing this struct to access 3rd-party libs like ImageMagick or OpenCV
+struct img_tool {
+ enum resize_algo {
+ RESIZE_ALGO_BILINEAR,
+ RESIZE_ALGO_BICUBIC,
+ // RESIZE_ALGO_LANCZOS, // TODO
+ };
+
+ static void resize(
+ const clip_image_u8 & src,
+ clip_image_u8 & dst,
+ const clip_image_size & target_resolution,
+ resize_algo algo,
+ bool add_padding = true, // TODO: define the behavior for add_padding = false
+ std::array<uint8_t, 3> pad_color = {0, 0, 0}) {
+ dst.nx = target_resolution.width;
+ dst.ny = target_resolution.height;
+ dst.buf.resize(3 * dst.nx * dst.ny);
+
+ if (dst.nx == src.nx && dst.ny == src.ny) {
+ // no resize needed, simple copy
+ dst.buf = src.buf;
+ return;
+ }
+
+ if (!add_padding) {
+ // direct resize
+ switch (algo) {
+ case RESIZE_ALGO_BILINEAR:
+ resize_bilinear(src, dst, target_resolution.width, target_resolution.height);
+ break;
+ case RESIZE_ALGO_BICUBIC:
+ resize_bicubic(src, dst, target_resolution.width, target_resolution.height);
+ break;
+ default:
+ throw std::runtime_error("Unsupported resize algorithm");
+ }
+ } else {
+ // resize with padding
+ clip_image_u8 resized_image;
+ float scale_w = static_cast<float>(target_resolution.width) / src.nx;
+ float scale_h = static_cast<float>(target_resolution.height) / src.ny;
+ float scale = std::min(scale_w, scale_h);
+ int new_width = std::min(static_cast<int>(std::ceil(src.nx * scale)), target_resolution.width);
+ int new_height = std::min(static_cast<int>(std::ceil(src.ny * scale)), target_resolution.height);
+
+ switch (algo) {
+ case RESIZE_ALGO_BILINEAR:
+ resize_bilinear(src, resized_image, new_width, new_height);
+ break;
+ case RESIZE_ALGO_BICUBIC:
+ resize_bicubic(src, resized_image, new_width, new_height);
+ break;
+ default:
+ throw std::runtime_error("Unsupported resize algorithm");
+ }
+
+ // fill dst with pad_color
+ fill(dst, pad_color);
+
+ int offset_x = (target_resolution.width - new_width) / 2;
+ int offset_y = (target_resolution.height - new_height) / 2;
+
+ composite(dst, resized_image, offset_x, offset_y);
+ }
+ }
+
+ static void crop(const clip_image_u8 & image, clip_image_u8 & dst, int x, int y, int w, int h) {
+ dst.nx = w;
+ dst.ny = h;
+ dst.buf.resize(3 * w * h);
+
+ for (int i = 0; i < h; ++i) {
+ for (int j = 0; j < w; ++j) {
+ int src_idx = 3 * ((y + i)*image.nx + (x + j));
+ int dst_idx = 3 * (i*w + j);
+ dst.buf[dst_idx] = image.buf[src_idx];
+ dst.buf[dst_idx + 1] = image.buf[src_idx + 1];
+ dst.buf[dst_idx + 2] = image.buf[src_idx + 2];
+ }
+ }
+ }
+
+ // calculate the size of the **resized** image, while preserving the aspect ratio
+ // the calculated size will be aligned to the nearest multiple of align_size
+ // if H or W size is larger than longest_edge, it will be resized to longest_edge
+ static clip_image_size calc_size_preserved_ratio(const clip_image_size & inp_size, const int align_size, const int longest_edge) {
+ GGML_ASSERT(align_size > 0);
+ if (inp_size.width <= 0 || inp_size.height <= 0 || longest_edge <= 0) {
+ return {0, 0};
+ }
+
+ float scale = std::min(static_cast<float>(longest_edge) / inp_size.width,
+ static_cast<float>(longest_edge) / inp_size.height);
+
+ float target_width_f = static_cast<float>(inp_size.width) * scale;
+ float target_height_f = static_cast<float>(inp_size.height) * scale;
+
+ auto ceil_by_factor = [f = align_size](float x) { return static_cast<int>(std::ceil(x / static_cast<float>(f))) * f; };
+ int aligned_width = ceil_by_factor(target_width_f);
+ int aligned_height = ceil_by_factor(target_height_f);
+
+ return {aligned_width, aligned_height};
+ }
+
+ // calculate the size of the **resized** image, while preserving the aspect ratio
+ // the calculated size will have min_pixels <= W*H <= max_pixels
+ // this is referred as "smart_resize" in transformers code
+ static clip_image_size calc_size_preserved_ratio(const clip_image_size & inp_size, const int align_size, const int min_pixels, const int max_pixels) {
+ GGML_ASSERT(align_size > 0);
+ const int width = inp_size.width;
+ const int height = inp_size.height;
+
+ auto round_by_factor = [f = align_size](float x) { return static_cast<int>(std::round(x / static_cast<float>(f))) * f; };
+ auto ceil_by_factor = [f = align_size](float x) { return static_cast<int>(std::ceil(x / static_cast<float>(f))) * f; };
+ auto floor_by_factor = [f = align_size](float x) { return static_cast<int>(std::floor(x / static_cast<float>(f))) * f; };
+
+ // always align up first
+ int h_bar = std::max(align_size, round_by_factor(height));
+ int w_bar = std::max(align_size, round_by_factor(width));
+
+ if (h_bar * w_bar > max_pixels) {
+ const auto beta = std::sqrt(static_cast<float>(height * width) / max_pixels);
+ h_bar = std::max(align_size, floor_by_factor(height / beta));
+ w_bar = std::max(align_size, floor_by_factor(width / beta));
+ } else if (h_bar * w_bar < min_pixels) {
+ const auto beta = std::sqrt(static_cast<float>(min_pixels) / (height * width));
+ h_bar = ceil_by_factor(height * beta);
+ w_bar = ceil_by_factor(width * beta);
+ }
+
+ return {w_bar, h_bar};
+ }
+
+ // draw src image into dst image at offset (offset_x, offset_y)
+ static void composite(clip_image_u8 & dst, const clip_image_u8 & src, int offset_x, int offset_y) {
+ for (int y = 0; y < src.ny; ++y) {
+ for (int x = 0; x < src.nx; ++x) {
+ int dx = x + offset_x;
+ int dy = y + offset_y;
+ // skip pixels that would be out of bounds in the destination
+ if (dx < 0 || dy < 0 || dx >= dst.nx || dy >= dst.ny) {
+ continue;
+ }
+ size_t dst_idx = 3 * (static_cast<size_t>(dy) * dst.nx + static_cast<size_t>(dx));
+ size_t src_idx = 3 * (static_cast<size_t>(y) * src.nx + static_cast<size_t>(x));
+ dst.buf[dst_idx + 0] = src.buf[src_idx + 0];
+ dst.buf[dst_idx + 1] = src.buf[src_idx + 1];
+ dst.buf[dst_idx + 2] = src.buf[src_idx + 2];
+ }
+ }
+ }
+
+ // fill the image with a solid color
+ static void fill(clip_image_u8 & img, const std::array<uint8_t, 3> & color) {
+ for (size_t i = 0; i < img.buf.size(); i += 3) {
+ img.buf[i] = color[0];
+ img.buf[i + 1] = color[1];
+ img.buf[i + 2] = color[2];
+ }
+ }
+
+private:
+ // Bilinear resize function
+ static void resize_bilinear(const clip_image_u8 & src, clip_image_u8 & dst, int target_width, int target_height) {
+ dst.nx = target_width;
+ dst.ny = target_height;
+ dst.buf.resize(3 * target_width * target_height);
+
+ float x_ratio = static_cast<float>(src.nx - 1) / target_width;
+ float y_ratio = static_cast<float>(src.ny - 1) / target_height;
+
+ for (int y = 0; y < target_height; y++) {
+ for (int x = 0; x < target_width; x++) {
+ float px = x_ratio * x;
+ float py = y_ratio * y;
+ int x_floor = static_cast<int>(px);
+ int y_floor = static_cast<int>(py);
+ float x_lerp = px - x_floor;
+ float y_lerp = py - y_floor;
+
+ for (int c = 0; c < 3; c++) {
+ float top = lerp(
+ static_cast<float>(src.buf[3 * (y_floor * src.nx + x_floor) + c]),
+ static_cast<float>(src.buf[3 * (y_floor * src.nx + (x_floor + 1)) + c]),
+ x_lerp
+ );
+ float bottom = lerp(
+ static_cast<float>(src.buf[3 * ((y_floor + 1) * src.nx + x_floor) + c]),
+ static_cast<float>(src.buf[3 * ((y_floor + 1) * src.nx + (x_floor + 1)) + c]),
+ x_lerp
+ );
+ dst.buf[3 * (y * target_width + x) + c] = static_cast<uint8_t>(lerp(top, bottom, y_lerp));
+ }
+ }
+ }
+ }
+
+ // Bicubic resize function
+ // part of image will be cropped if the aspect ratio is different
+ static bool resize_bicubic(const clip_image_u8 & img, clip_image_u8 & dst, int target_width, int target_height) {
+ const int nx = img.nx;
+ const int ny = img.ny;
+
+ dst.nx = target_width;
+ dst.ny = target_height;
+ dst.buf.resize(3 * target_width * target_height);
+
+ float Cc;
+ float C[5] = {};
+ float d0, d2, d3, a0, a1, a2, a3;
+ int i, j, k, jj;
+ int x, y;
+ float dx, dy;
+ float tx, ty;
+
+ tx = (float)nx / (float)target_width;
+ ty = (float)ny / (float)target_height;
+
+ // Bicubic interpolation; adapted from ViT.cpp, inspired from :
+ // -> https://github.com/yglukhov/bicubic-interpolation-image-processing/blob/master/libimage.c#L36
+ // -> https://en.wikipedia.org/wiki/Bicubic_interpolation
+
+ for (i = 0; i < target_height; i++) {
+ for (j = 0; j < target_width; j++) {
+ x = (int)(tx * j);
+ y = (int)(ty * i);
+
+ dx = tx * j - x;
+ dy = ty * i - y;
+
+ for (k = 0; k < 3; k++) {
+ for (jj = 0; jj <= 3; jj++) {
+ d0 = img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x - 1, 0, nx - 1)) * 3 + k] - img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x, 0, nx - 1)) * 3 + k];
+ d2 = img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x + 1, 0, nx - 1)) * 3 + k] - img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x, 0, nx - 1)) * 3 + k];
+ d3 = img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x + 2, 0, nx - 1)) * 3 + k] - img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x, 0, nx - 1)) * 3 + k];
+ a0 = img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x, 0, nx - 1)) * 3 + k];
+
+ a1 = -1.0 / 3 * d0 + d2 - 1.0 / 6 * d3;
+ a2 = 1.0 / 2 * d0 + 1.0 / 2 * d2;
+ a3 = -1.0 / 6 * d0 - 1.0 / 2 * d2 + 1.0 / 6 * d3;
+
+ C[jj] = a0 + a1 * dx + a2 * dx * dx + a3 * dx * dx * dx;
+
+ d0 = C[0] - C[1];
+ d2 = C[2] - C[1];
+ d3 = C[3] - C[1];
+ a0 = C[1];
+ a1 = -1.0 / 3 * d0 + d2 - 1.0 / 6 * d3;
+ a2 = 1.0 / 2 * d0 + 1.0 / 2 * d2;
+ a3 = -1.0 / 6 * d0 - 1.0 / 2 * d2 + 1.0 / 6 * d3;
+ Cc = a0 + a1 * dy + a2 * dy * dy + a3 * dy * dy * dy;
+
+ const uint8_t Cc2 = std::min(std::max(std::round(Cc), 0.0f), 255.0f);
+ dst.buf[(i * target_width + j) * 3 + k] = float(Cc2);
+ }
+ }
+ }
+ }
+
+ return true;
+ }
+
+ static inline int clip(int x, int lower, int upper) {
+ return std::max(lower, std::min(x, upper));
+ }
+
+ // Linear interpolation between two points
+ static inline float lerp(float s, float e, float t) {
+ return s + (e - s) * t;
+ }
+};
+
+/**
+ * implementation of LLaVA-UHD:
+ * - https://arxiv.org/pdf/2403.11703
+ * - https://github.com/thunlp/LLaVA-UHD
+ * - https://github.com/thunlp/LLaVA-UHD/blob/302301bc2175f7e717fb8548516188e89f649753/llava_uhd/train/llava-uhd/slice_logic.py#L118
+ *
+ * overview:
 * - an image always has a single overview (downscaled image)
+ * - an image can have 0 or multiple slices, depending on the image size
+ * - each slice can then be considered as a separate image
+ *
+ * for example:
+ *
+ * [overview] --> [slice 1] --> [slice 2]
+ * | |
+ * +--> [slice 3] --> [slice 4]
+ */
struct llava_uhd {
    // position and size of a single slice inside the refined image
    struct slice_coordinates {
        int x;
        int y;
        clip_image_size size;
    };

    // complete recipe for turning one input image into [overview] + N slices
    struct slice_instructions {
        clip_image_size overview_size; // size of downscaled image
        clip_image_size refined_size; // size of image right before slicing (must be multiple of slice size)
        clip_image_size grid_size; // grid_size.width * grid_size.height = number of slices
        std::vector<slice_coordinates> slices;

        img_tool::resize_algo interpolation_overview = img_tool::RESIZE_ALGO_BILINEAR;
        bool padding_overview = false; // if true, refine image will be padded to the grid size (e.g. llava-1.6)
        std::array<uint8_t, 3> pad_color_overview = {0, 0, 0};

        img_tool::resize_algo interpolation_refined = img_tool::RESIZE_ALGO_BICUBIC;
        bool padding_refined = false; // if true, refine image will be padded to the grid size (e.g. llava-1.6)
        std::array<uint8_t, 3> pad_color_refined = {0, 0, 0};
    };

    // Compute slicing instructions for an image of original_size.
    // Two strategies:
    // - pinpoints: pick the best candidate resolution from a fixed list (e.g. llava-1.6)
    // - dynamic: search for the best grid given a max slice count (e.g. minicpmv)
    static slice_instructions get_slice_instructions(struct clip_ctx * ctx, const clip_image_size & original_size) {
        slice_instructions res;
        const int patch_size = clip_get_patch_size(ctx);
        const int slice_size = clip_get_image_size(ctx);
        const int original_width = original_size.width;
        const int original_height = original_size.height;

        // slicing kicks in only when the image exceeds the model's slice size in either dimension
        const bool has_slices = original_size.width > slice_size || original_size.height > slice_size;
        const bool has_pinpoints = !ctx->model.hparams.image_res_candidates.empty();

        if (!has_slices) {
            // skip slicing logic
            res.overview_size = clip_image_size{slice_size, slice_size};
            res.refined_size = clip_image_size{0, 0};
            res.grid_size = clip_image_size{0, 0};

            return res;
        }

        if (has_pinpoints) {
            // has pinpoints, use them to calculate the grid size (e.g. llava-1.6)
            auto refine_size = llava_uhd::select_best_resolution(
                original_size,
                ctx->model.hparams.image_res_candidates);
            res.overview_size = clip_image_size{slice_size, slice_size};
            res.refined_size = refine_size;
            res.grid_size = clip_image_size{0, 0};
            res.padding_refined = true;
            res.interpolation_refined = img_tool::RESIZE_ALGO_BILINEAR; // preserve old behavior when padding

            LOG_DBG("%s: using pinpoints for slicing\n", __func__);
            LOG_DBG("%s: original size: %d x %d, overview size: %d x %d, refined size: %d x %d\n",
                    __func__, original_width, original_height,
                    res.overview_size.width, res.overview_size.height,
                    res.refined_size.width, res.refined_size.height);

            // walk the refined image in slice_size steps; edge slices may be smaller
            for (int y = 0; y < refine_size.height; y += slice_size) {
                for (int x = 0; x < refine_size.width; x += slice_size) {
                    slice_coordinates slice;
                    slice.x = x;
                    slice.y = y;
                    slice.size.width = std::min(slice_size, refine_size.width - x);
                    slice.size.height = std::min(slice_size, refine_size.height - y);
                    res.slices.push_back(slice);
                    LOG_DBG("%s: slice %d: x=%d, y=%d, size=%dx%d\n",
                            __func__, (int)res.slices.size() - 1,
                            slice.x, slice.y, slice.size.width, slice.size.height);
                }
            }

            res.grid_size.height = refine_size.height / slice_size;
            res.grid_size.width = refine_size.width / slice_size;
            LOG_DBG("%s: grid size: %d x %d\n", __func__, res.grid_size.width, res.grid_size.height);

            return res;
        }

        // no pinpoints, dynamically calculate the grid size (e.g. minicpmv)

        auto best_size = get_best_resize(original_size, slice_size, patch_size, !has_slices);
        res.overview_size = best_size;

        {
            const int max_slice_nums = 9; // TODO: this is only used by minicpmv, maybe remove it
            // log of aspect ratio, and rough slice count estimate (area / slice area)
            const float log_ratio = log((float)original_width / original_height);
            const float ratio = (float)original_width * original_height / (slice_size * slice_size);
            const int multiple = fmin(ceil(ratio), max_slice_nums);

            auto best_grid = get_best_grid(max_slice_nums, multiple, log_ratio);
            auto refine_size = get_refine_size(original_size, best_grid, slice_size, patch_size, true);
            res.grid_size = best_grid;
            res.refined_size = refine_size;

            LOG_DBG("%s: original size: %d x %d, overview size: %d x %d, refined size: %d x %d, grid size: %d x %d\n",
                    __func__, original_width, original_height,
                    res.overview_size.width, res.overview_size.height,
                    res.refined_size.width, res.refined_size.height,
                    res.grid_size.width, res.grid_size.height);

            // refine_size is constructed to be an exact multiple of the grid,
            // so each slice has the uniform size grid_x x grid_y
            int width = refine_size.width;
            int height = refine_size.height;
            int grid_x = int(width / best_grid.width);
            int grid_y = int(height / best_grid.height);
            for (int patches_y = 0, ic = 0;
                 patches_y < refine_size.height && ic < best_grid.height;
                 patches_y += grid_y, ic += 1) {
                for (int patches_x = 0, jc = 0;
                     patches_x < refine_size.width && jc < best_grid.width;
                     patches_x += grid_x, jc += 1) {
                    slice_coordinates slice;
                    slice.x = patches_x;
                    slice.y = patches_y;
                    slice.size.width = grid_x;
                    slice.size.height = grid_y;
                    res.slices.push_back(slice);
                    LOG_DBG("%s: slice %d: x=%d, y=%d, size=%dx%d\n",
                            __func__, (int)res.slices.size() - 1,
                            slice.x, slice.y, slice.size.width, slice.size.height);
                }
            }
        }

        return res;
    }

    // Execute a slice_instructions recipe on an image.
    // Output layout: output[0] is always the overview; output[1..] are the slices (if any).
    static std::vector<clip_image_u8_ptr> slice_image(const clip_image_u8 * img, const slice_instructions & inst) {
        std::vector<clip_image_u8_ptr> output;

        // resize to overview size
        clip_image_u8_ptr resized_img(clip_image_u8_init());
        img_tool::resize(*img, *resized_img, inst.overview_size, inst.interpolation_overview,
                         inst.padding_overview, inst.pad_color_overview);
        output.push_back(std::move(resized_img));

        if (inst.slices.empty()) {
            // no slices, just return the resized image
            return output;
        }

        // resize to refined size
        clip_image_u8_ptr refined_img(clip_image_u8_init());
        img_tool::resize(*img, *refined_img, inst.refined_size, inst.interpolation_refined,
                         inst.padding_refined, inst.pad_color_refined);

        // create slices
        for (const auto & slice : inst.slices) {
            int x = slice.x;
            int y = slice.y;
            int w = slice.size.width;
            int h = slice.size.height;

            clip_image_u8_ptr img_slice(clip_image_u8_init());
            img_tool::crop(*refined_img, *img_slice, x, y, w, h);
            output.push_back(std::move(img_slice));
        }

        return output;
    }

private:
    // Aspect-ratio-preserving downscale so that the area fits scale_resolution^2,
    // with both sides rounded to a multiple of patch_size (see ensure_divide).
    // allow_upscale also rescales images that are already small enough.
    static clip_image_size get_best_resize(const clip_image_size & original_size, int scale_resolution, int patch_size, bool allow_upscale = false) {
        int width = original_size.width;
        int height = original_size.height;
        if ((width * height > scale_resolution * scale_resolution) || allow_upscale) {
            float r = static_cast<float>(width) / height;
            height = static_cast<int>(scale_resolution / std::sqrt(r));
            width = static_cast<int>(height * r);
        }
        clip_image_size res;
        res.width = ensure_divide(width, patch_size);
        res.height = ensure_divide(height, patch_size);
        return res;
    }

    // Scale orig so it fits inside target_max while preserving aspect ratio (truncating to int).
    static clip_image_size resize_maintain_aspect_ratio(const clip_image_size & orig, const clip_image_size & target_max) {
        float scale_width = static_cast<float>(target_max.width) / orig.width;
        float scale_height = static_cast<float>(target_max.height) / orig.height;
        float scale = std::min(scale_width, scale_height);
        return clip_image_size{
            static_cast<int>(orig.width * scale),
            static_cast<int>(orig.height * scale),
        };
    }

    /**
     * Selects the best resolution from a list of possible resolutions based on the original size.
     *
     * For example, when given a list of resolutions:
     *  - 100x100
     *  - 200x100
     *  - 100x200
     *  - 200x200
     *
     * And an input image of size 111x200, then 100x200 is the best fit (least wasted resolution).
     *
     * @param original_size The original size of the image
     * @param possible_resolutions A list of possible resolutions
     * @return The best fit resolution
     */
    static clip_image_size select_best_resolution(const clip_image_size & original_size, const std::vector<clip_image_size> & possible_resolutions) {
        clip_image_size best_fit;
        int min_wasted_area = std::numeric_limits<int>::max();
        int max_effective_resolution = 0;

        for (const clip_image_size & candidate : possible_resolutions) {
            auto target_size = resize_maintain_aspect_ratio(original_size, candidate);
            // effective resolution: how much of the original image survives the fit
            int effective_resolution = std::min(
                target_size.width * target_size.height,
                original_size.width * original_size.height);
            // wasted area: candidate pixels not covered by the fitted image
            int wasted_area = (candidate.width * candidate.height) - effective_resolution;

            // maximize effective resolution; break ties by minimizing waste
            if (effective_resolution > max_effective_resolution || (effective_resolution == max_effective_resolution && wasted_area < min_wasted_area)) {
                max_effective_resolution = effective_resolution;
                min_wasted_area = wasted_area;
                best_fit = candidate;
            }

            LOG_DBG("%s: candidate: %d x %d, target: %d x %d, wasted: %d, effective: %d\n", __func__, candidate.width, candidate.height, target_size.width, target_size.height, wasted_area, effective_resolution);
        }

        return best_fit;
    }

    // round length to the nearest multiple of patch_size, with a floor of one patch_size
    static int ensure_divide(int length, int patch_size) {
        return std::max(static_cast<int>(std::round(static_cast<float>(length) / patch_size) * patch_size), patch_size);
    }

    // Compute the refined (pre-slicing) size for a given grid: each grid cell is resized
    // with get_best_resize, then the refined size is cell size times the grid dimensions,
    // guaranteeing the refined image divides evenly into grid_x * grid_y slices.
    static clip_image_size get_refine_size(const clip_image_size & original_size, const clip_image_size & grid, int scale_resolution, int patch_size, bool allow_upscale = false) {
        int width = original_size.width;
        int height = original_size.height;
        int grid_x = grid.width;
        int grid_y = grid.height;

        int refine_width = ensure_divide(width, grid_x);
        int refine_height = ensure_divide(height, grid_y);

        clip_image_size grid_size;
        grid_size.width = refine_width / grid_x;
        grid_size.height = refine_height / grid_y;

        auto best_grid_size = get_best_resize(grid_size, scale_resolution, patch_size, allow_upscale);
        int best_grid_width = best_grid_size.width;
        int best_grid_height = best_grid_size.height;

        clip_image_size refine_size;
        refine_size.width = best_grid_width * grid_x;
        refine_size.height = best_grid_height * grid_y;
        return refine_size;
    }

    // Choose the grid (columns x rows) whose aspect ratio is closest (in log space)
    // to the image's aspect ratio, among factorizations of slice counts near `multiple`.
    static clip_image_size get_best_grid(const int max_slice_nums, const int multiple, const float log_ratio) {
        // candidate slice counts: multiple +/- 1, excluding 1 and anything above the cap
        std::vector<int> candidate_split_grids_nums;
        for (int i : {multiple - 1, multiple, multiple + 1}) {
            if (i == 1 || i > max_slice_nums) {
                continue;
            }
            candidate_split_grids_nums.push_back(i);
        }

        // enumerate every (m, n/m) factorization of each candidate count
        std::vector<clip_image_size> candidate_grids;
        for (int split_grids_nums : candidate_split_grids_nums) {
            int m = 1;
            while (m <= split_grids_nums) {
                if (split_grids_nums % m == 0) {
                    candidate_grids.push_back(clip_image_size{m, split_grids_nums / m});
                }
                ++m;
            }
        }

        // pick the grid minimizing |log(image ratio) - log(grid ratio)|
        clip_image_size best_grid{1, 1};
        float min_error = std::numeric_limits<float>::infinity();
        for (const auto& grid : candidate_grids) {
            float error = std::abs(log_ratio - std::log(1.0 * grid.width / grid.height));
            if (error < min_error) {
                best_grid = grid;
                min_error = error;
            }
        }
        return best_grid;
    }
};
+
+// ref: https://github.com/huggingface/transformers/blob/v5.1.0/src/transformers/models/lfm2_vl/image_processing_lfm2_vl_fast.py
+// some of the logic is similar to llava_uhd, but with different hyperparameters and some logic is unique (e.g. grid layout)
+struct lfm2_vl_image_processor {
+ // ref: https://huggingface.co/LiquidAI/LFM2.5-VL-1.6B/blob/main/processor_config.json
+ static constexpr int min_tiles = 2;
+ static constexpr int max_tiles = 10;
+ static constexpr float max_pixels_tolerance = 2.0f;
+ static constexpr int tile_size = 512;
+
+ static llava_uhd::slice_instructions get_slice_instructions(struct clip_ctx * ctx, const clip_image_size & original_size) {
+ llava_uhd::slice_instructions inst;
+ const auto & params = ctx->model.hparams;
+ const int align_size = params.patch_size * params.n_merge;
+
+ inst.interpolation_overview = img_tool::RESIZE_ALGO_BILINEAR;
+ inst.interpolation_refined = img_tool::RESIZE_ALGO_BILINEAR;
+ inst.overview_size = img_tool::calc_size_preserved_ratio(original_size, align_size, params.image_min_pixels, params.image_max_pixels);
+
+ // tile if either dimension exceeds tile_size with tolerance
+ const bool needs_tiling = original_size.width > tile_size * max_pixels_tolerance || original_size.height > tile_size * max_pixels_tolerance;
+
+ if (!needs_tiling) {
+ inst.refined_size = clip_image_size{0, 0};
+ inst.grid_size = clip_image_size{0, 0};
+ return inst;
+ }
+
+ const clip_image_size grid = get_grid_layout(original_size.height, original_size.width);
+
+ inst.grid_size = grid;
+ inst.refined_size = clip_image_size{tile_size * grid.width, tile_size * grid.height};
+
+ LOG_DBG("%s: original size: %d x %d, overview size: %d x %d, refined size: %d x %d, grid size: %d x %d\n",
+ __func__,
+ original_size.width, original_size.height,
+ inst.overview_size.width, inst.overview_size.height,
+ inst.refined_size.width, inst.refined_size.height,
+ grid.width, grid.height);
+
+ for (int row = 0; row < grid.height; row++) {
+ for (int col = 0; col < grid.width; col++) {
+ llava_uhd::slice_coordinates slice;
+ slice.x = col * tile_size;
+ slice.y = row * tile_size;
+ slice.size = clip_image_size{tile_size, tile_size};
+ inst.slices.push_back(slice);
+ LOG_DBG("%s: slice %d: x=%d, y=%d, size=%d x %d\n",
+ __func__, (int)inst.slices.size() - 1,
+ slice.x, slice.y, slice.size.width, slice.size.height);
+ }
+ }
+
+ return inst;
+ }
+
+private:
+ static clip_image_size find_closest_aspect_ratio(
+ float aspect_ratio,
+ const std::vector<clip_image_size> & target_ratios,
+ int width, int height) {
+ float best_ratio_diff = std::numeric_limits<float>::max();
+ clip_image_size best_ratio = {1, 1};
+ const float area = static_cast<float>(width * height);
+
+ for (const auto & ratio : target_ratios) {
+ const float target_aspect_ratio = static_cast<float>(ratio.width) / ratio.height;
+ const float ratio_diff = std::abs(aspect_ratio - target_aspect_ratio);
+ if (ratio_diff < best_ratio_diff) {
+ best_ratio_diff = ratio_diff;
+ best_ratio = ratio;
+ } else if (ratio_diff == best_ratio_diff) {
+ const float target_area = static_cast<float>(tile_size * tile_size * ratio.width * ratio.height);
+ if (area > 0.5f * target_area) {
+ best_ratio = ratio;
+ }
+ }
+ }
+ return best_ratio;
+ }
+
+ static std::vector<clip_image_size> get_target_ratios() {
+ std::vector<clip_image_size> ratios;
+ for (int n = min_tiles; n <= max_tiles; n++) {
+ for (int w = 1; w <= n; w++) {
+ for (int h = 1; h <= n; h++) {
+ if (w * h >= min_tiles && w * h <= max_tiles) {
+ bool found = false;
+ for (const auto & r : ratios) {
+ if (r.width == w && r.height == h) {
+ found = true;
+ break;
+ }
+ }
+ if (!found) {
+ ratios.push_back({w, h});
+ }
+ }
+ }
+ }
+ }
+ std::sort(ratios.begin(), ratios.end(), [](const clip_image_size & a, const clip_image_size & b) {
+ return a.width * a.height < b.width * b.height;
+ });
+ return ratios;
+ }
+
+ static clip_image_size get_grid_layout(int height, int width) {
+ const float aspect_ratio = static_cast<float>(width) / height;
+ const auto ratios = get_target_ratios();
+ return find_closest_aspect_ratio(aspect_ratio, ratios, width, height);
+ }
+};
+
+// returns the normalized float tensor for llava-1.5, for spatial_unpad with anyres processing for llava-1.6 it returns the normalized image patch tensors as a vector
+// res_imgs memory is being allocated here, previous allocations will be freed if found
+bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, struct clip_image_f32_batch * res_imgs) {
+ clip_image_size original_size{img->nx, img->ny};
+ auto & params = ctx->model.hparams;
+
+ switch (ctx->proj_type()) {
+ case PROJECTOR_TYPE_MINICPMV:
+ {
+ auto const inst = llava_uhd::get_slice_instructions(ctx, original_size);
+ std::vector<clip_image_u8_ptr> imgs = llava_uhd::slice_image(img, inst);
+
+ for (size_t i = 0; i < imgs.size(); ++i) {
+ // clip_image_save_to_bmp(*imgs[i], "slice_" + std::to_string(i) + ".bmp");
+ clip_image_f32_ptr res(clip_image_f32_init());
+ normalize_image_u8_to_f32(*imgs[i], *res, params.image_mean, params.image_std);
+ res_imgs->entries.push_back(std::move(res));
+ }
+
+ res_imgs->grid_x = inst.grid_size.width;
+ res_imgs->grid_y = inst.grid_size.height;
+ } break;
+
+ case PROJECTOR_TYPE_QWEN2VL:
+ case PROJECTOR_TYPE_QWEN25VL:
+ case PROJECTOR_TYPE_QWEN3VL:
+ case PROJECTOR_TYPE_GLM4V:
+ {
+ GGML_ASSERT(params.image_min_pixels > 0 && params.image_max_pixels > 0);
+ clip_image_u8 resized;
+ const clip_image_size new_size = img_tool::calc_size_preserved_ratio(
+ original_size,
+ params.patch_size * 2,
+ params.image_min_pixels,
+ params.image_max_pixels);
+ img_tool::resize(*img, resized, new_size, img_tool::RESIZE_ALGO_BILINEAR, false);
+ // clip_image_save_to_bmp(resized, "preproc.bmp");
+ clip_image_f32_ptr img_f32(clip_image_f32_init());
+ // clip_image_f32_ptr res(clip_image_f32_init());
+ normalize_image_u8_to_f32(resized, *img_f32, params.image_mean, params.image_std);
+ // res_imgs->data[0] = *res;
+ res_imgs->entries.push_back(std::move(img_f32));
+ } break;
+ case PROJECTOR_TYPE_YOUTUVL:
+ {
+ const int patch_size = params.patch_size; // typically 16
+ const int merge_size = params.n_merge; // typically 2
+ const int align_size = patch_size * merge_size; // 32
+
+ const int max_num_patches = params.image_max_pixels > 0 ?
+ params.image_max_pixels / (patch_size * patch_size) : 256;
+
+ // Linear search for optimal scale to fit within max_num_patches
+ float scale = 1.0f;
+ int target_height = original_size.height;
+ int target_width = original_size.width;
+
+ auto get_scaled_image_size = [align_size](float scale, int size) -> int {
+ float scaled_size = size * scale;
+ // Round up to nearest multiple of align_size
+ int aligned = static_cast<int>(std::ceil(scaled_size / align_size)) * align_size;
+ // Ensure at least one patch
+ return std::max(align_size, aligned);
+ };
+
+ // Linear search with 0.02 step size
+ while (scale > 0.0f) {
+ target_height = get_scaled_image_size(scale, original_size.height);
+ target_width = get_scaled_image_size(scale, original_size.width);
+
+ int num_patches_h = target_height / patch_size;
+ int num_patches_w = target_width / patch_size;
+ int num_patches = num_patches_h * num_patches_w;
+
+ if (num_patches > max_num_patches) {
+ scale -= 0.02f;
+ } else {
+ break;
+ }
+ }
+
+ clip_image_size new_size = {target_width, target_height};
+
+ // Resize the image
+ clip_image_u8 resized;
+ img_tool::resize(*img, resized, new_size, img_tool::RESIZE_ALGO_BILINEAR, false);
+
+ // Normalize to float32
+ clip_image_f32_ptr img_f32(clip_image_f32_init());
+ normalize_image_u8_to_f32(resized, *img_f32, params.image_mean, params.image_std);
+
+ // Add to results
+ res_imgs->entries.push_back(std::move(img_f32));
+ } break;
+
+ case PROJECTOR_TYPE_IDEFICS3:
+ {
+ // The refined size has two steps:
+ // 1. Resize w/ aspect-ratio preserving such that the longer side is
+ // the preprocessor longest size
+ // 2. Resize w/out preserving aspect ratio such that both sides are
+ // multiples of image_size (always rounding up)
+ //
+ // CITE: https://github.com/huggingface/transformers/blob/main/src/transformers/models/idefics3/image_processing_idefics3.py#L737
+ const clip_image_size refined_size = img_tool::calc_size_preserved_ratio(
+ original_size, params.image_size, params.image_longest_edge);
+ // LOG_INF("%s: original size: %d x %d, refined size: %d x %d\n",
+ // __func__, original_size.width, original_size.height,
+ // refined_size.width, refined_size.height);
+
+ llava_uhd::slice_instructions instructions;
+ instructions.overview_size = clip_image_size{params.image_size, params.image_size};
+ instructions.refined_size = refined_size;
+ instructions.grid_size = clip_image_size{
+ static_cast<int>(std::ceil(static_cast<float>(refined_size.width) / params.image_size)),
+ static_cast<int>(std::ceil(static_cast<float>(refined_size.height) / params.image_size)),
+ };
+ for (int y = 0; y < refined_size.height; y += params.image_size) {
+ for (int x = 0; x < refined_size.width; x += params.image_size) {
+ // LOG_INF("%s: adding slice at x=%d, y=%d\n", __func__, x, y);
+ instructions.slices.push_back(llava_uhd::slice_coordinates{
+ /* x */x,
+ /* y */y,
+ /* size */clip_image_size{
+ std::min(params.image_size, refined_size.width - x),
+ std::min(params.image_size, refined_size.height - y)
+ }
+ });
+ }
+ }
+ auto imgs = llava_uhd::slice_image(img, instructions);
+
+ // cast and normalize to f32
+ for (size_t i = 0; i < imgs.size(); ++i) {
+ // clip_image_save_to_bmp(*imgs[i], "slice_" + std::to_string(i) + ".bmp");
+ clip_image_f32_ptr res(clip_image_f32_init());
+ normalize_image_u8_to_f32(*imgs[i], *res, params.image_mean, params.image_std);
+ res_imgs->entries.push_back(std::move(res));
+ }
+
+ res_imgs->grid_x = instructions.grid_size.width;
+ res_imgs->grid_y = instructions.grid_size.height;
+ } break;
+
+ case PROJECTOR_TYPE_GLM_EDGE:
+ case PROJECTOR_TYPE_GEMMA3:
+ case PROJECTOR_TYPE_INTERNVL: // TODO @ngxson : support dynamic resolution
+ {
+ clip_image_u8 resized_image;
+ int sz = params.image_size;
+ img_tool::resize(*img, resized_image, {sz, sz}, img_tool::RESIZE_ALGO_BILINEAR);
+ clip_image_f32_ptr img_f32(clip_image_f32_init());
+ //clip_image_save_to_bmp(resized_image, "resized.bmp");
+ normalize_image_u8_to_f32(resized_image, *img_f32, params.image_mean, params.image_std);
+ res_imgs->entries.push_back(std::move(img_f32));
+ } break;
+
+ case PROJECTOR_TYPE_GEMMA3NV:
+ {
+ clip_image_u8 resized_image;
+ int sz = params.image_size;
+ img_tool::resize(*img, resized_image, {sz, sz}, img_tool::RESIZE_ALGO_BILINEAR, false);
+ clip_image_f32_ptr img_f32(clip_image_f32_init());
+ normalize_image_u8_to_f32(resized_image, *img_f32, params.image_mean, params.image_std);
+ res_imgs->entries.push_back(std::move(img_f32));
+ } break;
+
+ case PROJECTOR_TYPE_JANUS_PRO:
+ {
+ // Janus Pro preprocessing: pad to square with gray(127), resize to 384x384
+ const std::array<uint8_t, 3> pad_color = {127, 127, 127};
+ clip_image_u8 resized_image;
+ int sz = params.image_size;
+ img_tool::resize(*img, resized_image, {sz, sz}, img_tool::RESIZE_ALGO_BILINEAR, true, pad_color);
+ clip_image_f32_ptr img_f32(clip_image_f32_init());
+ normalize_image_u8_to_f32(resized_image, *img_f32, params.image_mean, params.image_std);
+ res_imgs->entries.push_back(std::move(img_f32));
+ } break;
+
+ case PROJECTOR_TYPE_PIXTRAL:
+ case PROJECTOR_TYPE_LIGHTONOCR:
+ {
+ GGML_ASSERT(params.image_min_pixels > 0 && params.image_max_pixels > 0);
+ clip_image_u8 resized_image;
+ // the original pixtral model doesn't have n_merge
+ const int cur_merge = params.n_merge == 0 ? 1 : params.n_merge;
+ const clip_image_size target_size = img_tool::calc_size_preserved_ratio(
+ original_size,
+ params.patch_size * cur_merge,
+ params.image_min_pixels,
+ params.image_max_pixels);
+ img_tool::resize(*img, resized_image, target_size, img_tool::RESIZE_ALGO_BILINEAR);
+ clip_image_f32_ptr img_f32(clip_image_f32_init());
+ normalize_image_u8_to_f32(resized_image, *img_f32, params.image_mean, params.image_std);
+ res_imgs->entries.push_back(std::move(img_f32));
+ } break;
+
+ case PROJECTOR_TYPE_LLAMA4:
+ {
+ GGML_ASSERT(!params.image_res_candidates.empty());
+ auto const inst = llava_uhd::get_slice_instructions(ctx, original_size);
+ std::vector<clip_image_u8_ptr> imgs = llava_uhd::slice_image(img, inst);
+
+ for (size_t i = 0; i < imgs.size(); ++i) {
+ clip_image_f32_ptr res(clip_image_f32_init());
+ normalize_image_u8_to_f32(*imgs[i], *res, params.image_mean, params.image_std);
+ res_imgs->entries.push_back(std::move(res));
+ }
+
+ res_imgs->grid_x = inst.grid_size.width;
+ res_imgs->grid_y = inst.grid_size.height;
+ } break;
+
+ case PROJECTOR_TYPE_LFM2:
+ {
+ auto const inst = lfm2_vl_image_processor::get_slice_instructions(ctx, original_size);
+ std::vector<clip_image_u8_ptr> imgs = llava_uhd::slice_image(img, inst);
+
+ for (size_t i = 0; i < imgs.size(); ++i) {
+ clip_image_f32_ptr res(clip_image_f32_init());
+ normalize_image_u8_to_f32(*imgs[i], *res, params.image_mean, params.image_std);
+ res_imgs->entries.push_back(std::move(res));
+ }
+
+ res_imgs->grid_x = inst.grid_size.width;
+ res_imgs->grid_y = inst.grid_size.height;
+ } break;
+
+ case PROJECTOR_TYPE_KIMIVL:
+ {
+ GGML_ASSERT(params.image_min_pixels > 0 && params.image_max_pixels > 0);
+ const clip_image_size target_size = img_tool::calc_size_preserved_ratio(
+ original_size,
+ params.patch_size * params.n_merge,
+ params.image_min_pixels,
+ params.image_max_pixels);
+ const std::array<uint8_t, 3> pad_color = {122, 116, 104};
+
+ clip_image_u8 resized_img;
+ img_tool::resize(*img, resized_img, target_size, img_tool::RESIZE_ALGO_BILINEAR, true, pad_color);
+ clip_image_f32_ptr res(clip_image_f32_init());
+ normalize_image_u8_to_f32(resized_img, *res, params.image_mean, params.image_std);
+ res_imgs->entries.push_back(std::move(res));
+ } break;
+
+ case PROJECTOR_TYPE_KIMIK25:
+ {
+ GGML_ASSERT(params.image_min_pixels > 0 && params.image_max_pixels > 0);
+ const clip_image_size target_size = img_tool::calc_size_preserved_ratio(
+ original_size,
+ params.patch_size * params.n_merge,
+ params.image_min_pixels,
+ params.image_max_pixels);
+ const std::array<uint8_t, 3> pad_color = {0, 0, 0};
+
+ clip_image_u8 resized_img;
+ img_tool::resize(*img, resized_img, target_size, img_tool::RESIZE_ALGO_BICUBIC, true, pad_color);
+ clip_image_f32_ptr res(clip_image_f32_init());
+ normalize_image_u8_to_f32(resized_img, *res, params.image_mean, params.image_std);
+ res_imgs->entries.push_back(std::move(res));
+ } break;
+
+ case PROJECTOR_TYPE_MLP:
+ case PROJECTOR_TYPE_MLP_NORM:
+ case PROJECTOR_TYPE_LDP:
+ case PROJECTOR_TYPE_LDPV2:
+ case PROJECTOR_TYPE_COGVLM: // TODO @ngxson : is this correct for cogvlm?
+ {
+ // TODO @ngxson : refactor the code below to avoid duplicated logic
+
+ // the logic below is to pad the shorter side to the longer side with a background color: rgb(122, 116, 104)
+ // see https://github.com/haotian-liu/LLaVA/blob/e854a2bf85118c504f6f16bf5c3c7c92f8fa8c6b/llava/conversation.py#L113-L156
+
+ clip_image_u8_ptr temp(clip_image_u8_init()); // we will keep the input image data here temporarily
+
+ // The model config actually contains all we need to decide on how to preprocess, here we automatically switch to the new llava-1.6 preprocessing
+ if (params.image_res_candidates.empty()) { // pad_to_square
+ // for llava-1.5, we resize image to a square, and pad the shorter side with a background color
+ // see https://github.com/haotian-liu/LLaVA/blob/e854a2bf85118c504f6f16bf5c3c7c92f8fa8c6b/llava/conversation.py#L113-L156
+ const int longer_side = std::max(img->nx, img->ny);
+ temp->nx = longer_side;
+ temp->ny = longer_side;
+ temp->buf.resize(3 * longer_side * longer_side);
+
+ // background color in RGB from LLaVA (this is the mean rgb color * 255)
+ const std::array<uint8_t, 3> pad_color = {122, 116, 104};
+
+ // resize the image to the target_size
+ img_tool::resize(*img, *temp, clip_image_size{params.image_size, params.image_size}, img_tool::RESIZE_ALGO_BILINEAR, true, pad_color);
+
+ clip_image_f32_ptr res(clip_image_f32_init());
+ normalize_image_u8_to_f32(*temp, *res, params.image_mean, params.image_std);
+ res_imgs->entries.push_back(std::move(res));
+
+ } else {
+ // "spatial_unpad" with "anyres" processing for llava-1.6
+ auto const inst = llava_uhd::get_slice_instructions(ctx, original_size);
+ std::vector<clip_image_u8_ptr> imgs = llava_uhd::slice_image(img, inst);
+
+ for (size_t i = 0; i < imgs.size(); ++i) {
+ // clip_image_save_to_bmp(*imgs[i], "slice_" + std::to_string(i) + ".bmp");
+ clip_image_f32_ptr res(clip_image_f32_init());
+ normalize_image_u8_to_f32(*imgs[i], *res, params.image_mean, params.image_std);
+ res_imgs->entries.push_back(std::move(res));
+ }
+ }
+ } break;
+
+ default:
+ LOG_ERR("%s: unsupported projector type %d\n", __func__, ctx->proj_type());
+ return false;
+ }
+
+ return true;
+}
+
// Returns the model's learned image-newline embedding tensor.
// NOTE(review): presumably nullptr for models without one — callers should check.
ggml_tensor * clip_get_newline_tensor(const struct clip_ctx * ctx) {
    return ctx->model.image_newline;
}
+
+void clip_free(clip_ctx * ctx) {
+ if (ctx == nullptr) {
+ return;
+ }
+ delete ctx;
+}
+
+// deprecated
+size_t clip_embd_nbytes(const struct clip_ctx * ctx) {
+ const int32_t nx = ctx->model.hparams.image_size;
+ const int32_t ny = ctx->model.hparams.image_size;
+ return clip_embd_nbytes_by_img(ctx, nx, ny);
+}
+
// Bytes needed to store the embeddings of one img_w x img_h image:
// n_output_tokens * n_mmproj_embd * sizeof(float).
size_t clip_embd_nbytes_by_img(const struct clip_ctx * ctx, int img_w, int img_h) {
    // a dummy image is enough here: clip_n_output_tokens only reads nx/ny
    clip_image_f32 img;
    img.nx = img_w;
    img.ny = img_h;
    return clip_n_output_tokens(ctx, &img) * clip_n_mmproj_embd(ctx) * sizeof(float);
}
+
// Input image resolution (pixels per side) expected by the encoder.
int32_t clip_get_image_size(const struct clip_ctx * ctx) {
    return ctx->model.hparams.image_size;
}
+
// ViT patch size (pixels per patch side).
int32_t clip_get_patch_size(const struct clip_ctx * ctx) {
    return ctx->model.hparams.patch_size;
}
+
// Hidden (embedding) dimension of the vision/audio encoder itself,
// not of the projected output (see clip_n_mmproj_embd for that).
int32_t clip_get_hidden_size(const struct clip_ctx * ctx) {
    return ctx->model.hparams.n_embd;
}
+
// String form of the patch-merge strategy; only two values are produced:
// "spatial_unpad" (llava-1.6 style) or "flat".
const char * clip_patch_merge_type(const struct clip_ctx * ctx) {
    return ctx->model.hparams.mm_patch_merge_type == PATCH_MERGE_SPATIAL_UNPAD ? "spatial_unpad" : "flat";
}
+
+int clip_n_output_tokens_x(const struct clip_ctx * ctx, struct clip_image_f32 * img) {
+ const auto & params = ctx->model.hparams;
+ const int n_total = clip_n_output_tokens(ctx, img);
+ const auto & proj = ctx->proj_type();
+ switch (proj) {
+ case PROJECTOR_TYPE_QWEN2VL:
+ case PROJECTOR_TYPE_QWEN25VL:
+ case PROJECTOR_TYPE_QWEN3VL:
+ case PROJECTOR_TYPE_GLM4V:
+ case PROJECTOR_TYPE_YOUTUVL:
+ return (img->nx / params.patch_size) / 2;
+ default:
+ break;
+ }
+ return n_total;
+}
+
+int clip_n_output_tokens_y(const struct clip_ctx * ctx, struct clip_image_f32 * img) {
+ const auto & params = ctx->model.hparams;
+ const auto & proj = ctx->proj_type();
+ switch (proj) {
+ case PROJECTOR_TYPE_QWEN2VL:
+ case PROJECTOR_TYPE_QWEN25VL:
+ case PROJECTOR_TYPE_QWEN3VL:
+ case PROJECTOR_TYPE_GLM4V:
+ case PROJECTOR_TYPE_YOUTUVL:
+ return (img->ny / params.patch_size) / 2;
+ default:
+ break;
+ }
+ return 1;
+}
+
// Total number of embedding tokens the encoder produces for a given
// pre-processed input. For images, nx/ny are pixel dimensions; for audio
// projectors, nx is the number of mel frames.
// Each projector applies its own downscaling/merging on top of the raw
// (nx/patch) * (ny/patch) patch count.
int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * img) {
    const auto & params = ctx->model.hparams;

    // for models with fixed size image, the input image is already pre-processed and resized to square
    int patch_size = params.patch_size;
    int n_patches = (img->nx / patch_size) * (img->ny / patch_size);

    projector_type proj = ctx->proj_type();

    switch (proj) {
        case PROJECTOR_TYPE_MLP:
        case PROJECTOR_TYPE_MLP_NORM:
        case PROJECTOR_TYPE_JANUS_PRO:
            {
                // do nothing: one token per patch
            } break;
        case PROJECTOR_TYPE_LDP:
        case PROJECTOR_TYPE_LDPV2:
        case PROJECTOR_TYPE_GLM_EDGE:
            {
                // 2x2 pooling in the projector
                n_patches /= 4;
                if (ctx->model.mm_boi) {
                    n_patches += 2; // for BOI and EOI token embeddings
                }
            } break;
        case PROJECTOR_TYPE_MINICPMV:
            {
                // resampler outputs a fixed number of query tokens, independent of image size.
                // Use actual config value if available, otherwise fall back to hardcoded values
                if (params.minicpmv_query_num > 0) {
                    n_patches = params.minicpmv_query_num;
                } else {
                    // Fallback to hardcoded values for legacy models
                    if (params.minicpmv_version == 2) {
                        n_patches = 96;
                    } else if (params.minicpmv_version == 3) {
                        n_patches = 64;
                    } else if (params.minicpmv_version == 4) {
                        n_patches = 64;
                    } else if (params.minicpmv_version == 5) {
                        // MiniCPM-V 4.0
                        n_patches = 64;
                    } else if (params.minicpmv_version == 6) {
                        // MiniCPM-V 4.5
                        n_patches = 64;
                    } else if (params.minicpmv_version == 100045) {
                        // MiniCPM-o 4.5
                        n_patches = 64;
                    } else {
                        GGML_ABORT("Unknown minicpmv version");
                    }
                }
            } break;
        case PROJECTOR_TYPE_QWEN2VL:
        case PROJECTOR_TYPE_QWEN25VL:
        case PROJECTOR_TYPE_QWEN3VL:
        case PROJECTOR_TYPE_GLM4V:
        case PROJECTOR_TYPE_YOUTUVL:
            {
                // dynamic size (2 conv, so double patch size)
                int x_patch = img->nx / (params.patch_size * 2);
                int y_patch = img->ny / (params.patch_size * 2);
                n_patches = x_patch * y_patch;
            } break;
        case PROJECTOR_TYPE_GEMMA3:
        case PROJECTOR_TYPE_IDEFICS3:
        case PROJECTOR_TYPE_INTERNVL:
        case PROJECTOR_TYPE_LLAMA4:
            {
                // both X and Y are downscaled by the scale factor
                int scale_factor = ctx->model.hparams.n_merge;
                n_patches /= (scale_factor * scale_factor);
            } break;
        case PROJECTOR_TYPE_GEMMA3NV:
            {
                // MobileNetV5 MSFA adapter always outputs fixed 16x16 resolution
                // regardless of input size (see architecture description)
                n_patches = ctx->model.hparams.image_size / ctx->model.hparams.patch_size;
            } break;
        case PROJECTOR_TYPE_LFM2:
        case PROJECTOR_TYPE_KIMIVL:
        case PROJECTOR_TYPE_KIMIK25:
            {
                // dynamic size; dimensions are rounded UP to the merged patch grid
                int out_patch_size = params.patch_size * ctx->model.hparams.n_merge;
                int x_patch = CLIP_ALIGN(img->nx, out_patch_size) / out_patch_size;
                int y_patch = CLIP_ALIGN(img->ny, out_patch_size) / out_patch_size;
                n_patches = x_patch * y_patch;
            } break;
        case PROJECTOR_TYPE_PIXTRAL:
        case PROJECTOR_TYPE_LIGHTONOCR:
            {
                // dynamic size
                int n_merge = ctx->model.hparams.n_merge;
                int n_patches_x = img->nx / patch_size / (n_merge > 0 ? n_merge : 1);
                int n_patches_y = img->ny / patch_size / (n_merge > 0 ? n_merge : 1);
                if (ctx->model.token_embd_img_break) {
                    n_patches = n_patches_y * n_patches_x + n_patches_y - 1; // + one [IMG_BREAK] per row, except the last row
                } else {
                    n_patches = n_patches_y * n_patches_x;
                }
            } break;
        case PROJECTOR_TYPE_VOXTRAL:
        case PROJECTOR_TYPE_ULTRAVOX:
        case PROJECTOR_TYPE_QWEN2A:
        case PROJECTOR_TYPE_MUSIC_FLAMINGO:
            {
                // audio: nx holds the number of mel frames
                n_patches = img->nx;

                const int proj_stack_factor = ctx->model.hparams.proj_stack_factor;
                if (ctx->model.audio_has_stack_frames()) {
                    GGML_ASSERT(proj_stack_factor > 0);
                    const int n_len = CLIP_ALIGN(n_patches, proj_stack_factor);
                    n_patches = n_len / proj_stack_factor;
                }

                // whisper downscales input token by half after conv1d
                n_patches /= 2;

                if (ctx->model.audio_has_avgpool()) {
                    // divide by 2 because of nn.AvgPool1d(2, stride=2)
                    n_patches /= 2;
                }
            } break;
        case PROJECTOR_TYPE_GLMA:
            {
                n_patches = img->nx;
                // whisper downscales input token by half after conv1d
                n_patches /= 2;
                // reshape by merge_factor
                n_patches /= ctx->model.hparams.proj_stack_factor;
                // for BOI and EOI token embeddings
                n_patches += 2;
            } break;
        case PROJECTOR_TYPE_COGVLM:
            {
                n_patches += 2; // for BOI and EOI token embeddings
            } break;
        case PROJECTOR_TYPE_LFM2A:
            {
                // three stride-2 stages, each rounding up
                n_patches = ((((img->nx + 1) / 2) + 1) / 2 + 1) / 2;
            } break;
        default:
            GGML_ABORT("unsupported projector type");
    }

    return n_patches;
}
+
// Encode a single image by wrapping it in a batch of size 1.
// The image is copied, so the caller keeps ownership of `img`.
// Returns false when the batch encode fails.
bool clip_image_encode(struct clip_ctx * ctx, const int n_threads, clip_image_f32 * img, float * vec) {
    clip_image_f32_batch imgs;
    clip_image_f32_ptr img_copy(clip_image_f32_init());
    *img_copy = *img;
    imgs.entries.push_back(std::move(img_copy));

    return clip_image_batch_encode(ctx, n_threads, &imgs, vec);
}
+
// Run the vision/audio encoder on a batch of pre-processed inputs and write
// the resulting embeddings (f32, [n_tokens, n_embd]) into `vec` (may be null
// to discard them). Only batch size 1 is supported; returns false on an
// unsupported batch size or a graph-compute failure, aborts on malformed
// input tensors.
bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_image_f32_batch * imgs_c_ptr, float * vec) {
    const clip_image_f32_batch & imgs = *imgs_c_ptr;
    int batch_size = imgs.entries.size();

    // TODO @ngxson : implement batch size > 1 as a loop
    // we don't need true batching support because the cgraph is going to be big anyway
    if (batch_size != 1) {
        return false; // only support batch size of 1
    }

    // if buffers are not allocated, we need to do a warmup run to allocate them
    if (!ctx->is_allocated) {
        clip_model_loader::warmup(*ctx, *imgs_c_ptr);
    }

    // build the inference graph
    ggml_backend_sched_reset(ctx->sched.get());
    ggml_cgraph * gf = clip_image_build_graph(ctx, imgs);
    ggml_backend_sched_alloc_graph(ctx->sched.get(), gf);

    // set inputs
    const auto & model = ctx->model;
    const auto & hparams = model.hparams;

    const int image_size_width = imgs.entries[0]->nx;
    const int image_size_height = imgs.entries[0]->ny;

    const int patch_size = hparams.patch_size;
    const int num_patches = ((image_size_width / patch_size) * (image_size_height / patch_size));
    const int n_pos = num_patches + (model.class_embedding ? 1 : 0);
    const int pos_w = image_size_width / patch_size;
    const int pos_h = image_size_height / patch_size;


    // look up a named graph tensor and verify it is flagged as an input
    auto get_inp_tensor = [&gf](const char * name) {
        ggml_tensor * inp = ggml_graph_get_tensor(gf, name);
        if (inp == nullptr) {
            GGML_ABORT("Failed to get tensor %s", name);
        }
        if (!(inp->flags & GGML_TENSOR_FLAG_INPUT)) {
            GGML_ABORT("Tensor %s is not an input tensor", name);
        }
        return inp;
    };

    // upload an f32 vector into a named input tensor (size must match exactly)
    auto set_input_f32 = [&get_inp_tensor](const char * name, std::vector<float> & values) {
        ggml_tensor * cur = get_inp_tensor(name);
        GGML_ASSERT(cur->type == GGML_TYPE_F32);
        GGML_ASSERT(ggml_nelements(cur) == (int64_t)values.size());
        ggml_backend_tensor_set(cur, values.data(), 0, ggml_nbytes(cur));
    };

    // upload an i32 vector into a named input tensor (size must match exactly)
    auto set_input_i32 = [&get_inp_tensor](const char * name, std::vector<int32_t> & values) {
        ggml_tensor * cur = get_inp_tensor(name);
        GGML_ASSERT(cur->type == GGML_TYPE_I32);
        GGML_ASSERT(ggml_nelements(cur) == (int64_t)values.size());
        ggml_backend_tensor_set(cur, values.data(), 0, ggml_nbytes(cur));
    };

    // set input pixel values
    if (!imgs.is_audio) {
        size_t nelem = 0;
        for (const auto & img : imgs.entries) {
            nelem += img->nx * img->ny * 3;
        }
        std::vector<float> inp_raw(nelem);

        // convert interleaved RGB to channel-planar layout
        // layout of data (note: the channel dim is unrolled to better visualize the layout):
        //
        // ┌──W──┐
        // │ H │ channel = R
        // ├─────┤ │
        // │ H │ channel = G
        // ├─────┤ │
        // │ H │ channel = B
        // └─────┘ │
        // ──────┘ x B

        for (size_t i = 0; i < imgs.entries.size(); i++) {
            const int nx = imgs.entries[i]->nx;
            const int ny = imgs.entries[i]->ny;
            const int n = nx * ny;

            for (int b = 0; b < batch_size; b++) {
                float * batch_entry = inp_raw.data() + b * (3*n);
                for (int y = 0; y < ny; y++) {
                    for (int x = 0; x < nx; x++) {
                        size_t base_src = 3*(y * nx + x); // idx of the first channel (interleaved source)
                        size_t base_dst = y * nx + x; // idx of the pixel in one destination plane
                        batch_entry[ base_dst] = imgs.entries[b]->buf[base_src ];
                        batch_entry[1*n + base_dst] = imgs.entries[b]->buf[base_src + 1];
                        batch_entry[2*n + base_dst] = imgs.entries[b]->buf[base_src + 2];
                    }
                }
            }
        }
        set_input_f32("inp_raw", inp_raw);

    } else {
        // audio input: buf already holds the mel spectrogram (nx = frames, ny = mel bins)
        GGML_ASSERT(imgs.entries.size() == 1);
        const auto & mel_inp = imgs.entries[0];
        const int n_step = mel_inp->nx;
        const int n_mel = mel_inp->ny;
        std::vector<float> inp_raw(n_step * n_mel);
        std::memcpy(inp_raw.data(), mel_inp->buf.data(), n_step * n_mel * sizeof(float));
        set_input_f32("inp_raw", inp_raw);
    }

    // set input per projector
    switch (ctx->model.proj_type) {
        case PROJECTOR_TYPE_MINICPMV:
            {
                // inspired from siglip:
                // -> https://huggingface.co/HuggingFaceM4/siglip-so400m-14-980-flash-attn2-navit
                // -> https://huggingface.co/HuggingFaceM4/siglip-so400m-14-980-flash-attn2-navit/blob/d66538faeba44480d0bfaa42145eef26f9423199/modeling_siglip.py#L316
                // NOTE(review): fixed 1024-entry bucket arrays assume pos_h/pos_w <= 1024 —
                // confirm upstream preprocessing guarantees this
                std::vector<int32_t> positions(pos_h * pos_w);
                int bucket_coords_h[1024];
                int bucket_coords_w[1024];
                for (int i = 0; i < pos_h; i++){
                    bucket_coords_h[i] = std::floor(70.0*i/pos_h);
                }
                for (int i = 0; i < pos_w; i++){
                    bucket_coords_w[i] = std::floor(70.0*i/pos_w);
                }
                for (int i = 0, id = 0; i < pos_h; i++){
                    for (int j = 0; j < pos_w; j++){
                        positions[id++] = bucket_coords_h[i]*70 + bucket_coords_w[j];
                    }
                }
                set_input_i32("positions", positions);

                // inputs for resampler projector
                // set the 2D positions (using float for sinusoidal embedding)
                int n_patches_per_col = image_size_width / patch_size;
                std::vector<float> pos_data(n_pos);
                // dimension H
                for (int i = 0; i < n_pos; i++) {
                    pos_data[i] = static_cast<float>(i / n_patches_per_col);
                }
                set_input_f32("pos_h", pos_data);
                // dimension W
                for (int i = 0; i < n_pos; i++) {
                    pos_data[i] = static_cast<float>(i % n_patches_per_col);
                }
                set_input_f32("pos_w", pos_data);
                // base frequency omega
                const float base_freq = 10000.0f;
                const int n_embd_proj = clip_n_mmproj_embd(ctx);
                std::vector<float> omega(n_embd_proj / 4);
                for (int i = 0; i < n_embd_proj / 4; ++i) {
                    omega[i] = 1.0f / std::pow(base_freq, static_cast<float>(i) / (n_embd_proj / 4));
                }
                set_input_f32("omega", omega);
            } break;
        case PROJECTOR_TYPE_QWEN2VL:
        case PROJECTOR_TYPE_QWEN3VL:
        case PROJECTOR_TYPE_GLM4V:
            {
                // M-RoPE positions: 4 components per token (two y/x pairs),
                // emitted per 2x2 merged patch group
                const int merge_ratio = hparams.n_merge;
                const int pw = image_size_width / patch_size;
                const int ph = image_size_height / patch_size;
                std::vector<int> positions(n_pos * 4);
                int ptr = 0;
                for (int y = 0; y < ph; y += merge_ratio) {
                    for (int x = 0; x < pw; x += merge_ratio) {
                        for (int dy = 0; dy < 2; dy++) {
                            for (int dx = 0; dx < 2; dx++) {
                                positions[ ptr] = y + dy;
                                positions[ num_patches + ptr] = x + dx;
                                positions[2 * num_patches + ptr] = y + dy;
                                positions[3 * num_patches + ptr] = x + dx;
                                ptr++;
                            }
                        }
                    }
                }

                set_input_i32("positions", positions);
            } break;
        case PROJECTOR_TYPE_QWEN25VL:
        case PROJECTOR_TYPE_YOUTUVL:
            {
                // pw * ph = number of tokens output by ViT after apply patch merger
                // ipw * iph = number of vision tokens processed inside the ViT
                const bool use_window_attn = ctx->model.proj_type == PROJECTOR_TYPE_QWEN25VL ? hparams.n_wa_pattern > 0 : !hparams.wa_layer_indexes.empty();
                const int merge_ratio = 2;
                const int pw = image_size_width / patch_size / merge_ratio;
                const int ph = image_size_height / patch_size / merge_ratio;
                const int ipw = image_size_width / patch_size;
                const int iph = image_size_height / patch_size;

                std::vector<int> idx (ph * pw);
                std::vector<int> inv_idx(ph * pw);

                if (use_window_attn) {
                    const int attn_window_size = hparams.attn_window_size > 0 ? hparams.attn_window_size : 112;
                    const int grid_window = attn_window_size / patch_size / merge_ratio;
                    int dst = 0;
                    // [num_vision_tokens, num_vision_tokens] attention mask tensor
                    std::vector<float> mask(pow(ipw * iph, 2), std::numeric_limits<float>::lowest());
                    int mask_row = 0;

                    for (int y = 0; y < ph; y += grid_window) {
                        for (int x = 0; x < pw; x += grid_window) {
                            const int win_h = std::min(grid_window, ph - y);
                            const int win_w = std::min(grid_window, pw - x);
                            const int dst_0 = dst;
                            // group all tokens belonging to the same window together (into a contiguous range)
                            for (int dy = 0; dy < win_h; dy++) {
                                for (int dx = 0; dx < win_w; dx++) {
                                    const int src = (y + dy) * pw + (x + dx);
                                    GGML_ASSERT(src < (int)idx.size());
                                    GGML_ASSERT(dst < (int)inv_idx.size());
                                    idx [src] = dst;
                                    inv_idx[dst] = src;
                                    dst++;
                                }
                            }

                            // unmask (set to 0) the rows of this window so its tokens attend to each other
                            for (int r=0; r < win_h * win_w * merge_ratio * merge_ratio; r++) {
                                int row_offset = mask_row * (ipw * iph);
                                std::fill(
                                    mask.begin() + row_offset + (dst_0 * merge_ratio * merge_ratio),
                                    mask.begin() + row_offset + (dst * merge_ratio * merge_ratio),
                                    0.0);
                                mask_row++;
                            }
                        }
                    }

                    set_input_i32("window_idx", idx);
                    set_input_i32("inv_window_idx", inv_idx);
                    set_input_f32("window_mask", mask);
                } else {
                    // no window attention: identity permutation
                    for (int i = 0; i < ph * pw; i++) {
                        idx[i] = i;
                    }
                }

                const int mpow = merge_ratio * merge_ratio;
                std::vector<int> positions(n_pos * 4);

                // M-RoPE positions, remapped through the window permutation
                int ptr = 0;
                for (int y = 0; y < iph; y += merge_ratio) {
                    for (int x = 0; x < ipw; x += merge_ratio) {
                        for (int dy = 0; dy < 2; dy++) {
                            for (int dx = 0; dx < 2; dx++) {
                                auto remap = idx[ptr / mpow];
                                remap = (remap * mpow) + (ptr % mpow);

                                positions[ remap] = y + dy;
                                positions[ num_patches + remap] = x + dx;
                                positions[2 * num_patches + remap] = y + dy;
                                positions[3 * num_patches + remap] = x + dx;
                                ptr++;
                            }
                        }
                    }
                }

                set_input_i32("positions", positions);
            } break;
        case PROJECTOR_TYPE_PIXTRAL:
        case PROJECTOR_TYPE_KIMIVL:
        case PROJECTOR_TYPE_KIMIK25:
        case PROJECTOR_TYPE_LIGHTONOCR:
            {
                // set the 2D positions
                int n_patches_per_col = image_size_width / patch_size;
                std::vector<int> pos_data(n_pos);
                // dimension H
                for (int i = 0; i < n_pos; i++) {
                    pos_data[i] = i / n_patches_per_col;
                }
                set_input_i32("pos_h", pos_data);
                // dimension W
                for (int i = 0; i < n_pos; i++) {
                    pos_data[i] = i % n_patches_per_col;
                }
                set_input_i32("pos_w", pos_data);
            } break;
        case PROJECTOR_TYPE_GLM_EDGE:
            {
                // llava and other models: simple sequential positions
                std::vector<int32_t> positions(n_pos);
                for (int i = 0; i < n_pos; i++) {
                    positions[i] = i;
                }
                set_input_i32("positions", positions);
            } break;
        case PROJECTOR_TYPE_MLP:
        case PROJECTOR_TYPE_MLP_NORM:
        case PROJECTOR_TYPE_LDP:
        case PROJECTOR_TYPE_LDPV2:
            {
                // llava and other models: simple sequential positions
                std::vector<int32_t> positions(n_pos);
                for (int i = 0; i < n_pos; i++) {
                    positions[i] = i;
                }
                set_input_i32("positions", positions);

                // The patches vector is used to get rows to index into the embeds with;
                // we should skip dim 0 only if we have CLS to avoid going out of bounds
                // when retrieving the rows.
                int patch_offset = model.class_embedding ? 1 : 0;
                std::vector<int32_t> patches(num_patches);
                for (int i = 0; i < num_patches; i++) {
                    patches[i] = i + patch_offset;
                }
                set_input_i32("patches", patches);
            } break;
        case PROJECTOR_TYPE_GEMMA3:
        case PROJECTOR_TYPE_GEMMA3NV:
        case PROJECTOR_TYPE_IDEFICS3:
        case PROJECTOR_TYPE_INTERNVL:
        case PROJECTOR_TYPE_QWEN2A:
        case PROJECTOR_TYPE_GLMA:
        case PROJECTOR_TYPE_ULTRAVOX:
        case PROJECTOR_TYPE_LFM2:
        case PROJECTOR_TYPE_VOXTRAL:
        case PROJECTOR_TYPE_MUSIC_FLAMINGO:
        case PROJECTOR_TYPE_JANUS_PRO:
        case PROJECTOR_TYPE_COGVLM:
            {
                // do nothing: these projectors take no extra graph inputs
            } break;
        case PROJECTOR_TYPE_LLAMA4:
            {
                // set the 2D positions
                int n_patches_per_col = image_size_width / patch_size;
                std::vector<int> pos_data(num_patches + 1, 0); // +1 for the [CLS] token
                // last pos is always kept 0, it's for CLS
                // dimension H
                for (int i = 0; i < num_patches; i++) {
                    pos_data[i] = (i / n_patches_per_col) + 1;
                }
                set_input_i32("pos_h", pos_data);
                // dimension W
                for (int i = 0; i < num_patches; i++) {
                    pos_data[i] = (i % n_patches_per_col) + 1;
                }
                set_input_i32("pos_w", pos_data);
            } break;
        case PROJECTOR_TYPE_LFM2A:
            {
                // precomputed sinusoidal relative position embedding
                GGML_ASSERT(imgs.entries.size() == 1);
                const auto n_frames = clip_n_output_tokens(ctx, imgs.entries.front().get());

                auto d_model = 512;
                auto seq_len = n_frames * 2 - 1;
                std::vector<float> pos_emb(d_model*seq_len);
                std::vector<double> inv_freq(d_model / 2);
                for (size_t i = 0; i < inv_freq.size(); ++i) {
                    inv_freq[i] = std::exp(-(std::log(10000.0) / (float)d_model) * (2.0f * (float)(i)));
                }
                for (int64_t pos = 0; pos < seq_len; ++pos) {
                    for (size_t i = 0; i < inv_freq.size(); ++i) {
                        const float ang = (n_frames - pos - 1) * inv_freq[i];
                        pos_emb[pos*d_model + 2*i + 0] = sinf(ang); // even
                        pos_emb[pos*d_model + 2*i + 1] = cosf(ang); // odd
                    }
                }
                set_input_f32("pos_emb", pos_emb);
            } break;
        default:
            GGML_ABORT("Unknown projector type");
    }

    // forward the requested thread count to the CPU backend, if it exposes the hook
    // ggml_backend_cpu_set_n_threads(ctx->backend_cpu, n_threads);
    ggml_backend_dev_t dev = ggml_backend_get_device(ctx->backend_cpu);
    ggml_backend_reg_t reg = dev ? ggml_backend_dev_backend_reg(dev) : nullptr;
    if (reg) {
        auto ggml_backend_set_n_threads_fn = (ggml_backend_set_n_threads_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_set_n_threads");
        if (ggml_backend_set_n_threads_fn) {
            ggml_backend_set_n_threads_fn(ctx->backend_cpu, n_threads);
        }
    }

    auto status = ggml_backend_sched_graph_compute(ctx->sched.get(), gf);
    if (status != GGML_STATUS_SUCCESS) {
        LOG_ERR("%s: ggml_backend_sched_graph_compute failed with error %d\n", __func__, status);
        return false;
    }

    // the last node is the embedding tensor
    ggml_tensor * embeddings = ggml_graph_node(gf, -1);

    // sanity check (only support batch size of 1 for now)
    const int n_tokens_out = embeddings->ne[1];
    const int expected_n_tokens_out = clip_n_output_tokens(ctx, imgs.entries[0].get());
    if (n_tokens_out != expected_n_tokens_out) {
        LOG_ERR("%s: expected output %d tokens, got %d\n", __func__, expected_n_tokens_out, n_tokens_out);
        GGML_ABORT("Invalid number of output tokens");
    }

    // copy the embeddings to the location passed by the user
    if (vec != nullptr) {
        ggml_backend_tensor_get(embeddings, vec, 0, ggml_nbytes(embeddings));
    }

    // Debug: dump final embeddings if MTMD_DEBUG_EMBEDDINGS is set
    if (std::getenv("MTMD_DEBUG_EMBEDDINGS") != nullptr) {
        const int64_t n_embd = embeddings->ne[0];
        const int64_t n_tokens = embeddings->ne[1];
        std::vector<float> emb_data(n_embd * n_tokens);
        ggml_backend_tensor_get(embeddings, emb_data.data(), 0, ggml_nbytes(embeddings));

        LOG_INF("\n=== MTMD_DEBUG_EMBEDDINGS ===\n");
        LOG_INF("Shape: [%lld, %lld]\n", (long long)n_embd, (long long)n_tokens);

        // Print first few values of first token
        LOG_INF("Token 0 (first 16 values): ");
        for (int i = 0; i < std::min((int64_t)16, n_embd); i++) {
            LOG_INF("%.6f ", emb_data[i]);
        }
        LOG_INF("\n");

        // Print last few values of first token
        if (n_embd > 16) {
            LOG_INF("Token 0 (last 16 values): ");
            for (int64_t i = n_embd - 16; i < n_embd; i++) {
                LOG_INF("%.6f ", emb_data[i]);
            }
            LOG_INF("\n");
        }

        // Compute and print statistics
        float sum = 0.0f, sum_sq = 0.0f, min_val = emb_data[0], max_val = emb_data[0];
        for (size_t i = 0; i < emb_data.size(); i++) {
            sum += emb_data[i];
            sum_sq += emb_data[i] * emb_data[i];
            min_val = std::min(min_val, emb_data[i]);
            max_val = std::max(max_val, emb_data[i]);
        }
        float mean = sum / emb_data.size();
        float variance = (sum_sq / emb_data.size()) - (mean * mean);
        LOG_INF("Stats: mean=%.6f, std=%.6f, min=%.6f, max=%.6f, sum=%.6f\n",
                mean, sqrtf(variance), min_val, max_val, sum);
        LOG_INF("=== END MTMD_DEBUG_EMBEDDINGS ===\n\n");
    }

    return true;
}
+
// Embedding dimension produced by the multimodal projector.
// This must equal the text model's embedding dimension; the value is read
// from the output dimension of each projector's final weight/bias tensor.
int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
    switch (ctx->model.proj_type) {
        case PROJECTOR_TYPE_LDP:
            return ctx->model.mm_model_block_1_block_2_1_b->ne[0];
        case PROJECTOR_TYPE_LDPV2:
            return ctx->model.mm_model_peg_0_b->ne[0];
        case PROJECTOR_TYPE_MLP:
        case PROJECTOR_TYPE_PIXTRAL:
        case PROJECTOR_TYPE_LIGHTONOCR:
            return ctx->model.mm_2_w->ne[1];
        case PROJECTOR_TYPE_MLP_NORM:
            return ctx->model.mm_3_b->ne[0];
        case PROJECTOR_TYPE_MINICPMV:
            return ctx->model.mm_model_proj->ne[0];
        case PROJECTOR_TYPE_GLM_EDGE:
            return ctx->model.mm_model_mlp_3_w->ne[1];
        case PROJECTOR_TYPE_QWEN2VL:
        case PROJECTOR_TYPE_QWEN25VL:
        case PROJECTOR_TYPE_JANUS_PRO:
        case PROJECTOR_TYPE_YOUTUVL:
            return ctx->model.mm_1_b->ne[0];
        case PROJECTOR_TYPE_QWEN3VL:
            // main path + deepstack paths
            return ctx->model.mm_1_b->ne[0] * (1 + ctx->model.n_deepstack_layers);
        case PROJECTOR_TYPE_GEMMA3:
        case PROJECTOR_TYPE_GEMMA3NV:
            return ctx->model.mm_input_proj_w->ne[0];
        case PROJECTOR_TYPE_IDEFICS3:
            return ctx->model.projection->ne[1];
        case PROJECTOR_TYPE_ULTRAVOX:
        case PROJECTOR_TYPE_VOXTRAL:
        case PROJECTOR_TYPE_MUSIC_FLAMINGO:
            return ctx->model.mm_2_w->ne[1];
        case PROJECTOR_TYPE_INTERNVL:
            return ctx->model.mm_3_w->ne[1];
        case PROJECTOR_TYPE_LLAMA4:
            return ctx->model.mm_model_proj->ne[1];
        case PROJECTOR_TYPE_QWEN2A:
            return ctx->model.mm_fc_w->ne[1];
        case PROJECTOR_TYPE_GLMA:
            return ctx->model.mm_2_w->ne[1];
        case PROJECTOR_TYPE_LFM2:
        case PROJECTOR_TYPE_KIMIVL:
        case PROJECTOR_TYPE_KIMIK25:
            return ctx->model.mm_2_w->ne[1];
        case PROJECTOR_TYPE_COGVLM:
            return ctx->model.mm_4h_to_h_w->ne[1];
        case PROJECTOR_TYPE_LFM2A:
            return ctx->model.position_embeddings->ne[0];
        case PROJECTOR_TYPE_GLM4V:
            return ctx->model.mm_ffn_down_w->ne[1];
        default:
            GGML_ABORT("Unknown projector type");
    }
}
+
+int clip_is_minicpmv(const struct clip_ctx * ctx) {
+ // TODO: remove this function
+ if (ctx->proj_type() == PROJECTOR_TYPE_MINICPMV) {
+ return ctx->model.hparams.minicpmv_version;
+ }
+ return 0;
+}
+
// True when the context uses the GLM-Edge projector.
bool clip_is_glm(const struct clip_ctx * ctx) {
    // TODO: remove this function
    return ctx->proj_type() == PROJECTOR_TYPE_GLM_EDGE;
}
+
// True when the model carries a llava-style projector (per hparams flag).
bool clip_is_llava(const struct clip_ctx * ctx) {
    return ctx->model.hparams.has_llava_projector;
}
+
// True when this context hosts the vision modality.
bool clip_has_vision_encoder(const struct clip_ctx * ctx) {
    return ctx->model.modality == CLIP_MODALITY_VISION;
}
+
// True when this context hosts the audio modality.
bool clip_has_audio_encoder(const struct clip_ctx * ctx) {
    return ctx->model.modality == CLIP_MODALITY_AUDIO;
}
+
+bool clip_has_whisper_encoder(const struct clip_ctx * ctx) {
+ switch (ctx->proj_type()) {
+ case PROJECTOR_TYPE_ULTRAVOX:
+ case PROJECTOR_TYPE_QWEN2A:
+ case PROJECTOR_TYPE_GLMA:
+ case PROJECTOR_TYPE_VOXTRAL:
+ case PROJECTOR_TYPE_MUSIC_FLAMINGO:
+ return true;
+ default:
+ return false;
+ }
+}
+
+bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec) {
+ clip_image_f32 clip_img;
+ clip_img.buf.resize(h * w * 3);
+ for (int i = 0; i < h*w*3; i++)
+ {
+ clip_img.buf[i] = img[i];
+ }
+ clip_img.nx = w;
+ clip_img.ny = h;
+ clip_image_encode(ctx, n_threads, &clip_img, vec);
+ return true;
+}
+
+//
+// API used internally with mtmd
+//
+
// Expose the projector type to mtmd (internal API).
projector_type clip_get_projector_type(const struct clip_ctx * ctx) {
    return ctx->proj_type();
}
+
+void clip_image_f32_batch_add_mel(struct clip_image_f32_batch * batch, int n_mel, int n_frames, float * mel) {
+ clip_image_f32 * audio = new clip_image_f32;
+ audio->nx = n_frames;
+ audio->ny = n_mel;
+ audio->buf.resize(n_frames * n_mel);
+ std::memcpy(audio->buf.data(), mel, n_frames * n_mel * sizeof(float));
+
+ batch->entries.push_back(clip_image_f32_ptr(audio));
+ batch->is_audio = true;
+}
+
// Read-only access to the model hyperparameters (internal API for mtmd).
const clip_hparams * clip_get_hparams(const struct clip_ctx * ctx) {
    return &ctx->model.hparams;
}
+
+//
+// API for debugging
+//
// Debug helper: encode a w x h dummy image filled with `fill_value`, then
// abort. The trailing GGML_ASSERT is intentionally false (buf is non-empty),
// so execution always stops here — presumably used with an eval callback to
// dump intermediate tensors before exiting.
void clip_debug_encode(clip_ctx * ctx, int h, int w, float fill_value) {
    clip_image_f32 img;
    img.nx = w;
    img.ny = h;
    img.buf.resize(h * w * 3);
    for (int i = 0; i < h * w * 3; i++) {
        img.buf[i] = static_cast<float>(fill_value);
    }
    clip_image_encode(ctx, 1, &img, nullptr);
    GGML_ASSERT(img.buf.empty() && "expected, always stop here");
}
diff --git a/llama.cpp/tools/mtmd/clip.h b/llama.cpp/tools/mtmd/clip.h
new file mode 100644
index 0000000..71b5848
--- /dev/null
+++ b/llama.cpp/tools/mtmd/clip.h
@@ -0,0 +1,121 @@
+#pragma once
+
+#include "ggml.h"
+#include "mtmd.h"
+
+#include <stddef.h>
+#include <stdint.h>
+
+// !!! Internal header, to be used by mtmd only !!!
+
+#define MTMD_INTERNAL_HEADER
+
+struct clip_ctx;
+
+// 2D image dimensions, in pixels
+struct clip_image_size {
+ int width;
+ int height;
+};
+
+struct clip_image_f32;
+struct clip_image_u8_batch;
+struct clip_image_f32_batch;
+
+// which encoder a clip_ctx instance hosts
+// (see clip_has_vision_encoder / clip_has_audio_encoder)
+enum clip_modality {
+ CLIP_MODALITY_VISION,
+ CLIP_MODALITY_AUDIO,
+};
+
+// flash-attention policy for the encoder graph
+enum clip_flash_attn_type {
+ CLIP_FLASH_ATTN_TYPE_AUTO = -1, // implementation picks a setting
+ CLIP_FLASH_ATTN_TYPE_DISABLED = 0,
+ CLIP_FLASH_ATTN_TYPE_ENABLED = 1,
+};
+
+// parameters for clip_init()
+struct clip_context_params {
+ bool use_gpu;
+ enum clip_flash_attn_type flash_attn_type;
+ int image_min_tokens; // NOTE(review): presumably lower bound on image token count — confirm semantics in clip.cpp
+ int image_max_tokens; // NOTE(review): presumably upper bound on image token count — confirm semantics in clip.cpp
+ bool warmup;
+ ggml_backend_sched_eval_callback cb_eval; // optional eval callback passed to the ggml scheduler
+ void * cb_eval_user_data; // opaque pointer forwarded to cb_eval
+};
+
+// result of clip_init(); either pointer may be null when the corresponding
+// modality is not present in the loaded file
+struct clip_init_result {
+ struct clip_ctx * ctx_v; // vision context
+ struct clip_ctx * ctx_a; // audio context
+};
+
+struct clip_init_result clip_init(const char * fname, struct clip_context_params ctx_params);
+
+void clip_free(struct clip_ctx * ctx);
+
+size_t clip_embd_nbytes(const struct clip_ctx * ctx);
+size_t clip_embd_nbytes_by_img(const struct clip_ctx * ctx, int img_w, int img_h);
+
+int32_t clip_get_image_size (const struct clip_ctx * ctx);
+int32_t clip_get_patch_size (const struct clip_ctx * ctx);
+int32_t clip_get_hidden_size(const struct clip_ctx * ctx);
+
+// TODO: should be enum, not string
+const char * clip_patch_merge_type(const struct clip_ctx * ctx);
+
+int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * img);
+
+// for M-RoPE, this will be the number of token positions in X and Y directions
+// for other models, X will be the total number of tokens and Y will be 1
+int clip_n_output_tokens_x(const struct clip_ctx * ctx, struct clip_image_f32 * img);
+int clip_n_output_tokens_y(const struct clip_ctx * ctx, struct clip_image_f32 * img);
+
+// this should be equal to the embedding dimension of the text model
+int clip_n_mmproj_embd(const struct clip_ctx * ctx);
+
+struct clip_image_size * clip_image_size_init(void);
+struct clip_image_u8 * clip_image_u8_init (void);
+struct clip_image_f32 * clip_image_f32_init(void);
+struct clip_image_f32_batch * clip_image_f32_batch_init(void); // only used by libllava
+
+// nx, ny are the output image dimensions
+unsigned char * clip_image_u8_get_data(struct clip_image_u8 * img, uint32_t * nx, uint32_t * ny);
+
+void clip_image_size_free (struct clip_image_size * img_size);
+void clip_image_u8_free (struct clip_image_u8 * img);
+void clip_image_f32_free(struct clip_image_f32 * img);
+void clip_image_u8_batch_free (struct clip_image_u8_batch * batch);
+void clip_image_f32_batch_free(struct clip_image_f32_batch * batch);
+
+// use for accessing underlay data of clip_image_f32_batch
+size_t clip_image_f32_batch_n_images(const struct clip_image_f32_batch * batch); // equivalent to batch->size()
+size_t clip_image_f32_batch_nx(const struct clip_image_f32_batch * batch, int idx); // equivalent to batch[idx]->nx
+size_t clip_image_f32_batch_ny(const struct clip_image_f32_batch * batch, int idx); // equivalent to batch[idx]->ny
+struct clip_image_f32 * clip_image_f32_get_img(const struct clip_image_f32_batch * batch, int idx); // equivalent to batch[idx]->data
+
+/**
+ * Build image from pixels decoded by other libraries instead of stb_image.h for better performance.
+ * The memory layout is RGBRGBRGB..., input buffer length must be 3*nx*ny bytes
+ */
+void clip_build_img_from_pixels(const unsigned char * rgb_pixels, int nx, int ny, struct clip_image_u8 * img);
+
+/** preprocess img and store the result in res_imgs, pad_to_square may be overridden to false depending on model configuration */
+bool clip_image_preprocess(struct clip_ctx * ctx, const struct clip_image_u8 * img, struct clip_image_f32_batch * res_imgs );
+
+struct ggml_tensor * clip_get_newline_tensor(const struct clip_ctx * ctx);
+
+bool clip_image_encode (struct clip_ctx * ctx, int n_threads, struct clip_image_f32 * img, float * vec);
+bool clip_image_batch_encode(struct clip_ctx * ctx, int n_threads, const struct clip_image_f32_batch * imgs, float * vec);
+
+int clip_is_minicpmv(const struct clip_ctx * ctx);
+bool clip_is_glm(const struct clip_ctx * ctx);
+bool clip_is_llava(const struct clip_ctx * ctx);
+// note for contributor: this clip_is_(model) pattern is deprecated
+// do NOT add new functions like this
+
+bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec);
+
+// use by audio input
+void clip_image_f32_batch_add_mel(struct clip_image_f32_batch * batch, int n_mel, int n_frames, float * mel);
+
+bool clip_has_vision_encoder(const struct clip_ctx * ctx);
+bool clip_has_audio_encoder(const struct clip_ctx * ctx);
+bool clip_has_whisper_encoder(const struct clip_ctx * ctx);
diff --git a/llama.cpp/tools/mtmd/deprecation-warning.cpp b/llama.cpp/tools/mtmd/deprecation-warning.cpp
new file mode 100644
index 0000000..dded0a5
--- /dev/null
+++ b/llama.cpp/tools/mtmd/deprecation-warning.cpp
@@ -0,0 +1,22 @@
+#include <cstdio>
+#include <cstdlib> // EXIT_FAILURE (was used without being included)
+#include <string>
+
+// Deprecation stub: prints a warning pointing users at 'llama-mtmd-cli' and
+// always exits with failure so scripts do not silently keep using this binary.
+int main(int argc, char** argv) {
+    std::string filename = "main";
+    // argv[0] may be null in the degenerate argc == 0 case
+    if (argc >= 1 && argv[0] != nullptr) {
+        filename = argv[0];
+    }
+
+    // strip the directory part so only the program name is printed
+    size_t pos = filename.find_last_of("/\\");
+    if (pos != std::string::npos) {
+        filename = filename.substr(pos + 1);
+    }
+
+    fprintf(stdout, "\n");
+    fprintf(stdout, "WARNING: The binary '%s' is deprecated.\n", filename.c_str());
+    fprintf(stdout, "Please use 'llama-mtmd-cli' instead.\n");
+    fprintf(stdout, "\n");
+
+    return EXIT_FAILURE;
+}
diff --git a/llama.cpp/tools/mtmd/legacy-models/convert_image_encoder_to_gguf.py b/llama.cpp/tools/mtmd/legacy-models/convert_image_encoder_to_gguf.py
new file mode 100644
index 0000000..2949fae
--- /dev/null
+++ b/llama.cpp/tools/mtmd/legacy-models/convert_image_encoder_to_gguf.py
@@ -0,0 +1,412 @@
+import argparse
+import os
+import json
+import re
+
+import torch
+import numpy as np
+from gguf import *
+from transformers import CLIPModel, CLIPProcessor, CLIPVisionModel, SiglipVisionModel
+
+TEXT = "clip.text"
+VISION = "clip.vision"
+
+
+def k(raw_key: str, arch: str) -> str:
+ return raw_key.format(arch=arch)
+
+
+def should_skip_tensor(name: str, has_text: bool, has_vision: bool, has_llava: bool) -> bool:
+ if name in (
+ "logit_scale",
+ "text_model.embeddings.position_ids",
+ "vision_model.embeddings.position_ids",
+ ):
+ return True
+
+ if has_llava and name in ["visual_projection.weight", "vision_model.post_layernorm.weight", "vision_model.post_layernorm.bias"]:
+ return True
+
+ if name.startswith("v") and not has_vision:
+ return True
+
+ if name.startswith("t") and not has_text:
+ return True
+
+ return False
+
+
+def get_tensor_name(name: str) -> str:
+ # Standardize the transformers llava next keys for
+ # image newline / mm projector with the classes in haotian-liu LLaVA
+ if name == "image_newline":
+ return "model.image_newline"
+ if name.startswith("multi_modal_projector"):
+ name = name.replace("multi_modal_projector", "mm")
+ if "linear_1" in name:
+ name = name.replace("linear_1", "0")
+ if "linear_2" in name:
+ name = name.replace("linear_2", "2")
+ return name
+
+ if "projection" in name:
+ return name
+ if "mm_projector" in name:
+ name = name.replace("model.mm_projector", "mm")
+ name = re.sub(r'mm\.mlp\.mlp', 'mm.model.mlp', name, count=1)
+ name = re.sub(r'mm\.peg\.peg', 'mm.model.peg', name, count=1)
+ return name
+
+ return name.replace("text_model", "t").replace("vision_model", "v").replace("encoder.layers", "blk").replace("embeddings.", "").replace("_proj", "").replace("self_attn.", "attn_").replace("layer_norm", "ln").replace("layernorm", "ln").replace("mlp.fc1", "ffn_down").replace("mlp.fc2", "ffn_up").replace("embedding", "embd").replace("final", "post").replace("layrnorm", "ln")
+
+
+def bytes_to_unicode():
+ """
+ Returns list of utf-8 byte and a corresponding list of unicode strings.
+ The reversible bpe codes work on unicode strings.
+ This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
+ When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
+ This is a significant percentage of your normal, say, 32K bpe vocab.
+ To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
+ And avoids mapping to whitespace/control characters the bpe code barfs on.
+ """
+ bs = (
+ list(range(ord("!"), ord("~") + 1))
+ + list(range(ord("¡"), ord("¬") + 1))
+ + list(range(ord("®"), ord("ÿ") + 1))
+ )
+ cs = bs[:]
+ n = 0
+ for b in range(2**8):
+ if b not in bs:
+ bs.append(b)
+ cs.append(2**8 + n)
+ n += 1
+ cs = [chr(n) for n in cs]
+ return dict(zip(bs, cs))
+
+
+ap = argparse.ArgumentParser()
+ap.add_argument("-m", "--model-dir", help="Path to model directory cloned from HF Hub", required=True)
+ap.add_argument("--use-f32", action="store_true", default=False, help="Use f32 instead of f16")
+ap.add_argument('--bigendian', action="store_true", default=False, help="Model is executed on big-endian machine")
+ap.add_argument("--text-only", action="store_true", required=False,
+ help="Save a text-only model. It can't be used to encode images")
+ap.add_argument("--vision-only", action="store_true", required=False,
+ help="Save a vision-only model. It can't be used to encode texts")
+ap.add_argument("--clip-model-is-vision", action="store_true", required=False,
+ help="The clip model is a pure vision model (ShareGPT4V vision extract for example)")
+
+# Selectable visual encoders that are compatible with this script
+encoder_group = ap.add_mutually_exclusive_group()
+encoder_group.add_argument("--clip-model-is-openclip", action="store_true", required=False,
+ help="The clip model is from openclip (for ViT-SO400M type))")
+encoder_group.add_argument("--clip-model-is-siglip", action="store_true", required=False,
+ help="the visual encoder is Siglip.")
+
+ap.add_argument("--llava-projector", help="Path to llava.projector file. If specified, save an image encoder for LLaVA models.")
+ap.add_argument("--projector-type", help="Type of projector. Possible values: mlp, ldp, ldpv2", choices=["mlp", "ldp", "ldpv2"], default="mlp")
+ap.add_argument("-o", "--output-dir", help="Directory to save GGUF files. Default is the original model directory", default=None)
+# Example --image_mean 0.48145466 0.4578275 0.40821073 --image_std 0.26862954 0.26130258 0.27577711
+# Example --image_mean 0.5 0.5 0.5 --image_std 0.5 0.5 0.5
+default_image_mean = [0.48145466, 0.4578275, 0.40821073]
+default_image_std = [0.26862954, 0.26130258, 0.27577711]
+ap.add_argument('--image-mean', type=float, nargs='+', help='Mean of the images for normalization (overrides processor) ', default=None)
+ap.add_argument('--image-std', type=float, nargs='+', help='Standard deviation of the images for normalization (overrides processor)', default=None)
+
+# parse command-line arguments
+args = ap.parse_args()
+
+
+if args.text_only and args.vision_only:
+ print("--text-only and --image-only arguments cannot be specified at the same time.")
+ exit(1)
+
+if args.use_f32:
+ print("WARNING: Weights for the convolution op is always saved in f16, as the convolution op in GGML does not support 32-bit kernel weights yet.")
+
+# output in the same directory as the model if output_dir is None
+dir_model = args.model_dir
+
+if (
+ args.clip_model_is_vision or
+ not os.path.exists(dir_model + "/vocab.json") or
+ args.clip_model_is_openclip or
+ args.clip_model_is_siglip
+):
+ vocab = None
+ tokens = None
+else:
+ with open(dir_model + "/vocab.json", "r", encoding="utf-8") as f:
+ vocab = json.load(f)
+ tokens = [key for key in vocab]
+
+with open(dir_model + "/config.json", "r", encoding="utf-8") as f:
+ config = json.load(f)
+ if args.clip_model_is_vision:
+ v_hparams = config
+ t_hparams = None
+ else:
+ v_hparams = config["vision_config"]
+ t_hparams = config["text_config"]
+
+# possible data types
+# ftype == 0 -> float32
+# ftype == 1 -> float16
+#
+# map from ftype to string
+ftype_str = ["f32", "f16"]
+
+ftype = 1
+if args.use_f32:
+ ftype = 0
+
+if args.clip_model_is_siglip:
+ model = SiglipVisionModel.from_pretrained(dir_model)
+ processor = None
+elif args.clip_model_is_vision or args.clip_model_is_openclip:
+ model = CLIPVisionModel.from_pretrained(dir_model)
+ processor = None
+else:
+ model = CLIPModel.from_pretrained(dir_model)
+ processor = CLIPProcessor.from_pretrained(dir_model)
+
+fname_middle = None
+has_text_encoder = True
+has_vision_encoder = True
+has_llava_projector = False
+if args.text_only:
+ fname_middle = "text-"
+ has_vision_encoder = False
+elif args.llava_projector is not None:
+ fname_middle = "mmproj-"
+ has_text_encoder = False
+ has_llava_projector = True
+elif args.vision_only:
+ fname_middle = "vision-"
+ has_text_encoder = False
+else:
+ fname_middle = ""
+
+output_dir = args.output_dir if args.output_dir is not None else dir_model
+os.makedirs(output_dir, exist_ok=True)
+output_prefix = os.path.basename(output_dir).replace("ggml_", "")
+fname_out = os.path.join(output_dir, f"{fname_middle}model-{ftype_str[ftype]}.gguf")
+fout = GGUFWriter(path=fname_out, arch="clip", endianess=GGUFEndian.LITTLE if not args.bigendian else GGUFEndian.BIG)
+
+fout.add_bool("clip.has_text_encoder", has_text_encoder)
+fout.add_bool("clip.has_vision_encoder", has_vision_encoder)
+fout.add_bool("clip.has_llava_projector", has_llava_projector)
+fout.add_file_type(ftype)
+model_name = config["_name_or_path"] if "_name_or_path" in config else os.path.basename(dir_model)
+fout.add_name(model_name)
+if args.text_only:
+ fout.add_description("text-only CLIP model")
+elif args.vision_only and not has_llava_projector:
+ fout.add_description("vision-only CLIP model")
+elif has_llava_projector:
+ fout.add_description("image encoder for LLaVA")
+ # add projector type
+ fout.add_string("clip.projector_type", args.projector_type)
+else:
+ fout.add_description("two-tower CLIP model")
+
+if has_text_encoder:
+ assert t_hparams is not None
+ assert tokens is not None
+ if args.clip_model_is_siglip:
+ text_projection_dim = 0
+ else:
+ text_projection_dim = t_hparams.get("projection_dim", config["projection_dim"])
+ # text_model hparams
+ fout.add_uint32(k(KEY_CONTEXT_LENGTH, TEXT), t_hparams["max_position_embeddings"])
+ fout.add_uint32(k(KEY_EMBEDDING_LENGTH, TEXT), t_hparams["hidden_size"])
+ fout.add_uint32(k(KEY_FEED_FORWARD_LENGTH, TEXT), t_hparams["intermediate_size"])
+ fout.add_uint32("clip.text.projection_dim", text_projection_dim)
+ fout.add_uint32(k(KEY_ATTENTION_HEAD_COUNT, TEXT), t_hparams["num_attention_heads"])
+ fout.add_float32(k(KEY_ATTENTION_LAYERNORM_EPS, TEXT), t_hparams["layer_norm_eps"])
+ fout.add_uint32(k(KEY_BLOCK_COUNT, TEXT), t_hparams["num_hidden_layers"])
+ fout.add_token_list(tokens)
+
+
+
+def get_non_negative_vision_feature_layers(v_hparams):
+ """
+ Determine the vision feature layer(s) for the llava model, which are indices into the
+ hidden states of the visual encoder. Note that the hidden states array generally takes the
+ form:
+
+ [<emb input>, <output of enc block 0>, ... <output of enc block num_hidden_layers>]
+
+ so feature indices should be offset as n+1 to get the output of encoder block n.
+ We convert all vision feature layers to non-negative so that -1 can be used in
+ the model as an unset value. If no vision feature layer is found, we leave it unset.
+ """
+ num_hidden_layers = v_hparams["num_hidden_layers"]
+ to_non_negative = lambda layer_idx: layer_idx if layer_idx >= 0 else num_hidden_layers + layer_idx + 1
+ feature_layers_key = None
+ # Key used for llava models in transformers
+ if "vision_feature_layer" in config:
+ feature_layers_key = "vision_feature_layer"
+ # Key used for llava models in the original format
+ elif "mm_vision_select_layer" in config:
+ feature_layers_key = "mm_vision_select_layer"
+ if feature_layers_key is not None:
+ feature_layers = config[feature_layers_key]
+ if isinstance(feature_layers, int):
+ feature_layers = [feature_layers]
+ return [to_non_negative(feature_layer) for feature_layer in feature_layers]
+
+# Determine if we have explicitly specified vision feature layers in our config
+feature_layers = get_non_negative_vision_feature_layers(v_hparams)
+
+if has_vision_encoder:
+ # Siglip does not have a visual projector; set projection dim to 0
+ if args.clip_model_is_siglip:
+ visual_projection_dim = 0
+ else:
+ visual_projection_dim = v_hparams.get("projection_dim", config["projection_dim"])
+
+ # set vision_model hparams
+ fout.add_uint32("clip.vision.image_size", v_hparams["image_size"])
+ fout.add_uint32("clip.vision.patch_size", v_hparams["patch_size"])
+ fout.add_uint32(k(KEY_EMBEDDING_LENGTH, VISION), v_hparams["hidden_size"])
+ fout.add_uint32(k(KEY_FEED_FORWARD_LENGTH, VISION), v_hparams["intermediate_size"])
+ fout.add_uint32("clip.vision.projection_dim", visual_projection_dim)
+ fout.add_uint32(k(KEY_ATTENTION_HEAD_COUNT, VISION), v_hparams["num_attention_heads"])
+ fout.add_float32(k(KEY_ATTENTION_LAYERNORM_EPS, VISION), v_hparams["layer_norm_eps"])
+ if feature_layers:
+ block_count = max(feature_layers)
+ else:
+ block_count = v_hparams["num_hidden_layers"] - 1 if has_llava_projector else v_hparams["num_hidden_layers"]
+ fout.add_uint32(k(KEY_BLOCK_COUNT, VISION), block_count)
+ # /**
+ # "image_grid_pinpoints": [
+ # [
+ # 336,
+ # 672
+ # ],
+ # [
+ # 672,
+ # 336
+ # ],
+ # [
+ # 672,
+ # 672
+ # ],
+ # [
+ # 1008,
+ # 336
+ # ],
+ # [
+ # 336,
+ # 1008
+ # ]
+ # ],
+ # Flattened:
+ # [
+ # 336, 672,
+ # 672, 336,
+ # 672, 672,
+ # 1008, 336,
+ # 336, 1008
+ # ]
+ # *
+ # */
+ if "image_grid_pinpoints" in v_hparams:
+ # flatten it
+ image_grid_pinpoints = []
+ for pinpoint in v_hparams["image_grid_pinpoints"]:
+ for p in pinpoint:
+ image_grid_pinpoints.append(p)
+ fout.add_array("clip.vision.image_grid_pinpoints", image_grid_pinpoints)
+ if "image_crop_resolution" in v_hparams:
+ fout.add_uint32("clip.vision.image_crop_resolution", v_hparams["image_crop_resolution"])
+ if "image_aspect_ratio" in v_hparams:
+ fout.add_string("clip.vision.image_aspect_ratio", v_hparams["image_aspect_ratio"])
+ if "image_split_resolution" in v_hparams:
+ fout.add_uint32("clip.vision.image_split_resolution", v_hparams["image_split_resolution"])
+ if "mm_patch_merge_type" in v_hparams:
+ fout.add_string("clip.vision.mm_patch_merge_type", v_hparams["mm_patch_merge_type"])
+ if "mm_projector_type" in v_hparams:
+ fout.add_string("clip.vision.mm_projector_type", v_hparams["mm_projector_type"])
+ if feature_layers:
+ fout.add_array("clip.vision.feature_layer", feature_layers)
+
+ if processor is not None:
+ image_mean = processor.image_processor.image_mean if args.image_mean is None or args.image_mean == default_image_mean else args.image_mean # pyright: ignore[reportAttributeAccessIssue]
+ image_std = processor.image_processor.image_std if args.image_std is None or args.image_std == default_image_std else args.image_std # pyright: ignore[reportAttributeAccessIssue]
+ else:
+ image_mean = args.image_mean if args.image_mean is not None else default_image_mean
+ image_std = args.image_std if args.image_std is not None else default_image_std
+ fout.add_array("clip.vision.image_mean", image_mean)
+ fout.add_array("clip.vision.image_std", image_std)
+
+use_gelu = v_hparams["hidden_act"] == "gelu"
+fout.add_bool("clip.use_gelu", use_gelu)
+
+
+if has_llava_projector:
+ # By default, we drop the last layer for llava projector
+ # models unless we have explicitly set vision feature layers
+ if feature_layers is None:
+ model.vision_model.encoder.layers.pop(-1)
+ else:
+ model.vision_model.encoder.layers = model.vision_model.encoder.layers[:max(feature_layers)]
+
+ projector = torch.load(args.llava_projector)
+ for name, data in projector.items():
+ name = get_tensor_name(name)
+ # pw and dw conv ndim==4
+ if data.ndim == 2 or data.ndim == 4:
+ data = data.squeeze().numpy().astype(np.float16)
+ else:
+ data = data.squeeze().numpy().astype(np.float32)
+
+ fout.add_tensor(name, data)
+
+ print("Projector tensors added\n")
+
+state_dict = model.state_dict()
+for name, data in state_dict.items():
+ if should_skip_tensor(name, has_text_encoder, has_vision_encoder, has_llava_projector):
+ # we don't need this
+ print(f"skipping parameter: {name}")
+ continue
+
+ name = get_tensor_name(name)
+ data = data.squeeze().numpy()
+
+ n_dims = len(data.shape)
+
+ # ftype == 0 -> float32, ftype == 1 -> float16
+ ftype_cur = 0
+ if n_dims == 4:
+ print(f"tensor {name} is always saved in f16")
+ data = data.astype(np.float16)
+ ftype_cur = 1
+ elif ftype == 1:
+ if name[-7:] == ".weight" and n_dims == 2:
+ print(" Converting to float16")
+ data = data.astype(np.float16)
+ ftype_cur = 1
+ else:
+ print(" Converting to float32")
+ data = data.astype(np.float32)
+ ftype_cur = 0
+ else:
+ if data.dtype != np.float32:
+ print(" Converting to float32")
+ data = data.astype(np.float32)
+ ftype_cur = 0
+
+ print(f"{name} - {ftype_str[ftype_cur]} - shape = {data.shape}")
+ fout.add_tensor(name, data)
+
+
+fout.write_header_to_file()
+fout.write_kv_data_to_file()
+fout.write_tensors_to_file()
+fout.close()
+
+print("Done. Output file: " + fname_out)
diff --git a/llama.cpp/tools/mtmd/legacy-models/glmedge-convert-image-encoder-to-gguf.py b/llama.cpp/tools/mtmd/legacy-models/glmedge-convert-image-encoder-to-gguf.py
new file mode 100644
index 0000000..848ef1c
--- /dev/null
+++ b/llama.cpp/tools/mtmd/legacy-models/glmedge-convert-image-encoder-to-gguf.py
@@ -0,0 +1,280 @@
+import argparse
+import os
+import json
+import re
+
+import torch
+import numpy as np
+from gguf import *
+
+TEXT = "clip.text"
+VISION = "clip.vision"
+from transformers import SiglipVisionModel, SiglipVisionConfig
+
+def k(raw_key: str, arch: str) -> str:
+ return raw_key.format(arch=arch)
+
+
+def should_skip_tensor(name: str, has_text: bool, has_vision: bool, has_llava: bool) -> bool:
+ if name in (
+ "logit_scale",
+ "text_model.embeddings.position_ids",
+ "vision_model.embeddings.position_ids",
+ ):
+ return True
+
+ if name in (
+ "vision_model.head.probe",
+ "vision_model.head.attention.in_proj_weight",
+ "vision_model.head.attention.in_proj_bias",
+ "vision_model.head.attention.out_proj.weight",
+ "vision_model.head.attention.out_proj.bias",
+ "vision_model.head.layernorm.weight",
+ "vision_model.head.layernorm.bias",
+ "vision_model.head.mlp.fc1.weight",
+ "vision_model.head.mlp.fc1.bias",
+ "vision_model.head.mlp.fc2.weight",
+ "vision_model.head.mlp.fc2.bias"
+ ):
+ return True
+
+ if name.startswith("v") and not has_vision:
+ return True
+
+ if name.startswith("t") and not has_text:
+ return True
+
+ return False
+
+
+def get_tensor_name(name: str) -> str:
+ if "projection" in name:
+ return name
+ if "mm_projector" in name:
+ name = name.replace("model.mm_projector", "mm")
+ name = re.sub(r'mm\.mlp\.mlp', 'mm.model.mlp', name, count=1)
+ name = re.sub(r'mm\.peg\.peg', 'mm.model.peg', name, count=1)
+ return name
+
+ return name.replace("text_model", "t").replace("vision_model", "v").replace("encoder.layers", "blk").replace("embeddings.", "").replace("_proj", "").replace("self_attn.", "attn_").replace("layer_norm", "ln").replace("layernorm", "ln").replace("mlp.fc1", "ffn_down").replace("mlp.fc2", "ffn_up").replace("embedding", "embd").replace("final", "post").replace("layrnorm", "ln")
+
+
+def bytes_to_unicode():
+ """
+ Returns list of utf-8 byte and a corresponding list of unicode strings.
+ The reversible bpe codes work on unicode strings.
+ This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
+ When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
+ This is a significant percentage of your normal, say, 32K bpe vocab.
+ To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
+ And avoids mapping to whitespace/control characters the bpe code barfs on.
+ """
+ bs = (
+ list(range(ord("!"), ord("~") + 1))
+ + list(range(ord("¡"), ord("¬") + 1))
+ + list(range(ord("®"), ord("ÿ") + 1))
+ )
+ cs = bs[:]
+ n = 0
+ for b in range(2**8):
+ if b not in bs:
+ bs.append(b)
+ cs.append(2**8 + n)
+ n += 1
+ cs = [chr(n) for n in cs]
+ return dict(zip(bs, cs))
+
+
+ap = argparse.ArgumentParser()
+ap.add_argument("-m", "--model-dir", help="Path to model directory cloned from HF Hub", required=True)
+ap.add_argument("--use-f32", action="store_true", default=False, help="Use f32 instead of f16")
+ap.add_argument("--text-only", action="store_true", required=False,
+ help="Save a text-only model. It can't be used to encode images")
+ap.add_argument("--vision-only", action="store_true", required=False,
+ help="Save a vision-only model. It can't be used to encode texts")
+ap.add_argument("--clip-model-is-vision", action="store_true", required=False,
+ help="The clip model is a pure vision model (ShareGPT4V vision extract for example)")
+ap.add_argument("--clip-model-is-openclip", action="store_true", required=False,
+ help="The clip model is from openclip (for ViT-SO400M type))")
+ap.add_argument("--llava-projector", help="Path to llava.projector file. If specified, save an image encoder for LLaVA models.")
+ap.add_argument("--projector-type", help="Type of projector. Possible values: mlp, ldp, ldpv2", choices=["mlp", "ldp", "ldpv2","adapter"], default="adapter")
+ap.add_argument("-o", "--output-dir", help="Directory to save GGUF files. Default is the original model directory", default=None)
+# Example --image_mean 0.48145466 0.4578275 0.40821073 --image_std 0.26862954 0.26130258 0.27577711
+# Example --image_mean 0.5 0.5 0.5 --image_std 0.5 0.5 0.5
+default_image_mean = [0.5, 0.5, 0.5]
+default_image_std = [0.5, 0.5, 0.5]
+ap.add_argument('--image-mean', type=float, nargs='+', help='Mean of the images for normalization (overrides processor) ', default=None)
+ap.add_argument('--image-std', type=float, nargs='+', help='Standard deviation of the images for normalization (overrides processor)', default=None)
+
+# parse command-line arguments
+args = ap.parse_args()
+
+
+if args.text_only and args.vision_only:
+ print("--text-only and --image-only arguments cannot be specified at the same time.")
+ exit(1)
+
+if args.use_f32:
+ print("WARNING: Weights for the convolution op is always saved in f16, as the convolution op in GGML does not support 32-bit kernel weights yet.")
+
+# output in the same directory as the model if output_dir is None
+dir_model = args.model_dir
+
+if args.clip_model_is_vision or not os.path.exists(dir_model + "/vocab.json") or args.clip_model_is_openclip:
+ vocab = None
+ tokens = None
+else:
+ with open(dir_model + "/vocab.json", "r", encoding="utf-8") as f:
+ vocab = json.load(f)
+ tokens = [key for key in vocab]
+
+with open(dir_model + "/config.json", "r", encoding="utf-8") as f:
+ config = json.load(f)
+ if args.clip_model_is_vision:
+ v_hparams = config
+ t_hparams = None
+ else:
+ v_hparams = config["vision_config"]
+ t_hparams = None
+
+# possible data types
+# ftype == 0 -> float32
+# ftype == 1 -> float16
+#
+# map from ftype to string
+ftype_str = ["f32", "f16"]
+
+ftype = 1
+if args.use_f32:
+ ftype = 0
+
+vision_config = SiglipVisionConfig(**v_hparams)
+model = SiglipVisionModel(vision_config)
+model.load_state_dict(torch.load(os.path.join(dir_model, "glm.clip")))
+
+fname_middle = None
+has_text_encoder = False
+has_vision_encoder = True
+has_glm_projector = True
+if args.text_only:
+ fname_middle = "text-"
+ has_vision_encoder = False
+elif args.llava_projector is not None:
+ fname_middle = "mmproj-"
+ has_text_encoder = False
+ has_glm_projector = True
+elif args.vision_only:
+ fname_middle = "vision-"
+ has_text_encoder = False
+else:
+ fname_middle = ""
+
+output_dir = args.output_dir if args.output_dir is not None else dir_model
+os.makedirs(output_dir, exist_ok=True)
+output_prefix = os.path.basename(output_dir).replace("ggml_", "")
+fname_out = os.path.join(output_dir, f"{fname_middle}model-{ftype_str[ftype]}.gguf")
+fout = GGUFWriter(path=fname_out, arch="clip")
+
+fout.add_bool("clip.has_text_encoder", has_text_encoder)
+fout.add_bool("clip.has_vision_encoder", has_vision_encoder)
+fout.add_bool("clip.has_glm_projector", has_glm_projector)
+fout.add_file_type(ftype)
+model_name = config["_name_or_path"] if "_name_or_path" in config else os.path.basename(dir_model)
+fout.add_name(model_name)
+if has_glm_projector:
+ fout.add_description("image encoder for glm4v")
+ fout.add_string("clip.projector_type", "adapter")
+else:
+ fout.add_description("two-tower CLIP model")
+
+if has_text_encoder:
+ assert t_hparams is not None
+ assert tokens is not None
+ # text_model hparams
+ fout.add_uint32(k(KEY_CONTEXT_LENGTH, TEXT), t_hparams["max_position_embeddings"])
+ fout.add_uint32(k(KEY_EMBEDDING_LENGTH, TEXT), t_hparams["hidden_size"])
+ fout.add_uint32(k(KEY_FEED_FORWARD_LENGTH, TEXT), t_hparams["intermediate_size"])
+ fout.add_uint32("clip.text.projection_dim", t_hparams.get("projection_dim", config["projection_dim"]))
+ fout.add_uint32(k(KEY_ATTENTION_HEAD_COUNT, TEXT), t_hparams["num_attention_heads"])
+ fout.add_float32(k(KEY_ATTENTION_LAYERNORM_EPS, TEXT), t_hparams["layer_norm_eps"])
+ fout.add_uint32(k(KEY_BLOCK_COUNT, TEXT), t_hparams["num_hidden_layers"])
+ fout.add_token_list(tokens)
+
+if has_vision_encoder:
+ # vision_model hparams
+ fout.add_uint32("clip.vision.image_size", v_hparams["image_size"])
+ fout.add_uint32("clip.vision.patch_size", v_hparams["patch_size"])
+ fout.add_uint32(k(KEY_EMBEDDING_LENGTH, VISION), v_hparams["hidden_size"])
+ fout.add_uint32(k(KEY_FEED_FORWARD_LENGTH, VISION), v_hparams["intermediate_size"])
+ fout.add_uint32("clip.vision.projection_dim", 0)
+ fout.add_uint32(k(KEY_ATTENTION_HEAD_COUNT, VISION), v_hparams["num_attention_heads"])
+ fout.add_float32(k(KEY_ATTENTION_LAYERNORM_EPS, VISION), 1e-6)
+ fout.add_uint32(k(KEY_BLOCK_COUNT, VISION), v_hparams["num_hidden_layers"])
+
+ image_mean = args.image_mean if args.image_mean is not None else default_image_mean
+ image_std = args.image_std if args.image_std is not None else default_image_std
+ fout.add_array("clip.vision.image_mean", image_mean)
+ fout.add_array("clip.vision.image_std", image_std)
+
+fout.add_bool("clip.use_gelu", True)
+
+
+if has_glm_projector:
+ # model.vision_model.encoder.layers.pop(-1) # pyright: ignore[reportAttributeAccessIssue]
+ projector = torch.load(args.llava_projector)
+ for name, data in projector.items():
+ name = get_tensor_name(name)
+ # pw and dw conv ndim==4
+ if data.ndim == 2 or data.ndim == 4:
+ data = data.squeeze().numpy().astype(np.float16)
+ else:
+ data = data.squeeze().numpy().astype(np.float32)
+ if name.startswith("vision."):
+ name=name.replace("vision.","")
+ fout.add_tensor(name, data)
+ print(f"Projector {name} - {data.dtype} - shape = {data.shape}")
+ # print(f"Projector {name} tensors added\n")
+
+state_dict = model.state_dict() # pyright: ignore[reportAttributeAccessIssue]
+for name, data in state_dict.items():
+ if should_skip_tensor(name, has_text_encoder, has_vision_encoder, has_glm_projector):
+ # we don't need this
+ print(f"skipping parameter: {name}")
+ continue
+
+ name = get_tensor_name(name)
+ data = data.squeeze().numpy()
+
+ n_dims = len(data.shape)
+
+ # ftype == 0 -> float32, ftype == 1 -> float16
+ ftype_cur = 0
+ if n_dims == 4:
+ print(f"tensor {name} is always saved in f16")
+ data = data.astype(np.float16)
+ ftype_cur = 1
+ elif ftype == 1:
+ if name[-7:] == ".weight" and n_dims == 2:
+ # print(" Converting to float16")
+ data = data.astype(np.float16)
+ ftype_cur = 1
+ else:
+ # print(" Converting to float32")
+ data = data.astype(np.float32)
+ ftype_cur = 0
+ else:
+ if data.dtype != np.float32:
+ # print(" Converting to float32")
+ data = data.astype(np.float32)
+ ftype_cur = 0
+ print(f"siglip {name} - {data.dtype} - shape = {data.shape}")
+ # print(f"{name} - {ftype_str[ftype_cur]} - shape = {data.shape}")
+ fout.add_tensor(name, data)
+
+
+fout.write_header_to_file()
+fout.write_kv_data_to_file()
+fout.write_tensors_to_file()
+fout.close()
+
+print("Done. Output file: " + fname_out)
diff --git a/llama.cpp/tools/mtmd/legacy-models/glmedge-surgery.py b/llama.cpp/tools/mtmd/legacy-models/glmedge-surgery.py
new file mode 100644
index 0000000..16bb915
--- /dev/null
+++ b/llama.cpp/tools/mtmd/legacy-models/glmedge-surgery.py
@@ -0,0 +1,33 @@
+import argparse
+import os
+import torch
+from transformers import AutoModel
+
+ap = argparse.ArgumentParser()
+ap.add_argument("-m", "--model", help="Path to GLM model")
+args = ap.parse_args()
+
+# find the model part that includes the multimodal projector weights
+model = AutoModel.from_pretrained(args.model, trust_remote_code=True, local_files_only=True)
+checkpoint = model.state_dict()
+
+# get a list of mm tensor names
+mm_tensors = [k for k, v in checkpoint.items() if k.startswith("vision.adapter.")]
+
+# store these tensors in a new dictionary and torch.save them
+projector = {name: checkpoint[name].float() for name in mm_tensors}
+torch.save(projector, f"{args.model}/glm.projector")
+
+clip_tensors = [k for k, v in checkpoint.items() if k.startswith("vision.vit.model.vision_model.")]
+if len(clip_tensors) > 0:
+ clip = {name.replace("vision.vit.model.", ""): checkpoint[name].float() for name in clip_tensors}
+ torch.save(clip, f"{args.model}/glm.clip")
+
+ # added tokens should be removed to be able to convert Mistral models
+ if os.path.exists(f"{args.model}/added_tokens.json"):
+ with open(f"{args.model}/added_tokens.json", "w") as f:
+ f.write("{}\n")
+
+print("Done!")
+print(f"Now you can convert {args.model} to a regular LLaMA GGUF file.")
+print(f"Also, use {args.model}glm.projector to prepare a glm-encoder.gguf file.")
diff --git a/llama.cpp/tools/mtmd/legacy-models/llava_surgery.py b/llama.cpp/tools/mtmd/legacy-models/llava_surgery.py
new file mode 100644
index 0000000..4f2da3b
--- /dev/null
+++ b/llama.cpp/tools/mtmd/legacy-models/llava_surgery.py
@@ -0,0 +1,38 @@
+import argparse
+import glob
+import os
+import torch
+
+
+ap = argparse.ArgumentParser()
+ap.add_argument("-m", "--model", help="Path to LLaVA v1.5 model")
+args = ap.parse_args()
+
+# find the model part that includes the multimodal projector weights
+path = sorted(glob.glob(f"{args.model}/pytorch_model*.bin"))[-1]
+checkpoint = torch.load(path)
+
+# get a list of mm tensor names
+mm_tensors = [k for k, v in checkpoint.items() if k.startswith("model.mm_projector")]
+
+# store these tensors in a new dictionary and torch.save them
+projector = {name: checkpoint[name].float() for name in mm_tensors}
+torch.save(projector, f"{args.model}/llava.projector")
+
+# BakLLaVA models contain CLIP tensors in it
+clip_tensors = [k for k, v in checkpoint.items() if k.startswith("model.vision_tower")]
+if len(clip_tensors) > 0:
+ clip = {name.replace("vision_tower.vision_tower.", ""): checkpoint[name].float() for name in clip_tensors}
+ torch.save(clip, f"{args.model}/llava.clip")
+
+
+ # added tokens should be removed to be able to convert Mistral models
+ if os.path.exists(f"{args.model}/added_tokens.json"):
+ with open(f"{args.model}/added_tokens.json", "w") as f:
+ f.write("{}\n")
+
+
+
+print("Done!")
+print(f"Now you can convert {args.model} to a regular LLaMA GGUF file.")
+print(f"Also, use {args.model}/llava.projector to prepare a llava-encoder.gguf file.")
diff --git a/llama.cpp/tools/mtmd/legacy-models/llava_surgery_v2.py b/llama.cpp/tools/mtmd/legacy-models/llava_surgery_v2.py
new file mode 100644
index 0000000..b07c3e3
--- /dev/null
+++ b/llama.cpp/tools/mtmd/legacy-models/llava_surgery_v2.py
@@ -0,0 +1,180 @@
+import argparse
+import glob
+import os
+import torch
+from safetensors import safe_open
+from safetensors.torch import save_file
+from typing import Any, ContextManager, cast
+
+# Function to determine if file is a SafeTensor file
+def is_safetensor_file(file_path):
+ return file_path.endswith('.safetensors')
+
+
+# Unified loading function
+def load_model(file_path):
+ if is_safetensor_file(file_path):
+ tensors = {}
+ with cast(ContextManager[Any], safe_open(file_path, framework="pt", device="cpu")) as f:
+ for key in f.keys():
+ tensors[key] = f.get_tensor(key).clone()
+ # output shape
+ print(f"{key} : {tensors[key].shape}")
+ return tensors, 'safetensor'
+ else:
+ return torch.load(file_path, map_location=torch.device('cpu')), 'pytorch'
+
+
+# Unified saving function
+def save_model(model, file_path, file_type):
+ if file_type == 'safetensor':
+ # safe_save(model, file_path)
+ save_file(model, file_path)
+ else:
+ torch.save(model, file_path)
+
+# Helpers to match weight names from specific components or
+# determine if a saved shard contains that component
+def is_vision_tower(weight_name):
+ return (
+ weight_name.startswith("model.vision_tower") or
+ weight_name.startswith("vit.") or
+ weight_name.startswith("vision_tower")
+ )
+
+def is_newline(weight_name):
+ return (
+ weight_name.startswith("model.image_newline") or
+ weight_name.startswith("image_newline")
+ )
+
+def is_mm_projector(weight_name):
+ return (
+ weight_name.startswith("model.mm_projector") or
+ weight_name.startswith("vision_proj.") or
+ weight_name.startswith("multi_modal_projector")
+ )
+
+def newline_criteria(checkpoint):
+ return any(is_newline(k) for k in checkpoint.keys())
+
+def proj_criteria(checkpoint):
+ return any(is_mm_projector(k) for k in checkpoint.keys())
+
+# Adapted function to clean vision tower from checkpoint
+def clean_vision_tower_from_checkpoint(checkpoint_path):
+ checkpoint, file_type = load_model(checkpoint_path)
+ # file_type = 'pytorch'
+ model_path = os.path.dirname(checkpoint_path)
+ print(f"Searching for vision tower tensors in {checkpoint_path}")
+ clip_tensors = [k for k, v in checkpoint.items() if is_vision_tower(k)]
+
+ if len(clip_tensors) > 0:
+ print(f"Found {len(clip_tensors)} tensors to extract from {checkpoint_path}")
+ # Adapted for file type
+ clip_path = os.path.join(model_path, "llava.clip")
+
+ if os.path.exists(clip_path):
+ print(f"Loading existing llava.clip from {clip_path}")
+ existing_clip, _ = load_model(clip_path)
+ else:
+ print(f"Creating new llava.clip at {clip_path}")
+ existing_clip = {}
+ # Update existing_clip with new tensors, avoid duplicates
+ for name in clip_tensors:
+ simple_name = name[name.index('vision_model.'):] if 'vision_model.' in name else name
+ print(f"Adding {simple_name} to llava.clip")
+ if simple_name not in existing_clip:
+ existing_clip[simple_name] = checkpoint[name]
+
+ # Save the updated clip tensors back to llava.clip
+ save_model(existing_clip, clip_path, 'pytorch')
+
+ # Remove the tensors from the original checkpoint
+ for name in clip_tensors:
+ del checkpoint[name]
+
+ checkpoint_path = checkpoint_path
+ return True
+ return False
+
+def find_relevant_checkpoints(checkpoint_paths, newline_criteria, projector):
+ newline_checkpoint_path = None
+ projector_checkpoint_path = None
+
+ for path in checkpoint_paths:
+ checkpoint, _ = load_model(path)
+ if newline_criteria(checkpoint) and newline_checkpoint_path is None:
+ newline_checkpoint_path = path
+ if projector(checkpoint):
+ projector_checkpoint_path = path
+
+ return newline_checkpoint_path, projector_checkpoint_path
+
+
+# Command-line interface setup
+ap = argparse.ArgumentParser()
+ap.add_argument("-m", "--model", required=True, help="Path to LLaVA v1.5+ model")
+ap.add_argument("-C", "--clean-vision-tower", action="store_true", help="Remove any vision tower from the model files")
+args = ap.parse_args()
+
+if args.clean_vision_tower:
+ # Generalized to handle both PyTorch and SafeTensors models
+ model_files = sorted(glob.glob(f"{args.model}/*"), key=os.path.getmtime, reverse=True)
+ # checkpoint_paths = [path for path in model_files if (path.endswith('.bin') and path.startswith('pytorch')) or (path.endswith('.safetensors') and path.startswith('model'))]
+ checkpoint_paths = [path for path in model_files if (path.endswith('.bin') and 'pytorch' in path.split('/')[-1].split('\\')[-1]) or (path.endswith('.safetensors') and 'model' in path.split('/')[-1].split('\\')[-1])]
+ for projector_checkpoint_path in checkpoint_paths:
+ print(f"Cleaning {projector_checkpoint_path}")
+ if not clean_vision_tower_from_checkpoint(projector_checkpoint_path):
+ print(f"No vision tower found in {projector_checkpoint_path}")
+ # we break once none is found, so far all models append them at the end
+ # break
+ print("Done! All vision tower tensors are removed from the model files and stored in llava.clip file.")
+
+# Now we look for the projector in the last checkpoint
+model_files = sorted(glob.glob(f"{args.model}/*"), key=os.path.getmtime, reverse=True)
+checkpoint_paths = [path for path in model_files if (path.endswith('.bin') and 'pytorch' in path.split('/')[-1].split('\\')[-1]) or (path.endswith('.safetensors') and 'model' in path.split('/')[-1].split('\\')[-1])]
+# last_checkpoint_path = checkpoint_paths[0]
+# first_checkpoint_path = checkpoint_paths[-1]
+newline_checkpoint_path, projector_checkpoint_path = find_relevant_checkpoints(checkpoint_paths, newline_criteria, proj_criteria)
+
+print(f"Taking projector from {projector_checkpoint_path}")
+first_mm_tensors = []
+first_checkpoint = None
+if newline_checkpoint_path is not None:
+ print(f"Taking newline from {newline_checkpoint_path}")
+ first_checkpoint, file_type = load_model(newline_checkpoint_path)
+ first_mm_tensors = [k for k, v in first_checkpoint.items() if is_newline(k)]
+
+# Load the checkpoint
+mm_tensors = []
+last_checkpoint = None
+if projector_checkpoint_path is not None:
+ last_checkpoint, file_type = load_model(projector_checkpoint_path)
+ mm_tensors = [k for k, v in last_checkpoint.items() if is_mm_projector(k)]
+
+if len(mm_tensors) == 0:
+ if last_checkpoint is not None:
+ for k, v in last_checkpoint.items():
+ print(k)
+ print(f"Found {len(mm_tensors)} tensors to extract out of {len(last_checkpoint) if last_checkpoint is not None else 0} tensors.")
+ print("No tensors found. Is this a LLaVA model?")
+ exit()
+
+print(f"Found {len(mm_tensors)} tensors to extract.")
+print(f"Found additional {len(first_mm_tensors)} tensors to extract.")
+# projector = {name: checkpoint.[name].float() for name in mm_tensors}
+projector = {}
+for name in mm_tensors:
+ assert last_checkpoint is not None
+ projector[name] = last_checkpoint[name].float()
+for name in first_mm_tensors:
+ assert first_checkpoint is not None
+ projector[name] = first_checkpoint[name].float()
+
+if len(projector) > 0:
+ save_model(projector, f"{args.model}/llava.projector", 'pytorch')
+
+print("Done!")
+print(f"Now you can convert {args.model} to a regular LLaMA GGUF file.")
+print(f"Also, use {args.model}/llava.projector to prepare a llava-encoder.gguf file.")
diff --git a/llama.cpp/tools/mtmd/legacy-models/minicpmv-convert-image-encoder-to-gguf.py b/llama.cpp/tools/mtmd/legacy-models/minicpmv-convert-image-encoder-to-gguf.py
new file mode 100644
index 0000000..944037e
--- /dev/null
+++ b/llama.cpp/tools/mtmd/legacy-models/minicpmv-convert-image-encoder-to-gguf.py
@@ -0,0 +1,892 @@
+# coding=utf-8
+# Copyright 2024 Google AI and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" PyTorch Siglip model. """
+# Copied from HuggingFaceM4/siglip-so400m-14-980-flash-attn2-navit and add tgt_sizes
+
+
+import os
+import math
+import warnings
+
+import numpy as np
+import torch
+import torch.nn.functional as F
+from torch import nn
+from torch.nn.init import _calculate_fan_in_and_fan_out
+
+from transformers.activations import ACT2FN
+from transformers.modeling_utils import PreTrainedModel
+from transformers.configuration_utils import PretrainedConfig
+from transformers.utils import (
+ logging,
+)
+from transformers.utils import logging
+
+logger = logging.get_logger(__name__)
+
+class SiglipVisionConfig(PretrainedConfig):
+ r"""
+ This is the configuration class to store the configuration of a [`SiglipVisionModel`]. It is used to instantiate a
+ Siglip vision encoder according to the specified arguments, defining the model architecture. Instantiating a
+ configuration with the defaults will yield a similar configuration to that of the vision encoder of the Siglip
+ [google/siglip-base-patch16-224](https://huggingface.co/google/siglip-base-patch16-224) architecture.
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+ documentation from [`PretrainedConfig`] for more information.
+ Args:
+ hidden_size (`int`, *optional*, defaults to 768):
+ Dimensionality of the encoder layers and the pooler layer.
+ intermediate_size (`int`, *optional*, defaults to 3072):
+ Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
+ num_hidden_layers (`int`, *optional*, defaults to 12):
+ Number of hidden layers in the Transformer encoder.
+ num_attention_heads (`int`, *optional*, defaults to 12):
+ Number of attention heads for each attention layer in the Transformer encoder.
+ num_channels (`int`, *optional*, defaults to 3):
+ Number of channels in the input images.
+ image_size (`int`, *optional*, defaults to 224):
+ The size (resolution) of each image.
+ patch_size (`int`, *optional*, defaults to 16):
+ The size (resolution) of each patch.
+ hidden_act (`str` or `function`, *optional*, defaults to `"gelu_pytorch_tanh"`):
+ The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
+        `"relu"`, `"selu"`, `"gelu_new"` and `"quick_gelu"` are supported.
+ layer_norm_eps (`float`, *optional*, defaults to 1e-06):
+ The epsilon used by the layer normalization layers.
+ attention_dropout (`float`, *optional*, defaults to 0.0):
+ The dropout ratio for the attention probabilities.
+ Example:
+ ```python
+ >>> from transformers import SiglipVisionConfig, SiglipVisionModel
+ >>> # Initializing a SiglipVisionConfig with google/siglip-base-patch16-224 style configuration
+ >>> configuration = SiglipVisionConfig()
+ >>> # Initializing a SiglipVisionModel (with random weights) from the google/siglip-base-patch16-224 style configuration
+ >>> model = SiglipVisionModel(configuration)
+ >>> # Accessing the model configuration
+ >>> configuration = model.config
+ ```"""
+
+ model_type = "siglip_vision_model"
+
+ def __init__(
+ self,
+ hidden_size=768,
+ intermediate_size=3072,
+ num_hidden_layers=12,
+ num_attention_heads=12,
+ num_channels=3,
+ image_size=224,
+ patch_size=16,
+ hidden_act="gelu_pytorch_tanh",
+ layer_norm_eps=1e-6,
+ attention_dropout=0.0,
+ **kwargs,
+ ):
+ super().__init__(**kwargs)
+
+ self.hidden_size = hidden_size
+ self.intermediate_size = intermediate_size
+ self.num_hidden_layers = num_hidden_layers
+ self.num_attention_heads = num_attention_heads
+ self.num_channels = num_channels
+ self.patch_size = patch_size
+ self.image_size = image_size
+ self.attention_dropout = attention_dropout
+ self.layer_norm_eps = layer_norm_eps
+ self.hidden_act = hidden_act
+
+_CHECKPOINT_FOR_DOC = "google/siglip-base-patch16-224"
+
+SIGLIP_PRETRAINED_MODEL_ARCHIVE_LIST = [
+ "google/siglip-base-patch16-224",
+ # See all SigLIP models at https://huggingface.co/models?filter=siglip
+]
+
+# Copied from transformers.models.llama.modeling_llama._get_unpad_data
+def _get_unpad_data(attention_mask):
+ seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)
+ indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
+ max_seqlen_in_batch = seqlens_in_batch.max().item()
+ cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0))
+ return (
+ indices,
+ cu_seqlens,
+ max_seqlen_in_batch,
+ )
+
+
+def _trunc_normal_(tensor, mean, std, a, b):
+ # Cut & paste from PyTorch official master until it's in a few official releases - RW
+ # Method based on https://people.sc.fsu.edu/~jburkardt/presentations/truncated_normal.pdf
+ def norm_cdf(x):
+ # Computes standard normal cumulative distribution function
+ return (1.0 + math.erf(x / math.sqrt(2.0))) / 2.0
+
+ if (mean < a - 2 * std) or (mean > b + 2 * std):
+ warnings.warn(
+ "mean is more than 2 std from [a, b] in nn.init.trunc_normal_. "
+ "The distribution of values may be incorrect.",
+ stacklevel=2,
+ )
+
+ # Values are generated by using a truncated uniform distribution and
+ # then using the inverse CDF for the normal distribution.
+ # Get upper and lower cdf values
+ l = norm_cdf((a - mean) / std)
+ u = norm_cdf((b - mean) / std)
+
+ # Uniformly fill tensor with values from [l, u], then translate to
+ # [2l-1, 2u-1].
+ tensor.uniform_(2 * l - 1, 2 * u - 1)
+
+ # Use inverse cdf transform for normal distribution to get truncated
+ # standard normal
+ if tensor.dtype in [torch.float16, torch.bfloat16]:
+ # The `erfinv_` op is not (yet?) defined in float16+cpu, bfloat16+gpu
+ og_dtype = tensor.dtype
+ tensor = tensor.to(torch.float32)
+ tensor.erfinv_()
+ tensor = tensor.to(og_dtype)
+ else:
+ tensor.erfinv_()
+
+ # Transform to proper mean, std
+ tensor.mul_(std * math.sqrt(2.0))
+ tensor.add_(mean)
+
+ # Clamp to ensure it's in the proper range
+ if tensor.dtype == torch.float16:
+ # The `clamp_` op is not (yet?) defined in float16+cpu
+ tensor = tensor.to(torch.float32)
+ tensor.clamp_(min=a, max=b)
+ tensor = tensor.to(torch.float16)
+ else:
+ tensor.clamp_(min=a, max=b)
+
+
+def trunc_normal_tf_(
+ tensor: torch.Tensor, mean: float = 0.0, std: float = 1.0, a: float = -2.0, b: float = 2.0
+):
+ """Fills the input Tensor with values drawn from a truncated
+ normal distribution. The values are effectively drawn from the
+ normal distribution :math:`\\mathcal{N}(\text{mean}, \text{std}^2)`
+ with values outside :math:`[a, b]` redrawn until they are within
+ the bounds. The method used for generating the random values works
+ best when :math:`a \\leq \text{mean} \\leq b`.
+ NOTE: this 'tf' variant behaves closer to Tensorflow / JAX impl where the
+ bounds [a, b] are applied when sampling the normal distribution with mean=0, std=1.0
+    and the result is subsequently scaled and shifted by the mean and std args.
+ Args:
+ tensor: an n-dimensional `torch.Tensor`
+ mean: the mean of the normal distribution
+ std: the standard deviation of the normal distribution
+ a: the minimum cutoff value
+ b: the maximum cutoff value
+ """
+ with torch.no_grad():
+ _trunc_normal_(tensor, 0, 1.0, a, b)
+ tensor.mul_(std).add_(mean)
+
+
+def variance_scaling_(tensor, scale=1.0, mode="fan_in", distribution="normal"):
+ fan_in, fan_out = _calculate_fan_in_and_fan_out(tensor)
+ denom = fan_in
+ if mode == "fan_in":
+ denom = fan_in
+ elif mode == "fan_out":
+ denom = fan_out
+ elif mode == "fan_avg":
+ denom = (fan_in + fan_out) / 2
+
+ variance = scale / denom
+
+ if distribution == "truncated_normal":
+ # constant is stddev of standard normal truncated to (-2, 2)
+ trunc_normal_tf_(tensor, std=math.sqrt(variance) / 0.87962566103423978)
+ elif distribution == "normal":
+ with torch.no_grad():
+ tensor.normal_(std=math.sqrt(variance))
+ elif distribution == "uniform":
+ bound = math.sqrt(3 * variance)
+ with torch.no_grad():
+ tensor.uniform_(-bound, bound)
+ else:
+ raise ValueError(f"invalid distribution {distribution}")
+
+
+def lecun_normal_(tensor):
+ variance_scaling_(tensor, mode="fan_in", distribution="truncated_normal")
+
+
+def default_flax_embed_init(tensor):
+ variance_scaling_(tensor, mode="fan_in", distribution="normal")
+
+class SiglipVisionEmbeddings(nn.Module):
+ def __init__(self, config: SiglipVisionConfig):
+ super().__init__()
+ self.config = config
+ self.embed_dim = config.hidden_size
+ self.image_size = config.image_size
+ self.patch_size = config.patch_size
+
+ self.patch_embedding = nn.Conv2d(
+ in_channels=config.num_channels,
+ out_channels=self.embed_dim,
+ kernel_size=self.patch_size,
+ stride=self.patch_size,
+ padding="valid",
+ )
+
+ self.num_patches_per_side = self.image_size // self.patch_size
+ self.num_patches = self.num_patches_per_side**2
+ self.num_positions = self.num_patches
+ self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim)
+
+class SiglipAttention(nn.Module):
+ """Multi-headed attention from 'Attention Is All You Need' paper"""
+
+ # Copied from transformers.models.clip.modeling_clip.CLIPAttention.__init__
+ def __init__(self, config):
+ super().__init__()
+ self.config = config
+ self.embed_dim = config.hidden_size
+ self.num_heads = config.num_attention_heads
+ self.head_dim = self.embed_dim // self.num_heads
+ if self.head_dim * self.num_heads != self.embed_dim:
+ raise ValueError(
+ f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
+ f" {self.num_heads})."
+ )
+ self.scale = self.head_dim**-0.5
+ self.dropout = config.attention_dropout
+
+ self.k_proj = nn.Linear(self.embed_dim, self.embed_dim)
+ self.v_proj = nn.Linear(self.embed_dim, self.embed_dim)
+ self.q_proj = nn.Linear(self.embed_dim, self.embed_dim)
+ self.out_proj = nn.Linear(self.embed_dim, self.embed_dim)
+
+# Copied from transformers.models.clip.modeling_clip.CLIPMLP with CLIP->Siglip
+class SiglipMLP(nn.Module):
+ def __init__(self, config):
+ super().__init__()
+ self.config = config
+ self.activation_fn = ACT2FN[config.hidden_act]
+ self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size)
+ self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size)
+
+
+# Copied from transformers.models.clip.modeling_clip.CLIPEncoderLayer with CLIP->Siglip
+class SiglipEncoderLayer(nn.Module):
+ def __init__(self, config: SiglipVisionConfig):
+ super().__init__()
+ self.embed_dim = config.hidden_size
+ self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2"
+ self.self_attn = (
+ SiglipAttention(config)
+ )
+ self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
+ self.mlp = SiglipMLP(config)
+ self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
+
+class SiglipPreTrainedModel(PreTrainedModel):
+ """
+ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
+ models.
+ """
+
+ config_class = SiglipVisionConfig
+ base_model_prefix = "siglip"
+ supports_gradient_checkpointing = True
+
+ def _init_weights(self, module):
+ """Initialize the weights"""
+
+ if isinstance(module, SiglipVisionEmbeddings):
+ width = self.config.hidden_size
+ nn.init.normal_(module.position_embedding.weight, std=1 / np.sqrt(width))
+ elif isinstance(module, nn.Embedding):
+ default_flax_embed_init(module.weight)
+ elif isinstance(module, SiglipAttention):
+ nn.init.normal_(module.q_proj.weight)
+ nn.init.normal_(module.k_proj.weight)
+ nn.init.normal_(module.v_proj.weight)
+ nn.init.normal_(module.out_proj.weight)
+ nn.init.zeros_(module.q_proj.bias)
+ nn.init.zeros_(module.k_proj.bias)
+ nn.init.zeros_(module.v_proj.bias)
+ nn.init.zeros_(module.out_proj.bias)
+ elif isinstance(module, SiglipMLP):
+ nn.init.normal_(module.fc1.weight)
+ nn.init.normal_(module.fc2.weight)
+ nn.init.normal_(module.fc1.bias, std=1e-6)
+ nn.init.normal_(module.fc2.bias, std=1e-6)
+ elif isinstance(module, (nn.Linear, nn.Conv2d)):
+ lecun_normal_(module.weight)
+ if module.bias is not None:
+ nn.init.zeros_(module.bias)
+ elif isinstance(module, nn.LayerNorm):
+ module.bias.data.zero_()
+ module.weight.data.fill_(1.0)
+
+
+SIGLIP_START_DOCSTRING = r"""
+ This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
+ library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
+ etc.)
+ This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
+ Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
+ and behavior.
+ Parameters:
+ config ([`SiglipVisionConfig`]): Model configuration class with all the parameters of the model.
+ Initializing with a config file does not load the weights associated with the model, only the
+ configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
+"""
+
+
+SIGLIP_VISION_INPUTS_DOCSTRING = r"""
+ Args:
+ pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
+ Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using
+ [`AutoImageProcessor`]. See [`CLIPImageProcessor.__call__`] for details.
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+ tensors for more detail.
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+ more detail.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+"""
+
+
+# Copied from transformers.models.clip.modeling_clip.CLIPEncoder with CLIP->Siglip
+class SiglipEncoder(nn.Module):
+ """
+ Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
+ [`SiglipEncoderLayer`].
+ Args:
+ config: SiglipConfig
+ """
+
+ def __init__(self, config: SiglipVisionConfig):
+ super().__init__()
+ self.config = config
+ self.layers = nn.ModuleList([SiglipEncoderLayer(config) for _ in range(config.num_hidden_layers)])
+ self.gradient_checkpointing = False
+
+class SiglipVisionTransformer(SiglipPreTrainedModel):
+ config_class = SiglipVisionConfig
+ main_input_name = "pixel_values"
+ _supports_flash_attn_2 = True
+
+ def __init__(self, config: SiglipVisionConfig):
+ super().__init__(config)
+ self.config = config
+ embed_dim = config.hidden_size
+
+ self.embeddings = SiglipVisionEmbeddings(config)
+ self.encoder = SiglipEncoder(config)
+ self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
+ self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2"
+
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ def get_input_embeddings(self) -> nn.Module:
+ return self.embeddings.patch_embedding
+
+import argparse
+import json
+import re
+
+import numpy as np
+from gguf import *
+from transformers.models.idefics2.modeling_idefics2 import Idefics2VisionTransformer
+from transformers.models.idefics2.configuration_idefics2 import Idefics2VisionConfig
+
+TEXT = "clip.text"
+VISION = "clip.vision"
+
+
+def add_key_str(raw_key: str, arch: str) -> str:
+ return raw_key.format(arch=arch)
+
+
+def should_skip_tensor(name: str, has_text: bool, has_vision: bool, has_minicpmv: bool) -> bool:
+ if name in (
+ "logit_scale",
+ "text_model.embeddings.position_ids",
+ "vision_model.embeddings.position_ids",
+ ):
+ return True
+
+ if has_minicpmv and name in ["visual_projection.weight"]:
+ return True
+
+ if name.startswith("v") and not has_vision:
+ return True
+
+ if name.startswith("t") and not has_text:
+ return True
+
+ return False
+
+
+def get_tensor_name(name: str) -> str:
+ if "projection" in name:
+ return name
+ if "mm_projector" in name:
+ name = name.replace("model.mm_projector", "mm")
+ name = re.sub(r'mm\.mlp\.mlp', 'mm.model.mlp', name, count=1)
+ name = re.sub(r'mm\.peg\.peg', 'mm.model.peg', name, count=1)
+ return name
+
+ return name.replace("text_model", "t").replace("vision_model", "v").replace("encoder.layers", "blk").replace("embeddings.", "").replace("_proj", "").replace("self_attn.", "attn_").replace("layer_norm", "ln").replace("layernorm", "ln").replace("mlp.fc1", "ffn_down").replace("mlp.fc2", "ffn_up").replace("embedding", "embd").replace("final", "post").replace("layrnorm", "ln")
+
+
+def bytes_to_unicode():
+ """
+ Returns list of utf-8 byte and a corresponding list of unicode strings.
+ The reversible bpe codes work on unicode strings.
+ This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
+ When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
+ This is a significant percentage of your normal, say, 32K bpe vocab.
+ To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
+ And avoids mapping to whitespace/control characters the bpe code barfs on.
+ """
+ bs = (
+ list(range(ord("!"), ord("~") + 1))
+ + list(range(ord("¡"), ord("¬") + 1))
+ + list(range(ord("®"), ord("ÿ") + 1))
+ )
+ cs = bs[:]
+ n = 0
+ for b in range(2**8):
+ if b not in bs:
+ bs.append(b)
+ cs.append(2**8 + n)
+ n += 1
+ cs = [chr(n) for n in cs]
+ return dict(zip(bs, cs))
+
+
+ap = argparse.ArgumentParser()
+ap.add_argument("-m", "--model-dir", help="Path to model directory cloned from HF Hub", required=True)
+ap.add_argument("--use-f32", action="store_true", default=False, help="Use f32 instead of f16")
+ap.add_argument("--text-only", action="store_true", required=False,
+ help="Save a text-only model. It can't be used to encode images")
+ap.add_argument("--vision-only", action="store_true", required=False,
+ help="Save a vision-only model. It can't be used to encode texts")
+ap.add_argument("--clip-model-is-vision", action="store_true", required=False,
+ help="The clip model is a pure vision model (ShareGPT4V vision extract for example)")
+ap.add_argument("--clip-model-is-openclip", action="store_true", required=False,
+ help="The clip model is from openclip (for ViT-SO400M type))")
+ap.add_argument("--minicpmv-projector", help="Path to minicpmv.projector file. If specified, save an image encoder for MiniCPM-V models.")
+ap.add_argument("--projector-type", help="Type of projector. Possible values: mlp, ldp, ldpv2", choices=["mlp", "ldp", "ldpv2"], default="mlp")
+ap.add_argument("-o", "--output-dir", help="Directory to save GGUF files. Default is the original model directory", default=None)
+# Example --image_mean 0.48145466 0.4578275 0.40821073 --image_std 0.26862954 0.26130258 0.27577711
+# Example --image_mean 0.5 0.5 0.5 --image_std 0.5 0.5 0.5
+default_image_mean = [0.5, 0.5, 0.5]
+default_image_std = [0.5, 0.5, 0.5]
+ap.add_argument('--image-mean', type=float, nargs='+', help='Mean of the images for normalization (overrides processor) ', default=None)
+ap.add_argument('--image-std', type=float, nargs='+', help='Standard deviation of the images for normalization (overrides processor)', default=None)
+ap.add_argument('--minicpmv_version', type=int, help='minicpmv_version: MiniCPM-V-2 use 1; MiniCPM-V-2.5 use 2; MiniCPM-V-2.6 use 3; MiniCPM-o-2.6 use 4; MiniCPM-V 4.0 use 5; MiniCPM-o-4.0 use 6; MiniCPM-o-4.5 use 100045', default=2)
+
+# with proper
+args = ap.parse_args()
+
+
+if args.text_only and args.vision_only:
+ print("--text-only and --image-only arguments cannot be specified at the same time.")
+ exit(1)
+
+if args.use_f32:
+ print("WARNING: Weights for the convolution op is always saved in f16, as the convolution op in GGML does not support 32-bit kernel weights yet.")
+
+# output in the same directory as the model if output_dir is None
+dir_model = args.model_dir
+
+# Read config.json to get actual model configuration
+config_path = os.path.join(dir_model, "config.json")
+model_config = {}
+if os.path.isfile(config_path):
+    with open(config_path, "r", encoding="utf-8") as f:
+        model_config = json.load(f)
+    print(f"Loaded config from {config_path}")
+else:
+    # missing config.json is non-fatal: hardcoded fallbacks are used further below
+    print(f"Warning: config.json not found at {config_path}")
+
+# If minicpmv_projector is not specified but the default path exists, use the default path
+if args.minicpmv_projector is None:
+    default_projector_path = os.path.join(dir_model, "minicpmv.projector")
+    if os.path.isfile(default_projector_path):
+        args.minicpmv_projector = default_projector_path
+        print(f"Found default projector file: {default_projector_path}")
+
+# If output_dir is not specified, use model_dir as the default value
+if args.output_dir is None:
+    args.output_dir = dir_model
+
+# vision-only / OpenCLIP checkpoints carry no text vocab; otherwise load vocab.json
+if args.clip_model_is_vision or not os.path.exists(dir_model + "/vocab.json") or args.clip_model_is_openclip:
+    vocab = None
+    tokens = None
+else:
+    with open(dir_model + "/vocab.json", "r", encoding="utf-8") as f:
+        vocab = json.load(f)
+    tokens = [key for key in vocab]
+
+# possible data types
+# ftype == 0 -> float32
+# ftype == 1 -> float16
+#
+# map from ftype to string
+ftype_str = ["f32", "f16"]
+
+# default to f16 output; --use-f32 forces f32
+ftype = 1
+if args.use_f32:
+    ftype = 0
+
+# if args.clip_model_is_vision or args.clip_model_is_openclip:
+#     model = CLIPVisionModel.from_pretrained(dir_model)
+#     processor = None
+# else:
+#     model = CLIPModel.from_pretrained(dir_model)
+#     processor = CLIPProcessor.from_pretrained(dir_model)
+
+minicpmv_version = args.minicpmv_version
+
+# Use actual config values instead of hardcoded ones
+if model_config:
+    # For the projector/resampler, use the main model's hidden_size
+    emb_dim = model_config.get("hidden_size", 1536)
+
+    # For the vision model, use vision_config values
+    vision_config_dict = model_config.get("vision_config", {})
+    default_vision_config = {
+        "hidden_size": vision_config_dict.get("hidden_size", 1152),
+        "image_size": vision_config_dict.get("image_size", 980),
+        "intermediate_size": vision_config_dict.get("intermediate_size", 4304),
+        "model_type": vision_config_dict.get("model_type", "siglip"),
+        "num_attention_heads": vision_config_dict.get("num_attention_heads", 16),
+        "num_hidden_layers": vision_config_dict.get("num_hidden_layers", 27),
+        "patch_size": vision_config_dict.get("patch_size", 14),
+    }
+
+    # Use vision model's num_hidden_layers for block_count
+    block_count = vision_config_dict.get("num_hidden_layers", 27)
+
+    print(f"Using config values: emb_dim={emb_dim}, block_count={block_count}")
+    print(f"Vision config: {default_vision_config}")
+else:
+    # Fallback to original hardcoded logic if config.json not found
+    # (emb_dim / block_count per released MiniCPM-V/o variant)
+    emb_dim = 4096
+    block_count = 26
+    if minicpmv_version == 1:
+        emb_dim = 2304
+        block_count = 26
+    elif minicpmv_version == 2:
+        emb_dim = 4096
+        block_count = 27
+    elif minicpmv_version == 3:
+        emb_dim = 3584
+        block_count = 27
+    elif minicpmv_version == 4:
+        emb_dim = 3584
+        block_count = 27
+    elif minicpmv_version == 5:
+        emb_dim = 2560
+        block_count = 27
+    elif minicpmv_version == 6:
+        emb_dim = 4096
+        block_count = 27
+    elif minicpmv_version == 100045:
+        emb_dim = 4096
+        block_count = 27
+
+    default_vision_config = {
+        "hidden_size": 1152,
+        "image_size": 980,
+        "intermediate_size": 4304,
+        "model_type": "idefics2",
+        "num_attention_heads": 16,
+        "num_hidden_layers": 27,
+        "patch_size": 14,
+    }
+
+# default vision tower: Idefics2; overridden below for SigLIP-based variants
+vision_config = Idefics2VisionConfig(**default_vision_config)
+model = Idefics2VisionTransformer(vision_config)
+# version 3 (or any model whose config declares a siglip vision tower) uses SigLIP
+if minicpmv_version == 3 or (model_config and model_config.get("vision_config", {}).get("model_type") == "siglip"):
+    vision_config = SiglipVisionConfig(**default_vision_config)
+    model = SiglipVisionTransformer(vision_config)
+elif minicpmv_version == 4:
+    vision_config = SiglipVisionConfig(**default_vision_config)
+    model = SiglipVisionTransformer(vision_config)
+elif minicpmv_version == 5:
+    default_vision_config["model_type"] = "siglip_vision_model"
+    vision_config = SiglipVisionConfig(**default_vision_config)
+    model = SiglipVisionTransformer(vision_config)
+elif minicpmv_version == 6:
+    default_vision_config["model_type"] = "siglip_vision_model"
+    vision_config = SiglipVisionConfig(**default_vision_config)
+    model = SiglipVisionTransformer(vision_config)
+elif minicpmv_version == 100045:
+    default_vision_config["model_type"] = "siglip_vision_model"
+    vision_config = SiglipVisionConfig(**default_vision_config)
+    model = SiglipVisionTransformer(vision_config)
+
+# NOTE(review): processor is always None here, so the processor-based image_mean/std
+# branch further below is currently dead code
+processor = None
+# if model.attn_pool is not None:
+#     model.attn_pool = torch.nn.Identity()
+
+# model.blocks = model.blocks[:-1]
+# load the vision-tower weights extracted by minicpmv-surgery.py
+model.load_state_dict(torch.load(os.path.join(dir_model, "minicpmv.clip")))
+
+# decide which sub-model is being exported and name the output accordingly
+fname_middle = None
+has_text_encoder = True
+has_vision_encoder = True
+has_minicpmv_projector = False
+
+if args.text_only:
+    fname_middle = "text-"
+    has_vision_encoder = False
+elif args.minicpmv_projector is not None:
+    # mmproj export: vision encoder + resampler projector, no text encoder
+    fname_middle = "mmproj-"
+    has_text_encoder = False
+    has_minicpmv_projector = True
+elif args.vision_only:
+    fname_middle = "vision-"
+    has_text_encoder = False
+else:
+    fname_middle = ""
+
+output_dir = args.output_dir
+os.makedirs(output_dir, exist_ok=True)
+output_prefix = os.path.basename(output_dir).replace("ggml_", "")
+fname_out = os.path.join(output_dir, f"{fname_middle}model-{ftype_str[ftype]}.gguf")
+fout = GGUFWriter(path=fname_out, arch="clip")
+
+# top-level CLIP metadata
+fout.add_bool("clip.has_text_encoder", has_text_encoder)
+fout.add_bool("clip.has_vision_encoder", has_vision_encoder)
+fout.add_bool("clip.has_minicpmv_projector", has_minicpmv_projector)
+fout.add_file_type(ftype)
+if args.text_only:
+    fout.add_description("text-only CLIP model")
+elif args.vision_only and not has_minicpmv_projector:
+    fout.add_description("vision-only CLIP model")
+elif has_minicpmv_projector:
+    fout.add_description("image encoder for MiniCPM-V")
+    # add projector type
+    fout.add_string("clip.projector_type", "resampler")
+    fout.add_int32("clip.minicpmv_version", minicpmv_version)
+else:
+    fout.add_description("two-tower CLIP model")
+
+if has_vision_encoder:
+    # vision_model hparams - use actual config values
+    vision_image_size = model_config.get("image_size", 448) if model_config else 448
+    vision_patch_size = default_vision_config.get("patch_size", 14)
+    vision_hidden_size = default_vision_config.get("hidden_size", 1152)
+    vision_intermediate_size = default_vision_config.get("intermediate_size", 4304)
+    vision_attention_heads = default_vision_config.get("num_attention_heads", 16)
+
+    fout.add_uint32("clip.vision.image_size", vision_image_size)
+    fout.add_uint32("clip.vision.patch_size", vision_patch_size)
+    fout.add_uint32(add_key_str(KEY_EMBEDDING_LENGTH, VISION), vision_hidden_size)
+    fout.add_uint32(add_key_str(KEY_FEED_FORWARD_LENGTH, VISION), vision_intermediate_size)
+    fout.add_uint32("clip.vision.projection_dim", 0)
+    fout.add_uint32(add_key_str(KEY_ATTENTION_HEAD_COUNT, VISION), vision_attention_heads)
+    fout.add_float32(add_key_str(KEY_ATTENTION_LAYERNORM_EPS, VISION), 1e-6)
+    fout.add_uint32(add_key_str(KEY_BLOCK_COUNT, VISION), block_count)
+
+    # Add MiniCPM-V specific parameters
+    # NOTE(review): resampler_emb_dim is computed but never written to the GGUF — confirm whether
+    # it should be emitted or removed
+    query_num = model_config.get("query_num", 0) if model_config else 0
+    resampler_emb_dim = model_config.get("hidden_size", 0) if model_config else 0
+    fout.add_uint32("clip.minicpmv_query_num", query_num)
+
+    # NOTE(review): processor is hardcoded to None above, so only the else branch runs
+    if processor is not None:
+        image_mean = processor.image_processor.image_mean if args.image_mean is None or args.image_mean == default_image_mean else args.image_mean
+        image_std = processor.image_processor.image_std if args.image_std is None or args.image_std == default_image_std else args.image_std
+    else:
+        image_mean = args.image_mean if args.image_mean is not None else default_image_mean
+        image_std = args.image_std if args.image_std is not None else default_image_std
+    fout.add_array("clip.vision.image_mean", image_mean)
+    fout.add_array("clip.vision.image_std", image_std)
+
+use_gelu = True
+fout.add_bool("clip.use_gelu", use_gelu)
+
+def get_1d_sincos_pos_embed_from_grid(embed_dim, pos):
+    """
+    Build 1D sine/cosine positional embeddings.
+
+    embed_dim: output dimension for each position (must be even: half sin, half cos)
+    pos: a list of positions to be encoded: size (M,)
+    out: (M, D)
+    """
+    assert embed_dim % 2 == 0
+    # frequency ladder: omega_i = 1 / 10000^(2i / embed_dim)
+    omega = np.arange(embed_dim // 2, dtype=np.float32)
+    omega /= embed_dim / 2.
+    omega = 1. / 10000 ** omega  # (D/2,)
+
+    pos = pos.reshape(-1)  # (M,)
+    out = np.einsum('m,d->md', pos, omega)  # (M, D/2), outer product
+
+    emb_sin = np.sin(out)  # (M, D/2)
+    emb_cos = np.cos(out)  # (M, D/2)
+
+    emb = np.concatenate([emb_sin, emb_cos], axis=1)  # (M, D)
+    return emb
+
+def get_2d_sincos_pos_embed_from_grid(embed_dim, grid):
+    """2D sincos embedding: concatenate independent 1D embeddings of the H and W grids."""
+    assert embed_dim % 2 == 0
+
+    # use half of dimensions to encode grid_h
+    emb_h = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[0])  # (H*W, D/2)
+    emb_w = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[1])  # (H*W, D/2)
+
+    emb = np.concatenate([emb_h, emb_w], axis=1)  # (H*W, D)
+    return emb
+
+
+# https://github.com/facebookresearch/mae/blob/efb2a8062c206524e35e47d04501ed4f544c0ae8/util/pos_embed.py#L20
+def get_2d_sincos_pos_embed(embed_dim, grid_size, cls_token=False):
+    """
+    grid_size: int of the grid height and width, or an (h, w) pair
+    return:
+    pos_embed: [grid_size*grid_size, embed_dim] or [1+grid_size*grid_size, embed_dim] (w/ or w/o cls_token)
+    """
+    if isinstance(grid_size, int):
+        grid_h_size, grid_w_size = grid_size, grid_size
+    else:
+        grid_h_size, grid_w_size = grid_size[0], grid_size[1]
+
+    grid_h = np.arange(grid_h_size, dtype=np.float32)
+    grid_w = np.arange(grid_w_size, dtype=np.float32)
+    grid = np.meshgrid(grid_w, grid_h)  # here w goes first
+    grid = np.stack(grid, axis=0)
+
+    grid = grid.reshape([2, 1, grid_h_size, grid_w_size])
+    pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid)
+    if cls_token:
+        # prepend an all-zero row for the [CLS] position
+        pos_embed = np.concatenate([np.zeros([1, embed_dim]), pos_embed], axis=0)
+    return pos_embed
+
+def _replace_name_resampler(s, v):
+    """Rename/split a resampler tensor (name s, value v) into the GGUF naming scheme.
+
+    Returns a dict mapping one or more output names to tensors.
+    """
+    if re.match("resampler.pos_embed", s):
+        # keep the original pos_embed and add a sincos key table (70x70 grid)
+        return {
+            s: v,
+            re.sub("pos_embed", "pos_embed_k", s): torch.from_numpy(get_2d_sincos_pos_embed(emb_dim, (70, 70))),
+        }
+    if re.match("resampler.proj", s):
+        # NOTE(review): this branch emits pos_embed_k again, duplicating the branch above
+        # when both resampler.pos_embed and resampler.proj are present — confirm intended
+        return {
+            re.sub("proj", "pos_embed_k", s): torch.from_numpy(get_2d_sincos_pos_embed(emb_dim, (70, 70))),
+            re.sub("proj", "proj.weight", s): v.transpose(-1, -2).contiguous(),
+        }
+    if re.match("resampler.attn.in_proj_.*", s):
+        # split the packed in_proj (q|k|v stacked on dim 0) into separate q/k/v tensors
+        return {
+            re.sub("attn.in_proj_", "attn.q.", s): v.chunk(3, dim=0)[0],
+            re.sub("attn.in_proj_", "attn.k.", s): v.chunk(3, dim=0)[1],
+            re.sub("attn.in_proj_", "attn.v.", s): v.chunk(3, dim=0)[2],
+        }
+    return {s: v}
+
+if has_minicpmv_projector:
+    # load the resampler weights extracted by minicpmv-surgery.py and rename them
+    projector = torch.load(args.minicpmv_projector)
+    new_state_dict = {}
+    for k, v in projector.items():
+        kvs = _replace_name_resampler(k, v)
+        for nk, nv in kvs.items():
+            new_state_dict[nk] = nv
+    projector = new_state_dict
+    ftype_cur = 0
+    for name, data in projector.items():
+        name = get_tensor_name(name)
+        data = data.squeeze().numpy()
+
+        # 2D .weight tensors follow the requested ftype; everything else stays f32
+        n_dims = len(data.shape)
+        if ftype == 1:
+            if name[-7:] == ".weight" and n_dims == 2:
+                print("  Converting to float16")
+                data = data.astype(np.float16)
+                ftype_cur = 1
+            else:
+                print("  Converting to float32")
+                data = data.astype(np.float32)
+                ftype_cur = 0
+        else:
+            if data.dtype != np.float32:
+                print("  Converting to float32")
+                data = data.astype(np.float32)
+                ftype_cur = 0
+
+        fout.add_tensor(name, data)
+        print(f"{name} - {ftype_str[ftype_cur]} - shape = {data.shape}")
+
+    print("Projector tensors added\n")
+
+def _replace_name(s, v):
+    """Prefix a vision-tower tensor name with 'vision_model.' and fix up special cases."""
+    s = "vision_model." + s
+    if re.match("vision_model.embeddings.position_embedding", s):
+        # position embedding needs a leading batch dim
+        v = v.unsqueeze(0)
+        return {s: v}
+
+    # NOTE(review): identical to the return inside the if above except for the unsqueeze;
+    # the early return is redundant but harmless
+    return {s: v}
+
+# rename vision-tower tensors and write them out
+state_dict = model.state_dict()
+new_state_dict = {}
+for k, v in state_dict.items():
+    kvs = _replace_name(k, v)
+    for nk, nv in kvs.items():
+        new_state_dict[nk] = nv
+state_dict = new_state_dict
+for name, data in state_dict.items():
+    if should_skip_tensor(name, has_text_encoder, has_vision_encoder, has_minicpmv_projector):
+        # we don't need this
+        print(f"skipping parameter: {name}")
+        continue
+
+    name = get_tensor_name(name)
+    data = data.squeeze().numpy()
+
+    n_dims = len(data.shape)
+
+    # ftype == 0 -> float32, ftype == 1 -> float16
+    ftype_cur = 0
+    if n_dims == 4:
+        # conv kernels: always f16 (GGML conv does not support f32 kernels — see warning above)
+        print(f"tensor {name} is always saved in f16")
+        data = data.astype(np.float16)
+        ftype_cur = 1
+    elif ftype == 1:
+        if name[-7:] == ".weight" and n_dims == 2:
+            print("  Converting to float16")
+            data = data.astype(np.float16)
+            ftype_cur = 1
+        else:
+            print("  Converting to float32")
+            data = data.astype(np.float32)
+            ftype_cur = 0
+    else:
+        if data.dtype != np.float32:
+            print("  Converting to float32")
+            data = data.astype(np.float32)
+            ftype_cur = 0
+
+    print(f"{name} - {ftype_str[ftype_cur]} - shape = {data.shape}")
+    fout.add_tensor(name, data)
+
+
+# finalize the GGUF file: header, key/value metadata, then tensor data
+fout.write_header_to_file()
+fout.write_kv_data_to_file()
+fout.write_tensors_to_file()
+fout.close()
+
+print("Done. Output file: " + fname_out)
diff --git a/llama.cpp/tools/mtmd/legacy-models/minicpmv-surgery.py b/llama.cpp/tools/mtmd/legacy-models/minicpmv-surgery.py
new file mode 100644
index 0000000..5352662
--- /dev/null
+++ b/llama.cpp/tools/mtmd/legacy-models/minicpmv-surgery.py
@@ -0,0 +1,47 @@
+import argparse
+import os
+import torch
+from transformers import AutoModel, AutoTokenizer
+
+ap = argparse.ArgumentParser()
+ap.add_argument("-m", "--model", help="Path to MiniCPM-V model")
+args = ap.parse_args()
+
+# find the model part that includes the multimodal projector weights
+model = AutoModel.from_pretrained(args.model, trust_remote_code=True, local_files_only=True, torch_dtype=torch.bfloat16)
+checkpoint = model.state_dict()
+
+# get a list of mm tensor names
+mm_tensors = [k for k, v in checkpoint.items() if k.startswith("resampler")]
+
+# store these tensors in a new dictionary and torch.save them
+projector = {name: checkpoint[name].float() for name in mm_tensors}
+# undo the embedding scale baked into resampler.proj ("is True" is redundant but harmless)
+if 'resampler.proj' in projector.keys() and hasattr(model.llm.config,'scale_emb') is True:
+    projector['resampler.proj'] = projector['resampler.proj'] / model.llm.config.scale_emb
+torch.save(projector, f"{args.model}/minicpmv.projector")
+
+# extract the vision tower ("vpm.*") into its own checkpoint
+clip_tensors = [k for k, v in checkpoint.items() if k.startswith("vpm")]
+if len(clip_tensors) > 0:
+    clip = {name.replace("vpm.", ""): checkpoint[name].float() for name in clip_tensors}
+    torch.save(clip, f"{args.model}/minicpmv.clip")
+
+    # added tokens should be removed to be able to convert Mistral models
+    if os.path.exists(f"{args.model}/added_tokens.json"):
+        with open(f"{args.model}/added_tokens.json", "w") as f:
+            f.write("{}\n")
+
+# re-export the LLM part with auto_map so it loads standalone
+config = model.llm.config
+config.auto_map = {
+    "AutoConfig": "configuration_minicpm.MiniCPMConfig",
+    "AutoModel": "modeling_minicpm.MiniCPMModel",
+    "AutoModelForCausalLM": "modeling_minicpm.MiniCPMForCausalLM",
+    "AutoModelForSeq2SeqLM": "modeling_minicpm.MiniCPMForCausalLM",
+    "AutoModelForSequenceClassification": "modeling_minicpm.MiniCPMForSequenceClassification"
+}
+model.llm.save_pretrained(f"{args.model}/model")
+tok = AutoTokenizer.from_pretrained(args.model, trust_remote_code=True)
+tok.save_pretrained(f"{args.model}/model")
+
+print("Done!")
+print(f"Now you can convert {args.model} to a regular LLaMA GGUF file.")
+print(f"Also, use {args.model}/minicpmv.projector to prepare a minicpmv-encoder.gguf file.")
diff --git a/llama.cpp/tools/mtmd/models/cogvlm.cpp b/llama.cpp/tools/mtmd/models/cogvlm.cpp
new file mode 100644
index 0000000..d5b739c
--- /dev/null
+++ b/llama.cpp/tools/mtmd/models/cogvlm.cpp
@@ -0,0 +1,98 @@
+#include "models.h"
+
+// Build the ggml compute graph for the CogVLM vision encoder + projector.
+// Post-norm transformer: attention/FFN run first, then layernorm, then residual add.
+ggml_cgraph * clip_graph_cogvlm::build() {
+    GGML_ASSERT(model.class_embedding != nullptr);
+    GGML_ASSERT(model.position_embeddings != nullptr);
+
+    const int n_pos = n_patches + 1; // +1 for [CLS]
+
+    // build input and concatenate class embedding
+    ggml_tensor * inp = build_inp();
+    inp = ggml_concat(ctx0, inp, model.class_embedding, 1);
+
+    inp = ggml_add(ctx0, inp, model.position_embeddings);
+    cb(inp, "inp_pos", -1);
+
+    ggml_tensor * inpL = inp;
+
+    for (int il = 0; il < n_layer; il++) {
+        auto & layer = model.layers[il];
+        ggml_tensor * cur = inpL;
+
+        // fused QKV projection; Q/K/V are then sliced out as strided views
+        cur = ggml_mul_mat(ctx0, layer.qkv_w, cur);
+
+        cur = ggml_add(ctx0, cur, layer.qkv_b);
+
+        // views into the packed QKV tensor: offsets 0, n_embd, 2*n_embd floats
+        ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, d_head, n_head, n_pos, d_head*sizeof(float),
+            cur->nb[1], 0);
+        ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, d_head, n_head, n_pos, d_head*sizeof(float),
+            cur->nb[1], n_embd * sizeof(float));
+        ggml_tensor * Vcur = ggml_view_3d(ctx0, cur, d_head, n_head, n_pos, d_head*sizeof(float),
+            cur->nb[1], 2 * n_embd * sizeof(float));
+
+        cb(Qcur, "Qcur", il);
+        cb(Kcur, "Kcur", il);
+        cb(Vcur, "Vcur", il);
+
+        cur = build_attn(layer.o_w, layer.o_b,
+            Qcur, Kcur, Vcur, nullptr, kq_scale, il);
+        cb(cur, "attn_out", il);
+
+        // post-attention norm, then residual
+        cur = build_norm(cur, layer.ln_1_w, layer.ln_1_b, NORM_TYPE_NORMAL, eps, il);
+        cb(cur, "attn_post_norm", il);
+
+        cur = ggml_add(ctx0, cur, inpL);
+        inpL = cur;
+
+        cur = build_ffn(cur,
+            layer.ff_up_w, layer.ff_up_b,
+            layer.ff_gate_w, layer.ff_gate_b,
+            layer.ff_down_w, layer.ff_down_b,
+            hparams.ffn_op, il);
+
+        cb(cur, "ffn_out", il);
+
+        // post-FFN norm, then residual
+        cur = build_norm(cur, layer.ln_2_w, layer.ln_2_b, NORM_TYPE_NORMAL, eps, il);
+        cb(cur, "ffn_post_norm", il);
+
+        cur = ggml_add(ctx0, cur, inpL);
+        cb(cur, "layer_out", il);
+        inpL = cur;
+
+    }
+
+    // remove CLS token (like build_llama4 does)
+    ggml_tensor * cur = ggml_view_2d(ctx0, inpL,
+        n_embd, n_patches,
+        ggml_row_size(inpL->type, n_embd), 0);
+
+    // Multiply with mm_model_proj
+    cur = ggml_mul_mat(ctx0, model.mm_model_proj, cur);
+
+    // Apply layernorm, weight, bias
+    cur = build_norm(cur, model.mm_post_fc_norm_w, model.mm_post_fc_norm_b, NORM_TYPE_NORMAL, 1e-5, -1);
+
+    // Apply GELU
+    cur = ggml_gelu_inplace(ctx0, cur);
+
+    // Branch 1: multiply with mm_h_to_4h_w
+    ggml_tensor * h_to_4h = ggml_mul_mat(ctx0, model.mm_h_to_4h_w, cur);
+
+    // Branch 2: multiply with mm_gate_w
+    ggml_tensor * gate = ggml_mul_mat(ctx0, model.mm_gate_w, cur);
+
+    // Apply silu
+    gate = ggml_swiglu_split(ctx0, gate, h_to_4h);
+
+    // Apply mm_4h_to_h_w
+    cur = ggml_mul_mat(ctx0, model.mm_4h_to_h_w, gate);
+
+    // Concatenate with boi and eoi
+    cur = ggml_concat(ctx0, model.mm_boi, cur, 1);
+    cur = ggml_concat(ctx0, cur, model.mm_eoi, 1);
+
+    // build the graph
+    ggml_build_forward_expand(gf, cur);
+
+    return gf;
+}
diff --git a/llama.cpp/tools/mtmd/models/conformer.cpp b/llama.cpp/tools/mtmd/models/conformer.cpp
new file mode 100644
index 0000000..9b1fab4
--- /dev/null
+++ b/llama.cpp/tools/mtmd/models/conformer.cpp
@@ -0,0 +1,216 @@
+#include "models.h"
+
+// Build the ggml compute graph for a Conformer audio encoder:
+// conv subsampling -> N conformer blocks (FFN/attn/conv/FFN, macaron style) -> audio adapter.
+ggml_cgraph * clip_graph_conformer::build() {
+    const int n_frames = img.nx;
+    const int n_pos = n_frames / 2;
+    // output length after the three stride-2 convolutions below
+    const int n_pos_embd = (((((n_frames + 1) / 2) + 1) / 2 + 1) / 2) * 2 - 1;
+    GGML_ASSERT(model.position_embeddings->ne[1] >= n_pos);
+
+    // relative positional embedding, filled in by the caller at eval time
+    ggml_tensor * pos_emb = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, 512, n_pos_embd);
+    ggml_set_name(pos_emb, "pos_emb");
+    ggml_set_input(pos_emb);
+    ggml_build_forward_expand(gf, pos_emb);
+
+    ggml_tensor * inp = build_inp_raw(1);
+
+    auto * cur = ggml_cont(ctx0, ggml_transpose(ctx0, inp));
+
+    // pre encode, conv subsampling
+    {
+        // layer.0 - conv2d
+        cur = ggml_conv_2d(ctx0, model.pre_encode_conv_X_w[0], cur, 2, 2, 1, 1, 1, 1);
+        cur = ggml_add(ctx0, cur, model.pre_encode_conv_X_b[0]);
+        cb(cur, "conformer.pre_encode.conv.{}", 0);
+
+        // layer.1 - relu
+        cur = ggml_relu_inplace(ctx0, cur);
+
+        // layer.2 conv2d dw
+        cur = ggml_conv_2d_dw_direct(ctx0, model.pre_encode_conv_X_w[2], cur, 2, 2, 1, 1, 1, 1);
+        cur = ggml_add(ctx0, cur, model.pre_encode_conv_X_b[2]);
+        cb(cur, "conformer.pre_encode.conv.{}", 2);
+
+        // layer.3 conv2d
+        cur = ggml_conv_2d_direct(ctx0, model.pre_encode_conv_X_w[3], cur, 1, 1, 0, 0, 1, 1);
+        cur = ggml_add(ctx0, cur, model.pre_encode_conv_X_b[3]);
+        cb(cur, "conformer.pre_encode.conv.{}", 3);
+
+        // layer.4 - relu
+        cur = ggml_relu_inplace(ctx0, cur);
+
+        // layer.5 conv2d dw
+        cur = ggml_conv_2d_dw_direct(ctx0, model.pre_encode_conv_X_w[5], cur, 2, 2, 1, 1, 1, 1);
+        cur = ggml_add(ctx0, cur, model.pre_encode_conv_X_b[5]);
+        cb(cur, "conformer.pre_encode.conv.{}", 5);
+
+        // layer.6 conv2d
+        cur = ggml_conv_2d_direct(ctx0, model.pre_encode_conv_X_w[6], cur, 1, 1, 0, 0, 1, 1);
+        cur = ggml_add(ctx0, cur, model.pre_encode_conv_X_b[6]);
+        cb(cur, "conformer.pre_encode.conv.{}", 6);
+
+        // layer.7 - relu
+        cur = ggml_relu_inplace(ctx0, cur);
+
+        // flatten channel and frequency axis
+        cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 0, 2, 1, 3));
+        cur = ggml_reshape_2d(ctx0, cur, cur->ne[0] * cur->ne[1], cur->ne[2]);
+
+        // calculate out
+        cur = ggml_mul_mat(ctx0, model.pre_encode_out_w, cur);
+        cur = ggml_add(ctx0, cur, model.pre_encode_out_b);
+        cb(cur, "conformer.pre_encode.out", -1);
+    }
+
+    // pos_emb
+    cb(pos_emb, "pos_emb", -1);
+
+    for (int il = 0; il < hparams.n_layer; il++) {
+        const auto & layer = model.layers[il];
+
+        auto * residual = cur;
+
+        cb(cur, "layer.in", il);
+
+        // feed_forward1 (first half of the macaron FFN pair, scaled by 0.5)
+        cur = build_norm(cur, layer.ff_norm_w, layer.ff_norm_b, NORM_TYPE_NORMAL, 1e-5, il);
+        cb(cur, "conformer.layers.{}.norm_feed_forward1", il);
+
+        cur = build_ffn(cur, layer.ff_up_w, layer.ff_up_b, nullptr, nullptr, layer.ff_down_w, layer.ff_down_b, FFN_SILU,
+                        il);
+        cb(cur, "conformer.layers.{}.feed_forward1.linear2", il);
+
+        const auto fc_factor = 0.5f;
+        residual = ggml_add(ctx0, residual, ggml_scale(ctx0, cur, fc_factor));
+
+        // self-attention (relative-position attention with pos_bias_u / pos_bias_v)
+        {
+            cur = build_norm(residual, layer.ln_1_w, layer.ln_1_b, NORM_TYPE_NORMAL, 1e-5, il);
+            cb(cur, "conformer.layers.{}.norm_self_att", il);
+
+            ggml_tensor * Qcur = ggml_mul_mat(ctx0, layer.q_w, cur);
+            Qcur = ggml_add(ctx0, Qcur, layer.q_b);
+            Qcur = ggml_reshape_3d(ctx0, Qcur, d_head, n_head, Qcur->ne[1]);
+            ggml_tensor * Q_bias_u = ggml_add(ctx0, Qcur, layer.pos_bias_u);
+            Q_bias_u = ggml_permute(ctx0, Q_bias_u, 0, 2, 1, 3);
+            ggml_tensor * Q_bias_v = ggml_add(ctx0, Qcur, layer.pos_bias_v);
+            Q_bias_v = ggml_permute(ctx0, Q_bias_v, 0, 2, 1, 3);
+
+            // TODO @ngxson : some cont can/should be removed when ggml_mul_mat support these cases
+            ggml_tensor * Kcur = ggml_mul_mat(ctx0, layer.k_w, cur);
+            Kcur = ggml_add(ctx0, Kcur, layer.k_b);
+            Kcur = ggml_reshape_3d(ctx0, Kcur, d_head, n_head, Kcur->ne[1]);
+            Kcur = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 0, 2, 1, 3));
+
+            ggml_tensor * Vcur = ggml_mul_mat(ctx0, layer.v_w, cur);
+            Vcur = ggml_add(ctx0, Vcur, layer.v_b);
+            Vcur = ggml_reshape_3d(ctx0, Vcur, d_head, n_head, Vcur->ne[1]);
+            Vcur = ggml_cont(ctx0, ggml_permute(ctx0, Vcur, 1, 2, 0, 3));
+
+            // build_attn won't fit due to matrix_ac and matrix_bd separation
+            ggml_tensor * matrix_ac = ggml_mul_mat(ctx0, Q_bias_u, Kcur);
+            matrix_ac = ggml_cont(ctx0, ggml_permute(ctx0, matrix_ac, 1, 0, 2, 3));
+            cb(matrix_ac, "conformer.layers.{}.self_attn.id3", il);
+
+            auto * p = ggml_mul_mat(ctx0, layer.linear_pos_w, pos_emb);
+            cb(p, "conformer.layers.{}.self_attn.linear_pos", il);
+            p = ggml_reshape_3d(ctx0, p, d_head, n_head, p->ne[1]);
+            p = ggml_permute(ctx0, p, 0, 2, 1, 3);
+
+            auto * matrix_bd = ggml_mul_mat(ctx0, Q_bias_v, p);
+            matrix_bd = ggml_cont(ctx0, ggml_permute(ctx0, matrix_bd, 1, 0, 2, 3));
+
+            // rel shift (pad/roll trick to realign relative positions per query)
+            {
+                const auto pos_len = matrix_bd->ne[0];
+                const auto q_len = matrix_bd->ne[1];
+                const auto h = matrix_bd->ne[2];
+                matrix_bd = ggml_pad(ctx0, matrix_bd, 1, 0, 0, 0);
+                matrix_bd = ggml_roll(ctx0, matrix_bd, 1, 0, 0, 0);
+                matrix_bd = ggml_reshape_3d(ctx0, matrix_bd, q_len, pos_len + 1, h);
+                matrix_bd = ggml_view_3d(ctx0, matrix_bd, q_len, pos_len, h, matrix_bd->nb[1],
+                                         matrix_bd->nb[2], matrix_bd->nb[0] * q_len);
+                matrix_bd = ggml_cont_3d(ctx0, matrix_bd, pos_len, q_len, h);
+            }
+
+            matrix_bd = ggml_view_3d(ctx0, matrix_bd, matrix_ac->ne[0], matrix_bd->ne[1],
+                                     matrix_bd->ne[2], matrix_bd->nb[1], matrix_bd->nb[2], 0);
+            auto * scores = ggml_add(ctx0, matrix_ac, matrix_bd);
+            scores = ggml_scale(ctx0, scores, 1.0f / std::sqrt(d_head));
+            cb(scores, "conformer.layers.{}.self_attn.id0", il);
+
+            ggml_tensor * attn = ggml_soft_max(ctx0, scores);
+            ggml_tensor * x = ggml_mul_mat(ctx0, attn, Vcur);
+            x = ggml_permute(ctx0, x, 2, 0, 1, 3);
+            x = ggml_cont_2d(ctx0, x, x->ne[0] * x->ne[1], x->ne[2]);
+
+            ggml_tensor * out = ggml_mul_mat(ctx0, layer.o_w, x);
+            out = ggml_add(ctx0, out, layer.o_b);
+            cb(out, "conformer.layers.{}.self_attn.linear_out", il);
+
+            cur = out;
+        }
+
+        residual = ggml_add(ctx0, residual, cur);
+        cur = build_norm(residual, layer.norm_conv_w, layer.norm_conv_b, NORM_TYPE_NORMAL, 1e-5, il);
+        cb(cur, "conformer.layers.{}.norm_conv", il);
+
+        // conv (pointwise conv -> GLU -> depthwise conv -> norm -> silu -> pointwise conv)
+        {
+            auto * x = cur;
+            x = ggml_mul_mat(ctx0, layer.conv_pw1_w, x);
+            x = ggml_add(ctx0, x, layer.conv_pw1_b);
+            cb(x, "conformer.layers.{}.conv.pointwise_conv1", il);
+
+            // ggml_glu doesn't support sigmoid
+            // TODO @ngxson : support this ops in ggml
+            {
+                int64_t d = x->ne[0] / 2;
+                ggml_tensor * gate = ggml_sigmoid(ctx0, ggml_view_2d(ctx0, x, d, x->ne[1], x->nb[1], d * x->nb[0]));
+                x = ggml_mul(ctx0, ggml_view_2d(ctx0, x, d, x->ne[1], x->nb[1], 0), gate);
+                x = ggml_cont(ctx0, ggml_transpose(ctx0, x));
+            }
+
+            // use ggml_ssm_conv for f32 precision
+            x = ggml_pad(ctx0, x, 4, 0, 0, 0);
+            x = ggml_roll(ctx0, x, 4, 0, 0, 0);
+            x = ggml_pad(ctx0, x, 4, 0, 0, 0);
+            x = ggml_ssm_conv(ctx0, x, layer.conv_dw_w);
+            x = ggml_add(ctx0, x, layer.conv_dw_b);
+
+            x = ggml_add(ctx0, ggml_mul(ctx0, x, layer.conv_norm_w), layer.conv_norm_b);
+            x = ggml_silu(ctx0, x);
+
+            // pointwise_conv2
+            x = ggml_mul_mat(ctx0, layer.conv_pw2_w, x);
+            x = ggml_add(ctx0, x, layer.conv_pw2_b);
+
+            cur = x;
+        }
+
+        residual = ggml_add(ctx0, residual, cur);
+
+        // feed_forward2 (second macaron FFN, also scaled by 0.5)
+        cur = build_norm(residual, layer.ff_norm_1_w, layer.ff_norm_1_b, NORM_TYPE_NORMAL, 1e-5, il);
+        cb(cur, "conformer.layers.{}.norm_feed_forward2", il);
+
+        cur = build_ffn(cur, layer.ff_up_1_w, layer.ff_up_1_b, nullptr, nullptr, layer.ff_down_1_w, layer.ff_down_1_b,
+                        FFN_SILU, il); // TODO(tarek): read activation for ffn from hparams
+        cb(cur, "conformer.layers.{}.feed_forward2.linear2", il);
+
+        residual = ggml_add(ctx0, residual, ggml_scale(ctx0, cur, fc_factor));
+        cb(residual, "conformer.layers.{}.conv.id", il);
+
+        cur = build_norm(residual, layer.ln_2_w, layer.ln_2_b, NORM_TYPE_NORMAL, 1e-5, il);
+        cb(cur, "conformer.layers.{}.norm_out", il);
+    }
+
+    // audio adapter
+    cur = build_norm(cur, model.mm_0_w, model.mm_0_b, NORM_TYPE_NORMAL, 1e-5, -1);
+    cb(cur, "audio_adapter.model.{}", 0);
+    cur = build_ffn(cur, model.mm_1_w, model.mm_1_b, nullptr, nullptr, model.mm_3_w, model.mm_3_b, FFN_GELU_ERF, -1);
+
+    cb(cur, "projected", -1);
+
+    ggml_build_forward_expand(gf, cur);
+
+    return gf;
+}
diff --git a/llama.cpp/tools/mtmd/models/glm4v.cpp b/llama.cpp/tools/mtmd/models/glm4v.cpp
new file mode 100644
index 0000000..f39b692
--- /dev/null
+++ b/llama.cpp/tools/mtmd/models/glm4v.cpp
@@ -0,0 +1,120 @@
+#include "models.h"
+
+// Build the ggml compute graph for the GLM-4V vision encoder:
+// dual patch conv -> RMS-norm ViT with 2D M-RoPE -> conv patch merger -> FC + FFN projector.
+ggml_cgraph * clip_graph_glm4v::build() {
+    GGML_ASSERT(model.patch_bias != nullptr);
+    GGML_ASSERT(model.position_embeddings != nullptr);
+    GGML_ASSERT(model.class_embedding == nullptr);
+
+    const int batch_size = 1;
+
+    norm_type norm_t = NORM_TYPE_RMS;
+
+    ggml_tensor * inp_raw = build_inp_raw();
+    ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
+
+    // 4 equal M-RoPE sections over the head dimension
+    int mrope_sections[4] = {d_head/4, d_head/4, d_head/4, d_head/4};
+    ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_patches * 4);
+    ggml_set_name(positions, "positions");
+    ggml_set_input(positions);
+
+    // image dims must be divisible by 2x2 merged patches
+    GGML_ASSERT(img.nx % (patch_size * 2) == 0);
+    GGML_ASSERT(img.ny % (patch_size * 2) == 0);
+
+    // second conv dimension
+    {
+        auto inp_1 = ggml_conv_2d(ctx0, model.patch_embeddings_1, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
+        inp = ggml_add(ctx0, inp, inp_1);
+
+        inp = ggml_permute(ctx0, inp, 1, 2, 0, 3);  // [w, h, c, b] -> [c, w, h, b]
+        inp = ggml_cont_4d(
+            ctx0, inp,
+            n_embd * 2, n_patches_x / 2, n_patches_y, batch_size);
+        inp = ggml_reshape_4d(
+            ctx0, inp,
+            n_embd * 2, n_patches_x / 2, 2, batch_size * (n_patches_y / 2));
+        inp = ggml_permute(ctx0, inp, 0, 2, 1, 3);
+        inp = ggml_cont_3d(
+            ctx0, inp,
+            n_embd, n_patches_x * n_patches_y, batch_size);
+    }
+
+    // add patch bias
+    inp = ggml_add(ctx0, inp, model.patch_bias);
+    cb(inp, "patch_bias", -1);
+
+    // pos-conv norm
+    inp = build_norm(inp, model.norm_embd_w, model.norm_embd_b, norm_t, eps, -1);
+
+    // calculate absolute position embedding and apply
+    // (same 2x2 interleave reshuffle as the patch embedding above so positions line up)
+    ggml_tensor * learned_pos_embd = resize_position_embeddings(GGML_SCALE_MODE_BICUBIC);
+    learned_pos_embd = ggml_cont_4d(
+        ctx0, learned_pos_embd,
+        n_embd * 2, n_patches_x / 2, n_patches_y, batch_size);
+    learned_pos_embd = ggml_reshape_4d(
+        ctx0, learned_pos_embd,
+        n_embd * 2, n_patches_x / 2, 2, batch_size * (n_patches_y / 2));
+    learned_pos_embd = ggml_permute(ctx0, learned_pos_embd, 0, 2, 1, 3);
+    learned_pos_embd = ggml_cont_3d(
+        ctx0, learned_pos_embd,
+        n_embd, n_patches_x * n_patches_y, batch_size);
+    cb(learned_pos_embd, "learned_pos_embd", -1);
+
+    // per-layer rotary position applied inside build_vit
+    auto add_pos = [&](ggml_tensor * cur, const clip_layer &) {
+        return ggml_rope_multi(
+            ctx0, cur, positions, nullptr,
+            d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION,
+            32768, hparams.rope_theta, 1, 0, 1, 32, 1);
+    };
+
+    ggml_tensor * cur = build_vit(
+        inp, n_patches,
+        norm_t,
+        hparams.ffn_op,
+        learned_pos_embd,
+        add_pos);
+
+    cb(cur, "vit_out", -1);
+    // cb(ggml_sum(ctx0, cur), "vit_out_sum", -1);
+
+    // GLM4V projector
+    // ref: https://github.com/huggingface/transformers/blob/40dc11cd3eb4126652aa41ef8272525affd4a636/src/transformers/models/glm4v/modeling_glm4v.py#L116-L130
+
+    // patch merger (downsample)
+    {
+        int n_merge = hparams.n_merge;
+        GGML_ASSERT(n_merge > 0);
+
+        int n_token_out = n_patches / n_merge / n_merge;
+        cur = ggml_reshape_4d(ctx0, cur, n_embd, n_merge, n_merge, n_token_out);
+        cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 2, 0, 1, 3)); // [n_merge, n_merge, n_embd, n_token_out]
+        cur = ggml_conv_2d(ctx0, model.mm_patch_merger_w, cur, n_merge, n_merge, 0, 0, 1, 1);
+        cur = ggml_reshape_2d(ctx0, cur, cur->ne[2], n_token_out); // [n_embd_out, n_token_out]
+
+        cur = ggml_add(ctx0, cur, model.mm_patch_merger_b);
+    }
+
+    // FC projector
+    {
+        cur = ggml_mul_mat(ctx0, model.projection, cur);
+        // default LayerNorm (post_projection_norm)
+        cur = build_norm(cur, model.mm_post_norm_w, model.mm_post_norm_b, NORM_TYPE_NORMAL, 1e-5, -1);
+        cur = ggml_gelu_erf(ctx0, cur);
+        cb(cur, "after_fc_proj", -1);
+    }
+
+    // FFN projector
+    {
+        cur = build_ffn(cur,
+            model.mm_ffn_up_w, model.mm_ffn_up_b,
+            model.mm_ffn_gate_w, model.mm_ffn_gate_b,
+            model.mm_ffn_down_w, model.mm_ffn_down_b,
+            hparams.ffn_op, -1);
+        cb(cur, "after_ffn_proj", -1);
+        // cb(ggml_sum(ctx0, cur), "merged_sum", -1);
+    }
+
+    // build the graph
+    ggml_build_forward_expand(gf, cur);
+
+    return gf;
+}
diff --git a/llama.cpp/tools/mtmd/models/internvl.cpp b/llama.cpp/tools/mtmd/models/internvl.cpp
new file mode 100644
index 0000000..9aded3b
--- /dev/null
+++ b/llama.cpp/tools/mtmd/models/internvl.cpp
@@ -0,0 +1,69 @@
+#include "models.h"
+
+// Build the ggml compute graph for the InternVL vision encoder:
+// ViT with [CLS] token -> pixel shuffle downsample -> LayerNorm + GELU MLP projector.
+ggml_cgraph * clip_graph_internvl::build() {
+    GGML_ASSERT(model.class_embedding != nullptr);
+    GGML_ASSERT(model.position_embeddings != nullptr);
+
+    const int n_pos = n_patches + 1;
+    ggml_tensor * inp = build_inp();
+
+    // add CLS token
+    inp = ggml_concat(ctx0, inp, model.class_embedding, 1);
+
+    // The larger models use a different ViT, which uses RMS norm instead of layer norm
+    // ref: https://github.com/ggml-org/llama.cpp/pull/13443#issuecomment-2869786188
+    norm_type norm_t = (hparams.n_embd == 3200 && hparams.n_layer == 45)
+        ? NORM_TYPE_RMS // 6B ViT (Used by InternVL 2.5/3 - 26B, 38B, 78B)
+        : NORM_TYPE_NORMAL; // 300M ViT (Used by all smaller InternVL models)
+
+    ggml_tensor * cur = build_vit(
+        inp, n_pos,
+        norm_t,
+        hparams.ffn_op,
+        model.position_embeddings,
+        nullptr);
+
+    // remove CLS token
+    cur = ggml_view_2d(ctx0, cur,
+        n_embd, n_patches,
+        ggml_row_size(cur->type, n_embd), 0);
+
+    // pixel shuffle: fold a scale_factor x scale_factor patch neighborhood into the channel dim
+    {
+        const int scale_factor = model.hparams.n_merge;
+        const int bsz = 1; // batch size, always 1 for now since we don't support batching
+        const int height = n_patches_y;
+        const int width = n_patches_x;
+        GGML_ASSERT(scale_factor > 0);
+        cur = ggml_reshape_4d(ctx0, cur, n_embd * scale_factor, height / scale_factor, width, bsz);
+        cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);
+        cur = ggml_cont_4d(ctx0, cur,
+            n_embd * scale_factor * scale_factor,
+            height / scale_factor,
+            width / scale_factor,
+            bsz);
+        cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);
+        // flatten to 2D
+        cur = ggml_cont_2d(ctx0, cur,
+            n_embd * scale_factor * scale_factor,
+            cur->ne[1] * cur->ne[2]);
+    }
+
+    // projector (always using GELU activation)
+    {
+        // projector LayerNorm uses pytorch's default eps = 1e-5
+        // ref: https://huggingface.co/OpenGVLab/InternVL3-8B-Instruct/blob/a34d3e4e129a5856abfd6aa6de79776484caa14e/modeling_internvl_chat.py#L79
+        cur = build_norm(cur, model.mm_0_w, model.mm_0_b, NORM_TYPE_NORMAL, 1e-5, -1);
+        cur = build_ffn(cur,
+            model.mm_1_w, model.mm_1_b,
+            nullptr, nullptr,
+            model.mm_3_w, model.mm_3_b,
+            FFN_GELU,
+            -1);
+    }
+
+    // build the graph
+    ggml_build_forward_expand(gf, cur);
+
+    return gf;
+}
diff --git a/llama.cpp/tools/mtmd/models/kimik25.cpp b/llama.cpp/tools/mtmd/models/kimik25.cpp
new file mode 100644
index 0000000..cf9f27f
--- /dev/null
+++ b/llama.cpp/tools/mtmd/models/kimik25.cpp
@@ -0,0 +1,101 @@
+#include "models.h"
+#include <cstring>
+#include <cmath>
+
+// note: this is similar to clip_graph::resize_position_embeddings, major difference is having
+// the w/h in ne[1] and ne[2] instead of assuming with sqrt. Could try storing the tensor in 2D instead
+// with a w*h? Also the permute is a bit different at (2, 1, 0, 3) instead of (2, 0, 1, 3).
+//
+// Resize the learned position embeddings (stored as [C, W, H]) to the current image's patch
+// grid with ggml_interpolate, then flatten to [C, W*H] so they can be added to the patch
+// embeddings. interpolation_mode is a GGML_SCALE_MODE_* value.
+ggml_tensor * clip_graph_kimik25::resize_position_embeddings_3d(uint32_t interpolation_mode) {
+    ggml_tensor * pos_embd = model.position_embeddings;
+    // target patch grid derived from the input image dimensions
+    const int height = img.ny / patch_size;
+    const int width = img.nx / patch_size;
+    const uint32_t mode = interpolation_mode;
+
+    GGML_ASSERT(pos_embd);
+
+    const int64_t stored_c = pos_embd->ne[0]; // C = 1152
+    const int64_t orig_w = pos_embd->ne[1]; // W = 64
+    const int64_t orig_h = pos_embd->ne[2]; // H = 64
+
+    GGML_ASSERT(stored_c == n_embd);
+
+    if (height == (int)orig_h && width == (int)orig_w) {
+        // No interpolation needed, just flatten to [C, H*W]
+        return ggml_cont_2d(ctx0, pos_embd, n_embd, width * height);
+    }
+
+    // ggml_interpolate scales the leading dims, so move C out of the way first:
+    // [C, W, H] -permute-> spatial-first layout, interpolate to the target grid,
+    // permute back, then flatten to [C, W*H]
+    pos_embd = ggml_permute(ctx0, pos_embd, 2, 1, 0, 3);
+    pos_embd = ggml_interpolate(ctx0, pos_embd, height, width, n_embd, 1, mode);
+    pos_embd = ggml_permute(ctx0, pos_embd, 2, 1, 0, 3);
+    pos_embd = ggml_cont_2d(ctx0, pos_embd, n_embd, width * height);
+    return pos_embd;
+}
+
+// Build the compute graph for the Kimi-K2.5 vision encoder:
+// ViT with interpolated learned position embeddings plus 2D RoPE, followed by a
+// patch merger and a LayerNorm + two-layer GELU MLP projector.
+ggml_cgraph * clip_graph_kimik25::build() {
+    // per-patch row/column indices; filled in at eval time, consumed by build_rope_2d
+    ggml_tensor * pos_h = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_patches);
+    ggml_set_name(pos_h, "pos_h");
+    ggml_set_input(pos_h);
+
+    ggml_tensor * pos_w = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_patches);
+    ggml_set_name(pos_w, "pos_w");
+    ggml_set_input(pos_w);
+
+    ggml_tensor * learned_pos_embd = resize_position_embeddings_3d(GGML_SCALE_MODE_BICUBIC);
+
+    // Kimi-K2.5 uses interleaved 2D RoPE pattern natively, but
+    // Q / K are permuted during conversion to use split format.
+    auto add_pos = [&](ggml_tensor * cur, const clip_layer &) {
+        cur = build_rope_2d(ctx0, cur, pos_w, pos_h, hparams.rope_theta, false);
+        return cur;
+    };
+
+    ggml_tensor * inp = build_inp();
+
+    // NOTE(review): passing learned_pos_embd into build_vit did not produce the expected
+    // ggml_add in the graph for this model, so the addition is done manually here instead
+    // (build_vit is then given nullptr below) — root cause TODO.
+    inp = ggml_add(ctx0, inp, learned_pos_embd);
+
+    ggml_tensor * cur = build_vit(
+        inp, n_patches,
+        NORM_TYPE_NORMAL,
+        hparams.ffn_op,
+        nullptr,
+        add_pos);
+
+    cb(cur, "vit_out", -1);
+
+    {
+        // patch_merger: group n_merge x n_merge neighboring patches into one row
+        const int scale_factor = model.hparams.n_merge;
+        cur = build_patch_merge_permute(cur, scale_factor);
+
+        // projection norm: the LayerNorm weights are n_embd wide, so temporarily view
+        // the merged rows as [n_embd, n_merged * s^2], normalize, then view back to
+        // the merged shape [proj_inp_dim, n_merged]
+        int proj_inp_dim = cur->ne[0];
+        int n_merged_patches = cur->ne[1];
+        cur = ggml_view_2d(ctx0, cur,
+            n_embd, n_merged_patches * scale_factor * scale_factor,
+            ggml_row_size(cur->type, n_embd), 0);
+        cur = ggml_norm(ctx0, cur, hparams.eps);
+        cur = ggml_mul(ctx0, cur, model.mm_input_norm_w);
+        cur = ggml_add(ctx0, cur, model.mm_input_norm_b);
+        cur = ggml_view_2d(ctx0, cur,
+            proj_inp_dim, n_merged_patches,
+            ggml_row_size(cur->type, proj_inp_dim), 0);
+        cb(cur, "proj_inp_normed", -1);
+
+        // projection mlp: linear -> GELU -> linear
+        cur = build_ffn(cur,
+            model.mm_1_w, model.mm_1_b,
+            nullptr, nullptr,
+            model.mm_2_w, model.mm_2_b,
+            FFN_GELU,
+            -1);
+
+        cb(cur, "proj_out", -1);
+    }
+
+    // build the graph
+    ggml_build_forward_expand(gf, cur);
+
+    return gf;
+}
diff --git a/llama.cpp/tools/mtmd/models/kimivl.cpp b/llama.cpp/tools/mtmd/models/kimivl.cpp
new file mode 100644
index 0000000..0a06f50
--- /dev/null
+++ b/llama.cpp/tools/mtmd/models/kimivl.cpp
@@ -0,0 +1,63 @@
+#include "models.h"
+
+// Build the compute graph for Kimi-VL: a ViT with learned position embeddings plus
+// 2D RoPE, followed by a patch merger and a LayerNorm + two-layer GELU MLP projector.
+ggml_cgraph * clip_graph_kimivl::build() {
+    // 2D input positions (per-patch row/column indices, filled in at eval time)
+    ggml_tensor * pos_h = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_patches);
+    ggml_set_name(pos_h, "pos_h");
+    ggml_set_input(pos_h);
+
+    ggml_tensor * pos_w = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_patches);
+    ggml_set_name(pos_w, "pos_w");
+    ggml_set_input(pos_w);
+
+    // learned position embeddings resized to the current patch grid
+    ggml_tensor * learned_pos_embd = resize_position_embeddings();
+
+    // build ViT with 2D position embeddings
+    auto add_pos = [&](ggml_tensor * cur, const clip_layer &) {
+        // first half is X axis and second half is Y axis
+        return build_rope_2d(ctx0, cur, pos_w, pos_h, hparams.rope_theta, false);
+    };
+
+    ggml_tensor * inp = build_inp();
+    ggml_tensor * cur = build_vit(
+        inp, n_patches,
+        NORM_TYPE_NORMAL,
+        hparams.ffn_op,
+        learned_pos_embd,
+        add_pos);
+
+    cb(cur, "vit_out", -1);
+
+    {
+        // patch_merger: group n_merge x n_merge neighboring patches into one row
+        const int scale_factor = model.hparams.n_merge;
+        cur = build_patch_merge_permute(cur, scale_factor);
+
+        // projection norm: the norm weights are n_embd wide, so view the merged rows as
+        // [n_embd, n_rows * s^2], apply LayerNorm, then view back to the merged shape
+        int proj_inp_dim = cur->ne[0];
+        cur = ggml_view_2d(ctx0, cur,
+            n_embd, cur->ne[1] * scale_factor * scale_factor,
+            ggml_row_size(cur->type, n_embd), 0);
+        cur = ggml_norm(ctx0, cur, 1e-5); // default nn.LayerNorm eps
+        cur = ggml_mul(ctx0, cur, model.mm_input_norm_w);
+        cur = ggml_add(ctx0, cur, model.mm_input_norm_b);
+        cur = ggml_view_2d(ctx0, cur,
+            proj_inp_dim, cur->ne[1] / scale_factor / scale_factor,
+            ggml_row_size(cur->type, proj_inp_dim), 0);
+        cb(cur, "proj_inp_normed", -1);
+
+        // projection mlp: linear -> GELU -> linear
+        cur = build_ffn(cur,
+            model.mm_1_w, model.mm_1_b,
+            nullptr, nullptr,
+            model.mm_2_w, model.mm_2_b,
+            FFN_GELU,
+            -1);
+        cb(cur, "proj_out", -1);
+    }
+
+    // build the graph
+    ggml_build_forward_expand(gf, cur);
+
+    return gf;
+}
diff --git a/llama.cpp/tools/mtmd/models/llama4.cpp b/llama.cpp/tools/mtmd/models/llama4.cpp
new file mode 100644
index 0000000..30d1df5
--- /dev/null
+++ b/llama.cpp/tools/mtmd/models/llama4.cpp
@@ -0,0 +1,96 @@
+#include "models.h"
+
+// Build the compute graph for the Llama 4 vision encoder: unfold-convolution patch
+// embedding (im2col + matmul), ViT with 2D RoPE and a [CLS] token, pixel shuffle,
+// then a bias-free GELU MLP adapter and a final linear projection.
+ggml_cgraph * clip_graph_llama4::build() {
+    GGML_ASSERT(model.class_embedding != nullptr);
+    GGML_ASSERT(model.position_embeddings != nullptr);
+
+    const int n_pos = n_patches + 1; // +1 for [CLS]
+
+    // 2D input positions (filled in at eval time, consumed by build_rope_2d)
+    ggml_tensor * pos_h = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos);
+    ggml_set_name(pos_h, "pos_h");
+    ggml_set_input(pos_h);
+
+    ggml_tensor * pos_w = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos);
+    ggml_set_name(pos_w, "pos_w");
+    ggml_set_input(pos_w);
+
+    ggml_tensor * inp = build_inp_raw();
+
+    // Llama4UnfoldConvolution: the patch convolution is expressed as im2col + matmul
+    // (the reshaped kernel is only used by im2col to derive the patch geometry)
+    {
+        ggml_tensor * kernel = ggml_reshape_4d(ctx0, model.patch_embeddings_0,
+            patch_size, patch_size, 3, n_embd);
+        inp = ggml_im2col(ctx0, kernel, inp, patch_size, patch_size, 0, 0, 1, 1, true, inp->type);
+        inp = ggml_mul_mat(ctx0, model.patch_embeddings_0, inp);
+        inp = ggml_reshape_2d(ctx0, inp, n_embd, n_patches);
+        cb(inp, "patch_conv", -1);
+    }
+
+    // add CLS token (appended after the patch tokens)
+    inp = ggml_concat(ctx0, inp, model.class_embedding, 1);
+
+    // build ViT with 2D position embeddings
+    auto add_pos = [&](ggml_tensor * cur, const clip_layer &) {
+        // first half is X axis and second half is Y axis
+        // ref: https://github.com/huggingface/transformers/blob/40a493c7ed4f19f08eadb0639cf26d49bfa5e180/src/transformers/models/llama4/modeling_llama4.py#L1312
+        // ref: https://github.com/Blaizzy/mlx-vlm/blob/a57156aa87b33cca6e5ee6cfc14dd4ef8f611be6/mlx_vlm/models/llama4/vision.py#L441
+        return build_rope_2d(ctx0, cur, pos_w, pos_h, hparams.rope_theta, false);
+    };
+    ggml_tensor * cur = build_vit(
+        inp, n_pos,
+        NORM_TYPE_NORMAL,
+        hparams.ffn_op,
+        model.position_embeddings,
+        add_pos);
+
+    // remove CLS token: keep only the first n_patches rows
+    cur = ggml_view_2d(ctx0, cur,
+        n_embd, n_patches,
+        ggml_row_size(cur->type, n_embd), 0);
+
+    // pixel shuffle
+    // based on Llama4VisionPixelShuffleMLP
+    // https://github.com/huggingface/transformers/blob/2932f318a20d9e54cc7aea052e040164d85de7d6/src/transformers/models/llama4/modeling_llama4.py#L1151
+    {
+        const int scale_factor = model.hparams.n_merge;
+        const int bsz = 1; // batch size, always 1 for now since we don't support batching
+        GGML_ASSERT(scale_factor > 0);
+        GGML_ASSERT(n_patches_x == n_patches_y); // llama4 only supports square images
+        cur = ggml_reshape_4d(ctx0, cur,
+            n_embd * scale_factor,
+            n_patches_x / scale_factor,
+            n_patches_y,
+            bsz);
+        cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);
+        cur = ggml_cont_4d(ctx0, cur,
+            n_embd * scale_factor * scale_factor,
+            n_patches_x / scale_factor,
+            n_patches_y / scale_factor,
+            bsz);
+        // intentionally no second permute here (unlike other pixel-shuffle graphs in this tree)
+        //cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);
+        // flatten to 2D
+        cur = ggml_cont_2d(ctx0, cur,
+            n_embd * scale_factor * scale_factor,
+            n_patches / scale_factor / scale_factor);
+        cb(cur, "pixel_shuffle", -1);
+    }
+
+    // based on Llama4VisionMLP2 (always uses GELU activation, no bias)
+    {
+        cur = ggml_mul_mat(ctx0, model.mm_model_mlp_1_w, cur);
+        cur = ggml_gelu(ctx0, cur);
+        cur = ggml_mul_mat(ctx0, model.mm_model_mlp_2_w, cur);
+        cur = ggml_gelu(ctx0, cur);
+        cb(cur, "adapter_mlp", -1);
+    }
+
+    // Llama4MultiModalProjector: final linear map into the LLM embedding space
+    cur = ggml_mul_mat(ctx0, model.mm_model_proj, cur);
+    cb(cur, "projected", -1);
+
+    // build the graph
+    ggml_build_forward_expand(gf, cur);
+
+    return gf;
+}
diff --git a/llama.cpp/tools/mtmd/models/llava.cpp b/llama.cpp/tools/mtmd/models/llava.cpp
new file mode 100644
index 0000000..0bfb5f0
--- /dev/null
+++ b/llama.cpp/tools/mtmd/models/llava.cpp
@@ -0,0 +1,374 @@
+#include "models.h"
+
+// this graph is used by llava, granite and glm
+// due to having embedding_stack (used by granite), we cannot reuse build_vit
+//
+// The encoder loop is an open-coded ViT (pre-LN, self-attention, FFN, residuals) that can
+// capture intermediate layer outputs into embedding_stack; the tail then dispatches on
+// proj_type to one of several projectors (MLP, MLP_NORM, LDP, LDPV2, GLM_EDGE).
+ggml_cgraph * clip_graph_llava::build() {
+    const int batch_size = 1;
+    const int n_pos = n_patches + (model.class_embedding ? 1 : 0);
+
+    GGML_ASSERT(n_patches_x == n_patches_y && "only square images supported");
+
+    // Calculate the deepest feature layer based on hparams and projector type
+    int max_feature_layer = n_layer;
+    {
+        // Get the index of the second to last layer; this is the default for models that have a llava projector
+        int il_last = hparams.n_layer - 1;
+        int deepest_feature_layer = -1;
+
+        if (proj_type == PROJECTOR_TYPE_MINICPMV || proj_type == PROJECTOR_TYPE_GLM_EDGE) {
+            il_last += 1;
+        }
+
+        // If we set explicit vision feature layers, only go up to the deepest one
+        // NOTE: only used by granite-vision models for now
+        for (const auto & feature_layer : hparams.vision_feature_layer) {
+            if (feature_layer > deepest_feature_layer) {
+                deepest_feature_layer = feature_layer;
+            }
+        }
+        max_feature_layer = deepest_feature_layer < 0 ? il_last : deepest_feature_layer;
+    }
+
+    ggml_tensor * inp = build_inp();
+
+    // concat class_embeddings and patch_embeddings
+    if (model.class_embedding) {
+        inp = ggml_concat(ctx0, inp, model.class_embedding, 1);
+    }
+
+    // learned absolute position embeddings, selected by runtime-provided indices
+    ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos);
+    ggml_set_name(positions, "positions");
+    ggml_set_input(positions);
+
+    inp = ggml_add(ctx0, inp, ggml_get_rows(ctx0, model.position_embeddings, positions));
+
+    ggml_tensor * inpL = inp;
+
+    // pre-layernorm
+    if (model.pre_ln_w) {
+        inpL = build_norm(inpL, model.pre_ln_w, model.pre_ln_b, NORM_TYPE_NORMAL, eps, -1);
+        cb(inpL, "pre_ln", -1);
+    }
+
+    std::vector<ggml_tensor *> embedding_stack;
+    const auto & vision_feature_layer = hparams.vision_feature_layer;
+
+    // loop over layers
+    for (int il = 0; il < max_feature_layer; il++) {
+        auto & layer = model.layers[il];
+        ggml_tensor * cur = inpL; // inpL = residual, cur = hidden_states
+
+        // If this is an embedding feature layer, save the output.
+        // NOTE: 0 index here refers to the input to the encoder.
+        if (vision_feature_layer.find(il) != vision_feature_layer.end()) {
+            embedding_stack.push_back(cur);
+        }
+
+        // layernorm1
+        cur = build_norm(cur, layer.ln_1_w, layer.ln_1_b, NORM_TYPE_NORMAL, eps, il);
+        cb(cur, "layer_inp_normed", il);
+
+        // self-attention (biases are optional per projection)
+        {
+            ggml_tensor * Qcur = ggml_mul_mat(ctx0, layer.q_w, cur);
+            if (layer.q_b) {
+                Qcur = ggml_add(ctx0, Qcur, layer.q_b);
+            }
+
+            ggml_tensor * Kcur = ggml_mul_mat(ctx0, layer.k_w, cur);
+            if (layer.k_b) {
+                Kcur = ggml_add(ctx0, Kcur, layer.k_b);
+            }
+
+            ggml_tensor * Vcur = ggml_mul_mat(ctx0, layer.v_w, cur);
+            if (layer.v_b) {
+                Vcur = ggml_add(ctx0, Vcur, layer.v_b);
+            }
+
+            Qcur = ggml_reshape_3d(ctx0, Qcur, d_head, n_head, n_pos);
+            Kcur = ggml_reshape_3d(ctx0, Kcur, d_head, n_head, n_pos);
+            Vcur = ggml_reshape_3d(ctx0, Vcur, d_head, n_head, n_pos);
+
+            cb(Qcur, "Qcur", il);
+            cb(Kcur, "Kcur", il);
+            cb(Vcur, "Vcur", il);
+
+            cur = build_attn(layer.o_w, layer.o_b,
+                Qcur, Kcur, Vcur, nullptr, kq_scale, il);
+            cb(cur, "attn_out", il);
+        }
+
+        // re-add the layer input, e.g., residual
+        cur = ggml_add(ctx0, cur, inpL);
+
+        inpL = cur; // inpL = residual, cur = hidden_states
+
+        cb(cur, "ffn_inp", il);
+
+        // layernorm2
+        cur = build_norm(cur, layer.ln_2_w, layer.ln_2_b, NORM_TYPE_NORMAL, eps, il);
+        cb(cur, "ffn_inp_normed", il);
+
+        // ffn
+        cur = build_ffn(cur,
+            layer.ff_up_w, layer.ff_up_b,
+            layer.ff_gate_w, layer.ff_gate_b,
+            layer.ff_down_w, layer.ff_down_b,
+            hparams.ffn_op, il);
+
+        cb(cur, "ffn_out", il);
+
+        // residual 2
+        cur = ggml_add(ctx0, inpL, cur);
+        cb(cur, "layer_out", il);
+
+        inpL = cur;
+    }
+
+    // post-layernorm
+    if (model.post_ln_w) {
+        inpL = build_norm(inpL, model.post_ln_w, model.post_ln_b, NORM_TYPE_NORMAL, eps, -1);
+    }
+
+    ggml_tensor * embeddings = inpL;
+
+    // process vision feature layers (used by granite)
+    {
+        // final layer is a vision feature layer
+        if (vision_feature_layer.find(max_feature_layer) != vision_feature_layer.end()) {
+            embedding_stack.push_back(inpL);
+        }
+
+        // If feature layers are explicitly set, stack them (if we have multiple)
+        if (!embedding_stack.empty()) {
+            embeddings = embedding_stack[0];
+            for (size_t i = 1; i < embedding_stack.size(); i++) {
+                embeddings = ggml_concat(ctx0, embeddings, embedding_stack[i], 0);
+            }
+        }
+    }
+
+    // llava projector (also used by granite)
+    if (hparams.has_llava_projector) {
+        embeddings = ggml_reshape_2d(ctx0, embeddings, embeddings->ne[0], embeddings->ne[1]);
+
+        // indices of the patch rows to keep (drops [CLS] when present)
+        ggml_tensor * patches = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_patches);
+        ggml_set_name(patches, "patches");
+        ggml_set_input(patches);
+
+        // shape [1, 576, 1024]
+        // ne is whcn, ne = [1024, 576, 1, 1]
+        embeddings = ggml_get_rows(ctx0, embeddings, patches);
+
+        // print_tensor_info(embeddings, "embeddings");
+
+        // llava projector
+        if (proj_type == PROJECTOR_TYPE_MLP) {
+            // linear -> GELU -> (optional) linear
+            embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings);
+            embeddings = ggml_add(ctx0, embeddings, model.mm_0_b);
+
+            embeddings = ggml_gelu(ctx0, embeddings);
+            if (model.mm_2_w) {
+                embeddings = ggml_mul_mat(ctx0, model.mm_2_w, embeddings);
+                embeddings = ggml_add(ctx0, embeddings, model.mm_2_b);
+            }
+        }
+        else if (proj_type == PROJECTOR_TYPE_MLP_NORM) {
+            embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings);
+            embeddings = ggml_add(ctx0, embeddings, model.mm_0_b);
+            // ggml_tensor_printf(embeddings, "mm_0_w",0,true,false);
+            // First LayerNorm
+            embeddings = ggml_norm(ctx0, embeddings, eps);
+            embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.mm_1_w),
+                model.mm_1_b);
+
+            // GELU activation
+            embeddings = ggml_gelu(ctx0, embeddings);
+
+            // Second linear layer
+            embeddings = ggml_mul_mat(ctx0, model.mm_3_w, embeddings);
+            embeddings = ggml_add(ctx0, embeddings, model.mm_3_b);
+
+            // Second LayerNorm
+            embeddings = ggml_norm(ctx0, embeddings, eps);
+            embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.mm_4_w),
+                model.mm_4_b);
+        }
+        else if (proj_type == PROJECTOR_TYPE_LDP) {
+            // MobileVLM projector
+            // NOTE(review): n_patch is hardcoded to a 24x24 grid — confirm for non-default image sizes
+            int n_patch = 24;
+            ggml_tensor * mlp_1 = ggml_mul_mat(ctx0, model.mm_model_mlp_1_w, embeddings);
+            mlp_1 = ggml_add(ctx0, mlp_1, model.mm_model_mlp_1_b);
+            mlp_1 = ggml_gelu(ctx0, mlp_1);
+            ggml_tensor * mlp_3 = ggml_mul_mat(ctx0, model.mm_model_mlp_3_w, mlp_1);
+            mlp_3 = ggml_add(ctx0, mlp_3, model.mm_model_mlp_3_b);
+            // mlp_3 shape = [1, 576, 2048], ne = [2048, 576, 1, 1]
+
+            // block 1
+            ggml_tensor * block_1 = nullptr;
+            {
+                // transpose from [1, 576, 2048] --> [1, 2048, 576] --> [1, 2048, 24, 24]
+                mlp_3 = ggml_permute(ctx0, mlp_3, 1, 0, 2, 3);
+                mlp_3 = ggml_cont_4d(ctx0, mlp_3, n_patch, n_patch, mlp_3->ne[1], mlp_3->ne[2]);
+                // stride = 1, padding = 1, bias is nullptr
+                block_1 = ggml_conv_2d_dw(ctx0, model.mm_model_block_1_block_0_0_w, mlp_3, 1, 1, 1, 1, 1, 1);
+
+                // layer norm
+                // // block_1 shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1]
+                block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 2, 0, 3));
+                // block_1 shape = [1, 24, 24, 2048], ne = [2048, 24, 24, 1]
+                block_1 = ggml_norm(ctx0, block_1, eps);
+                block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_1_block_0_1_w), model.mm_model_block_1_block_0_1_b);
+                block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 2, 0, 1, 3));
+
+                // block_1 shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1]
+                // hardswish
+                ggml_tensor * block_1_hw = ggml_hardswish(ctx0, block_1);
+
+                block_1 = ggml_pool_2d(ctx0, block_1_hw, GGML_OP_POOL_AVG, block_1_hw->ne[0], block_1_hw->ne[1], block_1_hw->ne[0], block_1_hw->ne[1], 0, 0);
+                // block_1 shape = [1, 2048, 1, 1], ne = [1, 1, 2048, 1]
+                // pointwise conv
+                block_1 = ggml_reshape_2d(ctx0, block_1, block_1->ne[0]*block_1->ne[1]*block_1->ne[2], block_1->ne[3]);
+                block_1 = ggml_mul_mat(ctx0, model.mm_model_block_1_block_1_fc1_w, block_1);
+                block_1 = ggml_add(ctx0, block_1, model.mm_model_block_1_block_1_fc1_b);
+                block_1 = ggml_relu(ctx0, block_1);
+                block_1 = ggml_mul_mat(ctx0, model.mm_model_block_1_block_1_fc2_w, block_1);
+                block_1 = ggml_add(ctx0, block_1, model.mm_model_block_1_block_1_fc2_b);
+                block_1 = ggml_hardsigmoid(ctx0, block_1);
+                // block_1_hw shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1], block_1 shape = [1, 2048], ne = [2048, 1, 1, 1]
+                block_1 = ggml_reshape_4d(ctx0, block_1, 1, 1, block_1->ne[0], block_1->ne[1]);
+                block_1 = ggml_mul(ctx0, block_1_hw, block_1);
+
+                int w = block_1->ne[0], h = block_1->ne[1];
+                block_1 = ggml_reshape_3d(ctx0, block_1, w*h, block_1->ne[2], block_1->ne[3]);
+                block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 0, 2, 3));
+
+                // block_1 shape = [1, 24*24, 2048], ne = [24*24, 2048, 1]
+                block_1 = ggml_mul_mat(ctx0, model.mm_model_block_1_block_2_0_w, block_1);
+                block_1 = ggml_reshape_4d(ctx0, block_1, block_1->ne[0], w, h, block_1->ne[3]);
+
+                // block_1 shape = [1, 24, 24, 2048], ne = [2048, 24, 24, 1]
+                block_1 = ggml_norm(ctx0, block_1, eps);
+                block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_1_block_2_1_w), model.mm_model_block_1_block_2_1_b);
+                block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 2, 0, 1, 3));
+                // block1 shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1]
+                // residual
+                block_1 = ggml_add(ctx0, mlp_3, block_1);
+            }
+
+            // block_2
+            {
+                // stride = 2
+                block_1 = ggml_conv_2d_dw(ctx0, model.mm_model_block_2_block_0_0_w, block_1, 2, 2, 1, 1, 1, 1);
+
+                // block_1 shape = [1, 2048, 12, 12], ne = [12, 12, 2048, 1]
+                // layer norm
+                block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 2, 0, 3));
+                // block_1 shape = [1, 12, 12, 2048], ne = [2048, 12, 12, 1]
+                block_1 = ggml_norm(ctx0, block_1, eps);
+                block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_2_block_0_1_w), model.mm_model_block_2_block_0_1_b);
+                block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 2, 0, 1, 3));
+                // block_1 shape = [1, 2048, 12, 12], ne = [12, 12, 2048, 1]
+                // hardswish
+                ggml_tensor * block_1_hw = ggml_hardswish(ctx0, block_1);
+
+                // not sure the parameters is right for globalAvgPooling
+                block_1 = ggml_pool_2d(ctx0, block_1_hw, GGML_OP_POOL_AVG, block_1_hw->ne[0], block_1_hw->ne[1], block_1_hw->ne[0], block_1_hw->ne[1], 0, 0);
+                // block_1 shape = [1, 2048, 1, 1], ne = [1, 1, 2048, 1]
+                // pointwise conv
+                block_1 = ggml_reshape_2d(ctx0, block_1, block_1->ne[0]*block_1->ne[1]*block_1->ne[2], block_1->ne[3]);
+                block_1 = ggml_mul_mat(ctx0, model.mm_model_block_2_block_1_fc1_w, block_1);
+                block_1 = ggml_add(ctx0, block_1, model.mm_model_block_2_block_1_fc1_b);
+                block_1 = ggml_relu(ctx0, block_1);
+                block_1 = ggml_mul_mat(ctx0, model.mm_model_block_2_block_1_fc2_w, block_1);
+                block_1 = ggml_add(ctx0, block_1, model.mm_model_block_2_block_1_fc2_b);
+                block_1 = ggml_hardsigmoid(ctx0, block_1);
+
+                // block_1_hw shape = [1, 2048, 12, 12], ne = [12, 12, 2048, 1], block_1 shape = [1, 2048, 1, 1], ne = [1, 1, 2048, 1]
+                block_1 = ggml_reshape_4d(ctx0, block_1, 1, 1, block_1->ne[0], block_1->ne[1]);
+                block_1 = ggml_mul(ctx0, block_1_hw, block_1);
+
+                int w = block_1->ne[0], h = block_1->ne[1];
+                block_1 = ggml_reshape_3d(ctx0, block_1, w*h, block_1->ne[2], block_1->ne[3]);
+                block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 0, 2, 3));
+                // block_1 shape = [1, 24*24, 2048], ne = [24*24, 2048, 1]
+                block_1 = ggml_mul_mat(ctx0, model.mm_model_block_2_block_2_0_w, block_1);
+                block_1 = ggml_reshape_4d(ctx0, block_1, block_1->ne[0], w, h, block_1->ne[3]);
+
+
+                // block_1 shape = [1, 12, 12, 2048], ne = [2048, 12, 12, 1]
+                block_1 = ggml_norm(ctx0, block_1, eps);
+                block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_2_block_2_1_w), model.mm_model_block_2_block_2_1_b);
+                block_1 = ggml_reshape_3d(ctx0, block_1, block_1->ne[0], block_1->ne[1] * block_1->ne[2], block_1->ne[3]);
+                // block_1 shape = [1, 144, 2048], ne = [2048, 144, 1]
+            }
+            embeddings = block_1;
+        }
+        else if (proj_type == PROJECTOR_TYPE_LDPV2)
+        {
+            // NOTE(review): n_patch is hardcoded to a 24x24 grid — confirm for non-default image sizes
+            int n_patch = 24;
+            ggml_tensor * mlp_0 = ggml_mul_mat(ctx0, model.mm_model_mlp_0_w, embeddings);
+            mlp_0 = ggml_add(ctx0, mlp_0, model.mm_model_mlp_0_b);
+            mlp_0 = ggml_gelu(ctx0, mlp_0);
+            ggml_tensor * mlp_2 = ggml_mul_mat(ctx0, model.mm_model_mlp_2_w, mlp_0);
+            mlp_2 = ggml_add(ctx0, mlp_2, model.mm_model_mlp_2_b);
+            // mlp_2 ne = [2048, 576, 1, 1]
+            // // AVG Pool Layer 2*2, strides = 2
+            mlp_2 = ggml_permute(ctx0, mlp_2, 1, 0, 2, 3);
+            // mlp_2 ne = [576, 2048, 1, 1]
+            mlp_2 = ggml_cont_4d(ctx0, mlp_2, n_patch, n_patch, mlp_2->ne[1], mlp_2->ne[2]);
+            // mlp_2 ne [24, 24, 2048, 1]
+            mlp_2 = ggml_pool_2d(ctx0, mlp_2, GGML_OP_POOL_AVG, 2, 2, 2, 2, 0, 0);
+            // weight ne = [3, 3, 2048, 1]
+            ggml_tensor * peg_0 = ggml_conv_2d_dw(ctx0, model.mm_model_peg_0_w, mlp_2, 1, 1, 1, 1, 1, 1);
+            peg_0 = ggml_cont(ctx0, ggml_permute(ctx0, peg_0, 1, 2, 0, 3));
+            peg_0 = ggml_add(ctx0, peg_0, model.mm_model_peg_0_b);
+            mlp_2 = ggml_cont(ctx0, ggml_permute(ctx0, mlp_2, 1, 2, 0, 3));
+            peg_0 = ggml_add(ctx0, peg_0, mlp_2);
+            peg_0 = ggml_reshape_3d(ctx0, peg_0, peg_0->ne[0], peg_0->ne[1] * peg_0->ne[2], peg_0->ne[3]);
+            embeddings = peg_0;
+        }
+        else {
+            GGML_ABORT("fatal error");
+        }
+    }
+
+    // glm projector
+    else if (proj_type == PROJECTOR_TYPE_GLM_EDGE) {
+        size_t gridsz = (size_t)sqrt(embeddings->ne[1]);
+        embeddings = ggml_permute(ctx0,embeddings,1,0,2,3);
+        embeddings = ggml_cont_3d(ctx0, embeddings, gridsz, gridsz, embeddings->ne[1]);
+        embeddings = ggml_conv_2d(ctx0, model.mm_model_adapter_conv_w, embeddings, 2, 2, 0, 0, 1, 1);
+        embeddings = ggml_reshape_3d(ctx0, embeddings,embeddings->ne[0]*embeddings->ne[1] , embeddings->ne[2], batch_size);
+        embeddings = ggml_cont(ctx0, ggml_permute(ctx0,embeddings, 1, 0, 2, 3));
+        embeddings = ggml_add(ctx0, embeddings, model.mm_model_adapter_conv_b);
+        // GLU
+        {
+            embeddings = ggml_mul_mat(ctx0, model.mm_model_mlp_0_w, embeddings);
+            embeddings = ggml_norm(ctx0, embeddings, eps);
+            embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.mm_model_ln_q_w), model.mm_model_ln_q_b);
+            embeddings = ggml_gelu_inplace(ctx0, embeddings);
+            ggml_tensor * x = embeddings;
+            embeddings = ggml_mul_mat(ctx0, model.mm_model_mlp_2_w, embeddings);
+            x = ggml_mul_mat(ctx0, model.mm_model_mlp_1_w,x);
+            embeddings = ggml_swiglu_split(ctx0, embeddings, x);
+            embeddings = ggml_mul_mat(ctx0, model.mm_model_mlp_3_w, embeddings);
+        }
+        // arrangement of BOI/EOI token embeddings
+        // note: these embeddings are not present in text model, hence we cannot process them as text tokens
+        // see: https://huggingface.co/THUDM/glm-edge-v-2b/blob/main/siglip.py#L53
+        {
+            embeddings = ggml_concat(ctx0, model.mm_boi, embeddings, 1); // BOI
+            embeddings = ggml_concat(ctx0, embeddings, model.mm_eoi, 1); // EOI
+        }
+    }
+
+    else {
+        GGML_ABORT("llava: unknown projector type");
+    }
+
+    // build the graph
+    ggml_build_forward_expand(gf, embeddings);
+
+    return gf;
+}
diff --git a/llama.cpp/tools/mtmd/models/minicpmv.cpp b/llama.cpp/tools/mtmd/models/minicpmv.cpp
new file mode 100644
index 0000000..3594ea2
--- /dev/null
+++ b/llama.cpp/tools/mtmd/models/minicpmv.cpp
@@ -0,0 +1,114 @@
+#include "models.h"
+
+// Build the compute graph for MiniCPM-V: a ViT with learned position embeddings,
+// followed by a cross-attention "resampler" projector that attends a fixed set of
+// learned query tokens over the patch features (keys get additive sinusoidal 2D
+// position embeddings computed from runtime-provided coordinates).
+ggml_cgraph * clip_graph_minicpmv::build() {
+    GGML_ASSERT(model.class_embedding == nullptr);
+    const int n_pos = n_patches;
+    const int n_embd_proj = n_mmproj_embd;
+
+    // position embeddings for the projector (not for ViT)
+    // see: https://huggingface.co/openbmb/MiniCPM-o-2_6/blob/main/resampler.py#L70
+    // base frequency omega
+    ggml_tensor * omega = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, n_embd_proj / 4);
+    ggml_set_name(omega, "omega");
+    ggml_set_input(omega);
+
+    // 2D input positions (using float for sinusoidal embeddings)
+    ggml_tensor * pos_h = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, 1, n_pos);
+    ggml_set_name(pos_h, "pos_h");
+    ggml_set_input(pos_h);
+    ggml_tensor * pos_w = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, 1, n_pos);
+    ggml_set_name(pos_w, "pos_w");
+    ggml_set_input(pos_w);
+
+    // for selecting learned pos embd, used by ViT
+    struct ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos);
+    ggml_set_name(positions, "positions");
+    ggml_set_input(positions);
+
+    ggml_tensor * learned_pos_embd = ggml_get_rows(ctx0, model.position_embeddings, positions);
+
+    ggml_tensor * inp = build_inp();
+    ggml_tensor * embeddings = build_vit(
+        inp, n_pos,
+        NORM_TYPE_NORMAL,
+        hparams.ffn_op,
+        learned_pos_embd,
+        nullptr);
+
+    // resampler projector (it is just another transformer)
+
+    ggml_tensor * q = model.mm_model_query;
+    ggml_tensor * v = ggml_mul_mat(ctx0, model.mm_model_kv_proj, embeddings);
+
+    // norm
+    q = build_norm(q, model.mm_model_ln_q_w, model.mm_model_ln_q_b, NORM_TYPE_NORMAL, eps, -1);
+    v = build_norm(v, model.mm_model_ln_kv_w, model.mm_model_ln_kv_b, NORM_TYPE_NORMAL, eps, -1);
+
+    // calculate sinusoidal pos embd
+    ggml_tensor * pos_embed = nullptr;
+    {
+        // outer product: broadcast omega over all n_pos positions, scale by each
+        // patch's x / y coordinate to get the per-position phase angles
+        ggml_tensor * omega_b = ggml_repeat_4d(ctx0, omega, omega->ne[0], n_pos, 1, 1); // n_pos rows
+        ggml_tensor * theta_x = ggml_mul(ctx0, omega_b, pos_w);
+        ggml_tensor * theta_y = ggml_mul(ctx0, omega_b, pos_h);
+        // sin and cos
+        ggml_tensor * pos_embd_x = ggml_concat(
+            ctx0,
+            ggml_sin(ctx0, theta_x),
+            ggml_cos(ctx0, theta_x),
+            0 // concat on first dim
+        );
+        ggml_tensor * pos_embd_y = ggml_concat(
+            ctx0,
+            ggml_sin(ctx0, theta_y),
+            ggml_cos(ctx0, theta_y),
+            0 // concat on first dim
+        );
+        pos_embed = ggml_concat(ctx0, pos_embd_x, pos_embd_y, 0);
+    }
+
+    // k = v + pos_embed
+    ggml_tensor * k = ggml_add(ctx0, v, pos_embed);
+
+    // attention
+    {
+        // NOTE(review): head dim is hardcoded here — confirm it matches all MiniCPM-V variants
+        const int d_head = 128;
+        int n_head = n_embd_proj/d_head;
+        // number of learned resampler queries, taken from the model config
+        int num_query = hparams.minicpmv_query_num;
+        ggml_tensor * Q = ggml_add(ctx0,
+            ggml_mul_mat(ctx0, model.mm_model_attn_q_w, q),
+            model.mm_model_attn_q_b);
+        ggml_tensor * K = ggml_add(ctx0,
+            ggml_mul_mat(ctx0, model.mm_model_attn_k_w, k),
+            model.mm_model_attn_k_b);
+        ggml_tensor * V = ggml_add(ctx0,
+            ggml_mul_mat(ctx0, model.mm_model_attn_v_w, v),
+            model.mm_model_attn_v_b);
+
+        Q = ggml_reshape_3d(ctx0, Q, d_head, n_head, num_query);
+        K = ggml_reshape_3d(ctx0, K, d_head, n_head, n_pos);
+        V = ggml_reshape_3d(ctx0, V, d_head, n_head, n_pos);
+
+        cb(Q, "resampler_Q", -1);
+        cb(K, "resampler_K", -1);
+        cb(V, "resampler_V", -1);
+
+        float resampler_kq_scale = 1.0f/ sqrtf(float(d_head));
+        embeddings = build_attn(
+            model.mm_model_attn_o_w,
+            model.mm_model_attn_o_b,
+            Q, K, V, nullptr, resampler_kq_scale, -1);
+        cb(embeddings, "resampler_attn_out", -1);
+    }
+    // layernorm
+    embeddings = build_norm(embeddings, model.mm_model_ln_post_w, model.mm_model_ln_post_b, NORM_TYPE_NORMAL, eps, -1);
+
+    // projection
+    embeddings = ggml_mul_mat(ctx0, model.mm_model_proj, embeddings);
+
+    // build the graph
+    ggml_build_forward_expand(gf, embeddings);
+
+    return gf;
+}
diff --git a/llama.cpp/tools/mtmd/models/mobilenetv5.cpp b/llama.cpp/tools/mtmd/models/mobilenetv5.cpp
new file mode 100644
index 0000000..593afa1
--- /dev/null
+++ b/llama.cpp/tools/mtmd/models/mobilenetv5.cpp
@@ -0,0 +1,451 @@
+#include "models.h"
+
+// Helpers for MobileNetV5 Blocks
+// RMS Norm 2D - normalizes over channels for each spatial position.
+// ggml_rms_norm reduces over dim 0, so the channel dim (dim 2 of
+// [W, H, C, B]) is permuted into dim 0 first and permuted back afterwards.
+// `weight` is an optional per-channel scale (skipped when null).
+ggml_tensor * clip_graph_mobilenetv5::rms_norm_2d(ggml_tensor * inp, ggml_tensor * weight, float eps) {
+    // inp: [W, H, C, B] -> [C, H, W, B] so channels sit on dim 0
+    ggml_tensor * cur = ggml_permute(ctx0, inp, 2, 1, 0, 3);
+    cur = ggml_cont(ctx0, cur);
+    cur = ggml_rms_norm(ctx0, cur, eps);
+
+    if (weight) {
+        cur = ggml_mul(ctx0, cur, weight); // per-channel scale, broadcast over the spatial dims
+    }
+
+    // restore the original [W, H, C, B] layout
+    cur = ggml_permute(ctx0, cur, 2, 1, 0, 3);
+    cur = ggml_cont(ctx0, cur);
+
+    return cur;
+}
+
+// Conv2dSame padding - asymmetric SAME padding like PyTorch/TF.
+// Computes the padding needed so a following convolution with the given
+// kernel/stride/dilation yields ceil(in/stride) outputs, splitting the
+// total with the extra pixel on the bottom/right (TF "SAME" convention).
+// Returns the input unchanged when no padding is required.
+ggml_tensor* clip_graph_mobilenetv5::pad_same_2d(ggml_tensor* inp, int kernel_h, int kernel_w, int stride_h, int stride_w, int dilation_h, int dilation_w) {
+    const int64_t ih = inp->ne[1]; // height
+    const int64_t iw = inp->ne[0]; // width
+
+    // Calculate output size (ceil division)
+    const int64_t oh = (ih + stride_h - 1) / stride_h;
+    const int64_t ow = (iw + stride_w - 1) / stride_w;
+
+    // Calculate total padding needed per axis (clamped to >= 0)
+    const int64_t pad_h = std::max((int64_t)0, (oh - 1) * stride_h + (kernel_h - 1) * dilation_h + 1 - ih);
+    const int64_t pad_w = std::max((int64_t)0, (ow - 1) * stride_w + (kernel_w - 1) * dilation_w + 1 - iw);
+
+    // Split padding asymmetrically: the odd extra pixel goes bottom/right
+    const int pad_h_top = pad_h / 2;
+    const int pad_h_bottom = pad_h - pad_h_top;
+    const int pad_w_left = pad_w / 2;
+    const int pad_w_right = pad_w - pad_w_left;
+
+    // Apply padding if needed
+    // ggml_pad_ext: (ctx, tensor, lp0, rp0, lp1, rp1, lp2, rp2, lp3, rp3)
+    // For [W, H, C, B]: p0=width, p1=height, p2=channels, p3=batch
+    if (pad_h > 0 || pad_w > 0) {
+        inp = ggml_pad_ext(ctx0, inp,
+            pad_w_left, pad_w_right, // width padding (dim 0)
+            pad_h_top, pad_h_bottom, // height padding (dim 1)
+            0, 0, // no channel padding (dim 2)
+            0, 0); // no batch padding (dim 3)
+    }
+
+    return inp;
+}
+
+
+// Edge Residual Block (Stage 0)
+// Structure: 3x3 expansion conv -> RMS norm -> GELU -> 1x1 pointwise-linear
+// conv -> RMS norm, plus an identity residual when stride == 1 and the
+// channel/spatial shape is unchanged.
+ggml_tensor * clip_graph_mobilenetv5::build_edge_residual(ggml_tensor * inp, const mobilenetv5_block & block, int stride) {
+    ggml_tensor * cur = inp;
+
+    // 1. Expansion Conv (3x3)
+    if (stride == 2) {
+        // Case: Downsampling (Block 0)
+        // Replicates Conv2dSame(kernel=3, stride=2): asymmetric SAME pad, conv with pad=0
+        cur = pad_same_2d(cur, 3, 3, stride, stride);
+        cur = ggml_conv_2d_direct(ctx0, block.s0_conv_exp_w, cur, stride, stride, 0, 0, 1, 1);
+    } else {
+        // Case: Normal 3x3 Block (Block 1, 2)
+        // Replicates Conv2d(kernel=3, stride=1, padding=1)
+        cur = ggml_conv_2d_direct(ctx0, block.s0_conv_exp_w, cur, stride, stride, 1, 1, 1, 1);
+    }
+
+    // BN + Activation (norm weights are optional in the loaded model)
+    if (block.s0_bn1_w) cur = rms_norm_2d(cur, block.s0_bn1_w);
+    cur = ggml_gelu(ctx0, cur);
+
+    // 2. Pointwise Linear Conv (1x1)
+    // 1x1 Convs usually have padding=0 and stride=1
+    cur = ggml_conv_2d_direct(ctx0, block.s0_conv_pwl_w, cur, 1, 1, 0, 0, 1, 1);
+    if (block.s0_bn2_w) cur = rms_norm_2d(cur, block.s0_bn2_w);
+
+    // 3. Residual Connection
+    // Only apply residual if spatial dimensions and channels match (stride 1)
+    if (stride == 1 && inp->ne[2] == cur->ne[2] && inp->ne[0] == cur->ne[0]) {
+        cur = ggml_add(ctx0, cur, inp);
+    }
+
+    return cur;
+}
+
+// Universal Inverted Residual Block (Stage 1+)
+// Structure (each sub-step optional, gated on the block's weights being
+// present): depthwise-start -> pointwise expansion (+GELU) -> depthwise-mid
+// (+GELU, carries the stride) -> pointwise projection -> layer scale,
+// plus an identity residual when the output shape matches the input.
+ggml_tensor * clip_graph_mobilenetv5::build_inverted_residual(ggml_tensor * inp, const mobilenetv5_block & block, int stride) {
+    ggml_tensor * cur = inp;
+
+    // 1. Depthwise Start (Optional)
+    // NOTE: dw_start always has stride=1 (no downsampling here)
+    if (block.dw_start_w) {
+        int k = block.dw_start_w->ne[0]; // kernel size taken from the weight tensor (3 or 5)
+        int p = k / 2;                   // symmetric padding keeps the spatial size
+        cur = ggml_conv_2d_dw(ctx0, block.dw_start_w, cur, 1, 1, p, p, 1, 1);
+        if (block.dw_start_bn_w) cur = rms_norm_2d(cur, block.dw_start_bn_w);
+    }
+
+    // 2. Pointwise Expansion (1x1)
+    if (block.pw_exp_w) {
+        // Standard 1x1 conv, pad=0, stride=1
+        cur = ggml_conv_2d_direct(ctx0, block.pw_exp_w, cur, 1, 1, 0, 0, 1, 1);
+        if (block.pw_exp_bn_w) cur = rms_norm_2d(cur, block.pw_exp_bn_w);
+        cur = ggml_gelu(ctx0, cur);
+    }
+
+    // 3. Depthwise Mid (Optional)
+    // NOTE: dw_mid is where downsampling happens (stride=2 for first block of stage)
+    if (block.dw_mid_w) {
+        int k = block.dw_mid_w->ne[0]; // 3 or 5
+
+        if (stride > 1) {
+            // Case: Stride 2 (Downsample) -> Use Asymmetric "Same" Padding
+            cur = pad_same_2d(cur, k, k, stride, stride);
+            cur = ggml_conv_2d_dw(ctx0, block.dw_mid_w, cur, stride, stride, 0, 0, 1, 1); // pad=0
+        } else {
+            // Case: Stride 1 -> Use Standard Symmetric Padding
+            int p = k / 2;
+            cur = ggml_conv_2d_dw(ctx0, block.dw_mid_w, cur, stride, stride, p, p, 1, 1);
+        }
+
+        if (block.dw_mid_bn_w) cur = rms_norm_2d(cur, block.dw_mid_bn_w);
+        cur = ggml_gelu(ctx0, cur);
+    }
+
+    // 4. Pointwise Projection (1x1) back to the block's output channels
+    if (block.pw_proj_w) {
+        cur = ggml_conv_2d_direct(ctx0, block.pw_proj_w, cur, 1, 1, 0, 0, 1, 1);
+        if (block.pw_proj_bn_w) cur = rms_norm_2d(cur, block.pw_proj_bn_w);
+    }
+
+    // Apply Layer Scaling if present
+    if (block.layer_scale_w) {
+        cur = ggml_mul(ctx0, cur, block.layer_scale_w);
+    }
+
+    // 5. Residual Connection (only when shape is fully preserved)
+    bool same_spatial = (inp->ne[0] == cur->ne[0]) && (inp->ne[1] == cur->ne[1]);
+    bool same_channel = (inp->ne[2] == cur->ne[2]);
+    if (same_spatial && same_channel) {
+        cur = ggml_add(ctx0, cur, inp);
+    }
+
+    return cur;
+}
+
+// Attention Block (MQA)
+// Multi-Query Attention over a [W, H, C, B] feature map: Q has n_head heads,
+// while K and V are produced once (single head, shared across queries) from a
+// 2x-downsampled copy of the input. Output is projected back to C channels
+// and added residually (with optional layer scale).
+ggml_tensor * clip_graph_mobilenetv5::build_mobilenet_attn(ggml_tensor * inp, const mobilenetv5_block & block) {
+    ggml_tensor * cur = inp;
+
+    // Pre-attention norm
+    if (block.attn_norm_w) {
+        cur = rms_norm_2d(cur, block.attn_norm_w, 1e-6f);
+    }
+
+    // 1. Q Calculation (1x1 conv -> D*n_head channels)
+    ggml_tensor * q = ggml_conv_2d_direct(ctx0, block.attn_q_w, cur, 1, 1, 0, 0, 1, 1);
+
+    // 2. K Calculation (Downsampled)
+    // Uses Conv2dSame(640, 640, kernel_size=(3, 3), stride=(2, 2), groups=640)
+    ggml_tensor * k_inp = cur;
+    if (block.attn_k_dw_w) {
+        int k_size = block.attn_k_dw_w->ne[0]; // Usually 3
+        k_inp = pad_same_2d(cur, k_size, k_size, 2, 2); // Apply SAME padding
+        k_inp = ggml_conv_2d_dw(ctx0, block.attn_k_dw_w, k_inp, 2, 2, 0, 0, 1, 1); // padding=0
+        if (block.attn_k_norm_w) {
+            k_inp = rms_norm_2d(k_inp, block.attn_k_norm_w, 1e-6f);
+        }
+    }
+    ggml_tensor * k = ggml_conv_2d_direct(ctx0, block.attn_k_w, k_inp, 1, 1, 0, 0, 1, 1);
+
+    // 3. V Calculation (Downsampled)
+    // Uses Conv2dSame(640, 640, kernel_size=(3, 3), stride=(2, 2), groups=640)
+    ggml_tensor * v_inp = cur;
+    if (block.attn_v_dw_w) {
+        int v_size = block.attn_v_dw_w->ne[0]; // Usually 3
+        v_inp = pad_same_2d(cur, v_size, v_size, 2, 2); // Apply SAME padding
+        v_inp = ggml_conv_2d_dw(ctx0, block.attn_v_dw_w, v_inp, 2, 2, 0, 0, 1, 1); // padding=0
+        if (block.attn_v_norm_w) {
+            v_inp = rms_norm_2d(v_inp, block.attn_v_norm_w, 1e-6f);
+        }
+    }
+    ggml_tensor * v = ggml_conv_2d_direct(ctx0, block.attn_v_w, v_inp, 1, 1, 0, 0, 1, 1);
+
+    const int W = cur->ne[0]; const int H = cur->ne[1]; const int B = cur->ne[3];
+    const int D = k->ne[2]; // Head dimension (K/V have a single head's worth of channels)
+    const int n_head = q->ne[2] / D;
+    const int N = W * H;    // number of query positions
+
+    // Process Q: [W, H, D*n_head, B] -> [D, N, n_head, B]
+    q = ggml_reshape_3d(ctx0, q, N, D*n_head, B);
+    q = ggml_reshape_4d(ctx0, q, N, D, n_head, B);
+    q = ggml_permute(ctx0, q, 1, 0, 2, 3); // [D, N, n_head, B]
+    q = ggml_cont(ctx0, q);
+
+    const int Wk = k->ne[0]; const int Hk = k->ne[1];
+    const int M = Wk * Hk; // number of key/value positions (downsampled grid)
+
+    // Process K: [Wk, Hk, D, B] -> [D, M, 1, B] (single shared head)
+    k = ggml_reshape_3d(ctx0, k, M, D, B);
+    k = ggml_reshape_4d(ctx0, k, M, D, 1, B);
+    k = ggml_permute(ctx0, k, 1, 0, 2, 3); // [D, M, 1, B]
+    k = ggml_cont(ctx0, k);
+
+    // Process V: [Wk, Hk, D, B] -> [M, D, 1, B]
+    v = ggml_reshape_3d(ctx0, v, M, D, B);
+    v = ggml_reshape_4d(ctx0, v, M, D, 1, B);
+    v = ggml_cont(ctx0, v); // [M, D, 1, B]
+
+    // Multi-Query Attention: the single K/V head broadcasts over Q's heads
+    float scale = 1.0f / sqrtf((float)D);
+
+    // Step 1: Compute Q @ K.T -> scores over the M key positions
+    ggml_tensor * scores = ggml_mul_mat(ctx0, k, q);
+
+    scores = ggml_scale(ctx0, scores, scale);
+
+    scores = ggml_soft_max(ctx0, scores);
+
+    // Step 2: scores @ V
+    ggml_tensor * kqv = ggml_mul_mat(ctx0, v, scores);
+
+    kqv = ggml_permute(ctx0, kqv, 1, 0, 2, 3);
+    kqv = ggml_cont(ctx0, kqv);
+
+    // Fold heads back into channels and restore the [W, H, C, B] map
+    kqv = ggml_reshape_3d(ctx0, kqv, N, D * n_head, B);
+    kqv = ggml_reshape_4d(ctx0, kqv, W, H, D * n_head, B);
+    kqv = ggml_cont(ctx0, kqv);
+
+    // Output projection (1x1 conv)
+    cur = ggml_conv_2d_direct(ctx0, block.attn_o_w, kqv, 1, 1, 0, 0, 1, 1);
+
+    // Residual & Layer Scale (only when the shape is preserved)
+    if (inp->ne[0] == cur->ne[0] && inp->ne[2] == cur->ne[2]) {
+        if (block.layer_scale_w) {
+            cur = ggml_mul(ctx0, cur, block.layer_scale_w);
+        }
+        cur = ggml_add(ctx0, cur, inp);
+    }
+
+    return cur;
+}
+
+// Build the full MobileNetV5 vision graph:
+//   stem conv -> backbone blocks (edge-residual / inverted-residual / MQA,
+//   dispatched per block on which weights are present) -> Multi-Scale Fusion
+//   Adapter -> Gemma 3n multimodal projection.
+// Input is the raw image tensor; output is [text_hidden, n_tokens, batch].
+ggml_cgraph * clip_graph_mobilenetv5::build() {
+    ggml_tensor * inp = build_inp_raw();
+
+    // 1. Stem - Conv2dSame(3, 64, kernel_size=(3, 3), stride=(2, 2))
+    ggml_tensor * cur = pad_same_2d(inp, 3, 3, 2, 2); // Apply SAME padding
+
+    cur = ggml_conv_2d_direct(ctx0, model.mobilenet_stem_conv_w, cur, 2, 2, 0, 0, 1, 1); // padding=0
+    if (model.mobilenet_stem_conv_b) {
+        cur = ggml_add(ctx0, cur, model.mobilenet_stem_conv_b);
+    }
+    if (model.mobilenet_stem_norm_w) cur = rms_norm_2d(cur, model.mobilenet_stem_norm_w);
+    cur = ggml_gelu(ctx0, cur);
+
+
+    // 2. Blocks
+    std::vector<ggml_tensor*> intermediate_features;
+    const int total_blocks = model.mobilenet_blocks.size();
+
+    // A block immediately after a stage boundary starts a new stage and
+    // downsamples (stride 2); all other blocks keep stride 1.
+    auto is_stage_start = [&](int i) {
+        if (i == 0) return true;
+        for (int end_idx : model.mobilenet_stage_ends) {
+            if (i == end_idx + 1) return true;
+        }
+        return false;
+    };
+
+    // Feature maps collected for the MSFA: ends of stages 2 and 3 when the
+    // stage table is complete, otherwise just the final block's output.
+    auto is_fusion_point = [&](int i) {
+        if (model.mobilenet_stage_ends.size() >= 4) {
+            if (i == model.mobilenet_stage_ends[2]) return true; // End of Stage 2
+            if (i == model.mobilenet_stage_ends[3]) return true; // End of Stage 3
+        } else {
+            if (i == total_blocks - 1) return true;
+        }
+        return false;
+    };
+
+    for (int i = 0; i < total_blocks; i++) {
+        const auto & block = model.mobilenet_blocks[i];
+        int stride = is_stage_start(i) ? 2 : 1;
+
+        // Dispatch on which weights the block carries
+        if (block.s0_conv_exp_w) cur = build_edge_residual(cur, block, stride);
+        else if (block.attn_q_w) cur = build_mobilenet_attn(cur, block);
+        else cur = build_inverted_residual(cur, block, stride);
+
+        if (is_fusion_point(i)) {
+
+            intermediate_features.push_back(cur);
+        }
+    }
+
+    // 3. Multi-Scale Fusion Adapter (MSFA)
+    if (!intermediate_features.empty()) {
+
+        // A. Reference Resolution: PyTorch implementation uses inputs[0]
+        // We assume intermediate_features[0] is the "High Resolution" target.
+        // In MobileNet designs, this is typically the feature map with the smallest stride (e.g. 32x32).
+        ggml_tensor* target_feat = intermediate_features[0];
+        int high_res_w = target_feat->ne[0];
+        int high_res_h = target_feat->ne[1];
+
+        std::vector<ggml_tensor*> resized_feats;
+
+        // B. Resize inputs to match inputs[0] (High Resolution)
+        for (auto feat : intermediate_features) {
+            int feat_w = feat->ne[0];
+            int feat_h = feat->ne[1];
+
+            // PyTorch: if feat_size < high_resolution: interpolate
+            if (feat_w < high_res_w || feat_h < high_res_h) {
+                // Calculate scale factor.
+                // Note: PyTorch 'nearest' works on arbitrary float scales.
+                // ggml_upscale generally takes integer factors or target sizes depending on helper.
+                // Assuming standard power-of-2 scaling (e.g. 16 -> 32 means scale=2).
+                int scale_w = high_res_w / feat_w;
+                // int scale_h = high_res_h / feat_h;
+
+                // Safety check for non-integer scaling if strictly replicating
+                GGML_ASSERT(high_res_w % feat_w == 0);
+
+                // Upsample (Nearest Neighbor) by the integer factor scale_w
+                // NOTE(review): only the width ratio is used; assumes width and
+                // height scale by the same factor — confirm for non-square inputs.
+                feat = ggml_upscale(ctx0, feat, scale_w, ggml_scale_mode::GGML_SCALE_MODE_NEAREST);
+            }
+            resized_feats.push_back(feat);
+        }
+
+        // C. Concatenate at High Resolution (Channel Dim = 2 in ggml)
+        cur = resized_feats[0];
+        for (size_t k = 1; k < resized_feats.size(); ++k) {
+            cur = ggml_concat(ctx0, cur, resized_feats[k], 2);
+        }
+
+        // D. FFN (UniversalInvertedResidual)
+        // Structure: Expand Conv -> Norm -> GELU -> Project Conv -> Norm
+
+        // 1. Expansion
+        if (model.msfa_ffn_expand_w) {
+            // 1x1 Conv
+            cur = ggml_conv_2d_direct(ctx0, model.msfa_ffn_expand_w, cur, 1, 1, 0, 0, 1, 1);
+
+            if (model.msfa_ffn_expand_bn) {
+                cur = rms_norm_2d(cur, model.msfa_ffn_expand_bn);
+            }
+
+            cur = ggml_gelu(ctx0, cur);
+
+        }
+
+        // 2. Projection (No DW because kernel_size=0)
+        if (model.msfa_ffn_project_w) {
+            // 1x1 Conv
+            cur = ggml_conv_2d_direct(ctx0, model.msfa_ffn_project_w, cur, 1, 1, 0, 0, 1, 1);
+
+            // UniversalInvertedResidual typically has a norm after projection
+            if (model.msfa_ffn_project_bn) {
+                cur = rms_norm_2d(cur, model.msfa_ffn_project_bn);
+            }
+
+        }
+
+        // E. Final Downsample to Target Resolution (Output Resolution)
+        // PyTorch: matches self.output_resolution (e.g. 16x16)
+        // NOTE(review): target resolution is hardcoded to 16 here — confirm it
+        // matches the model's configured output_resolution.
+        const int target_out_res = 16;
+        int current_w = cur->ne[0];
+
+        if (current_w > target_out_res) {
+            int s = current_w / target_out_res;
+
+            GGML_ASSERT(current_w % target_out_res == 0);
+
+            // Avg Pool: Kernel=s, Stride=s
+            cur = ggml_pool_2d(ctx0, cur, GGML_OP_POOL_AVG, s, s, s, s, 0, 0);
+
+        }
+
+        // F. Final Norm
+        if (model.msfa_concat_norm_w) {
+            cur = rms_norm_2d(cur, model.msfa_concat_norm_w);
+
+        }
+    }
+
+    // 4. Gemma 3n Multimodal Projection (Embedder)
+    // Input: 'cur' is [Width, Height, Channels, Batch]
+    int W = cur->ne[0];
+    int H = cur->ne[1];
+    int C = cur->ne[2];
+    int B = cur->ne[3];
+
+    GGML_ASSERT(C == hparams.n_embd);
+
+    // 1. Permute and Flatten to [Channels, Tokens, Batch]
+    // PyTorch expects (Batch, Seq, Hidden), GGML usually processes (Hidden, Seq, Batch)
+    cur = ggml_permute(ctx0, cur, 2, 1, 0, 3); // -> [C, H, W, B]
+    cur = ggml_permute(ctx0, cur, 0, 2, 1, 3); // -> [C, W, H, B]
+    cur = ggml_cont(ctx0, cur);
+    cur = ggml_reshape_3d(ctx0, cur, C, W*H, B);
+    cur = ggml_cont(ctx0, cur);
+
+
+    // 2. FEATURE SCALING
+    // PyTorch: vision_outputs *= self.config.vision_config.hidden_size**0.5
+    const float scale_factor = sqrtf((float)C);
+    cur = ggml_scale(ctx0, cur, scale_factor);
+
+
+    // 3. SOFT EMBEDDING NORM
+    // PyTorch: self._norm(x) * self.weight
+    // We must normalize regardless, then multiply if weight exists.
+    {
+        const float eps = 1e-6f; // Gemma3n uses 1e-6
+        cur = ggml_rms_norm(ctx0, cur, eps);
+
+        if (model.mm_soft_emb_norm_w) {
+            // Weight shape is (2048,) -> Element-wise broadcast multiply
+            cur = ggml_mul(ctx0, cur, model.mm_soft_emb_norm_w);
+        }
+
+    }
+
+    // 4. PROJECTION
+    // PyTorch: embedding_projection = nn.Linear(vision_hidden, text_hidden, bias=False)
+    // Weight stored as [out_features, in_features] = [text_hidden_size, vision_hidden_size]
+    if (model.mm_input_proj_w) {
+        cur = ggml_mul_mat(ctx0, model.mm_input_proj_w, cur);
+    }
+
+    // 5. POST PROJECTION NORM
+    // PyTorch: embedding_post_projection_norm = Gemma3nRMSNorm(..., with_scale=False)
+    // with_scale=False means weight is registered as buffer with value 1.0
+    // So output = rms_norm(x) * 1.0 = rms_norm(x), magnitude ~1
+    {
+        const float eps = 1e-6f;
+        cur = ggml_rms_norm(ctx0, cur, eps);
+
+        if (model.mm_post_proj_norm_w) {
+            // If weight is loaded, multiply (should be ~1.0 anyway)
+            cur = ggml_mul(ctx0, cur, model.mm_post_proj_norm_w);
+        }
+    }
+
+    ggml_build_forward_expand(gf, cur);
+    return gf;
+}
diff --git a/llama.cpp/tools/mtmd/models/models.h b/llama.cpp/tools/mtmd/models/models.h
new file mode 100644
index 0000000..c4c67ac
--- /dev/null
+++ b/llama.cpp/tools/mtmd/models/models.h
@@ -0,0 +1,118 @@
+#pragma once
+
+#include "../clip-graph.h"
+
+/*
+ * IMPORTANT: The mtmd module does NOT accept pull requests that are fully or predominantly AI-generated.
+ * We encourage human contributors to ensure the quality and reliability of the codebase.
+ */
+
+// One clip_graph subclass per supported encoder architecture. Each subclass
+// overrides build(), which constructs the ggml compute graph for a single
+// pre-processed input image (or, for the audio encoders, a mel spectrogram
+// passed through the same clip_image_f32 container).
+
+struct clip_graph_siglip : clip_graph {
+    clip_graph_siglip(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
+    ggml_cgraph * build() override;
+};
+
+struct clip_graph_pixtral : clip_graph {
+    clip_graph_pixtral(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
+    ggml_cgraph * build() override;
+};
+
+struct clip_graph_qwen2vl : clip_graph {
+    clip_graph_qwen2vl(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
+    ggml_cgraph * build() override;
+};
+
+struct clip_graph_qwen3vl : clip_graph {
+    clip_graph_qwen3vl(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
+    ggml_cgraph * build() override;
+};
+
+struct clip_graph_youtuvl : clip_graph {
+    clip_graph_youtuvl(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
+    ggml_cgraph * build() override;
+};
+
+struct clip_graph_minicpmv : clip_graph {
+    clip_graph_minicpmv(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
+    ggml_cgraph * build() override;
+};
+
+struct clip_graph_internvl : clip_graph {
+    clip_graph_internvl(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
+    ggml_cgraph * build() override;
+};
+
+struct clip_graph_llama4 : clip_graph {
+    clip_graph_llama4(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
+    ggml_cgraph * build() override;
+};
+
+struct clip_graph_kimivl : clip_graph {
+    clip_graph_kimivl(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
+    ggml_cgraph * build() override;
+};
+
+struct clip_graph_cogvlm : clip_graph {
+    clip_graph_cogvlm(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
+    ggml_cgraph * build() override;
+};
+
+struct clip_graph_llava : clip_graph {
+    clip_graph_llava(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
+    ggml_cgraph * build() override;
+};
+
+struct clip_graph_whisper_enc : clip_graph {
+    clip_graph_whisper_enc(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
+    ggml_cgraph * build() override;
+};
+
+struct clip_graph_conformer : clip_graph {
+    clip_graph_conformer(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
+    ggml_cgraph * build() override;
+};
+
+struct clip_graph_glm4v : clip_graph {
+    clip_graph_glm4v(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
+    ggml_cgraph * build() override;
+};
+
+// MobileNetV5 (Gemma 3n vision tower) needs extra helpers for its
+// convolutional backbone; see models/mobilenetv5.cpp for their contracts.
+struct clip_graph_mobilenetv5 : clip_graph {
+    clip_graph_mobilenetv5(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
+    ggml_cgraph * build() override;
+
+    // channel-wise RMS norm over a [W, H, C, B] feature map
+    ggml_tensor * rms_norm_2d(
+        ggml_tensor * inp,
+        ggml_tensor * weight,
+        float eps = 1e-6f);
+
+    // asymmetric TF/PyTorch "SAME" padding ahead of a strided conv
+    ggml_tensor* pad_same_2d(
+        ggml_tensor* inp,
+        int kernel_h,
+        int kernel_w,
+        int stride_h,
+        int stride_w,
+        int dilation_h = 1,
+        int dilation_w = 1);
+
+    ggml_tensor * build_edge_residual(
+        ggml_tensor * inp,
+        const mobilenetv5_block & block,
+        int stride);
+
+    ggml_tensor * build_inverted_residual(
+        ggml_tensor * inp,
+        const mobilenetv5_block & block,
+        int stride);
+
+    ggml_tensor * build_mobilenet_attn(
+        ggml_tensor * inp,
+        const mobilenetv5_block & block);
+};
+
+struct clip_graph_kimik25 : clip_graph {
+    clip_graph_kimik25(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
+    ggml_cgraph * build() override;
+
+    // interpolate the learned 3D position embeddings to the input grid
+    ggml_tensor * resize_position_embeddings_3d(uint32_t interpolation_mode);
+};
diff --git a/llama.cpp/tools/mtmd/models/pixtral.cpp b/llama.cpp/tools/mtmd/models/pixtral.cpp
new file mode 100644
index 0000000..a849210
--- /dev/null
+++ b/llama.cpp/tools/mtmd/models/pixtral.cpp
@@ -0,0 +1,86 @@
+#include "models.h"
+
+// Pixtral / Mistral Small vision graph: ViT with 2D RoPE (no learned position
+// embeddings), optional patch merger, GELU projector, and per-row [IMG_BREAK]
+// token insertion for the text model.
+ggml_cgraph * clip_graph_pixtral::build() {
+    const int n_merge = hparams.n_merge;
+
+    // 2D input positions (row/column index per patch, filled at eval time)
+    ggml_tensor * pos_h = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_patches);
+    ggml_set_name(pos_h, "pos_h");
+    ggml_set_input(pos_h);
+
+    ggml_tensor * pos_w = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_patches);
+    ggml_set_name(pos_w, "pos_w");
+    ggml_set_input(pos_w);
+
+    // per-layer hook: apply 2D RoPE to Q/K instead of learned pos embeddings
+    auto add_pos = [&](ggml_tensor * cur, const clip_layer &) {
+        return build_rope_2d(ctx0, cur, pos_h, pos_w, hparams.rope_theta, true);
+    };
+
+    ggml_tensor * inp = build_inp();
+    ggml_tensor * cur = build_vit(
+                            inp, n_patches,
+                            NORM_TYPE_RMS,
+                            hparams.ffn_op,
+                            nullptr, // no learned pos embd
+                            add_pos);
+
+    // mistral small 3.1 patch merger
+    // ref: https://github.com/huggingface/transformers/blob/7a3e208892c06a5e278144eaf38c8599a42f53e7/src/transformers/models/mistral3/modeling_mistral3.py#L67
+    if (model.mm_patch_merger_w) {
+        GGML_ASSERT(hparams.n_merge > 0);
+
+        cur = ggml_mul(ctx0, ggml_rms_norm(ctx0, cur, eps), model.mm_input_norm_w);
+
+        // reshape image tokens to 2D grid
+        cur = ggml_reshape_3d(ctx0, cur, n_embd, n_patches_x, n_patches_y);
+        cur = ggml_permute(ctx0, cur, 2, 0, 1, 3); // [x, y, n_embd]
+        cur = ggml_cont(ctx0, cur);
+
+        // torch.nn.functional.unfold is just an im2col under the hood
+        // we just need a dummy kernel to make it work
+        ggml_tensor * kernel = ggml_view_3d(ctx0, cur, n_merge, n_merge, cur->ne[2], 0, 0, 0);
+        cur = ggml_im2col(ctx0, kernel, cur, n_merge, n_merge, 0, 0, 1, 1, true, inp->type);
+
+        // project the merged n_merge x n_merge neighborhoods to n_embd
+        cur = ggml_reshape_2d(ctx0, cur, cur->ne[0], cur->ne[1] * cur->ne[2]);
+        cur = ggml_mul_mat(ctx0, model.mm_patch_merger_w, cur);
+    }
+
+    // LlavaMultiModalProjector (always using GELU activation)
+    {
+        cur = build_ffn(cur,
+            model.mm_1_w, model.mm_1_b,
+            nullptr, nullptr,
+            model.mm_2_w, model.mm_2_b,
+            FFN_GELU,
+            -1);
+    }
+
+    // arrangement of the [IMG_BREAK] token
+    if (model.token_embd_img_break) {
+        // not efficient, but works
+        // the trick is to view the embeddings as a 3D tensor with shape [n_embd, n_patches_per_row, n_rows]
+        // and then concatenate the [IMG_BREAK] token to the end of each row, aka n_patches_per_row dimension
+        // after the concatenation, we have a tensor with shape [n_embd, n_patches_per_row + 1, n_rows]
+
+        const int p_y = n_merge > 0 ? n_patches_y / n_merge : n_patches_y;
+        const int p_x = n_merge > 0 ? n_patches_x / n_merge : n_patches_x;
+        const int p_total = p_x * p_y;
+        const int n_embd_text = cur->ne[0];
+        const int n_tokens_output = p_total + p_y - 1; // one [IMG_BREAK] per row, except the last row
+
+        ggml_tensor * tmp = ggml_reshape_3d(ctx0, cur, n_embd_text, p_x, p_y);
+        ggml_tensor * tok = ggml_new_tensor_3d(ctx0, tmp->type, n_embd_text, 1, p_y);
+        tok = ggml_scale(ctx0, tok, 0.0); // clear the tensor
+        tok = ggml_add(ctx0, tok, model.token_embd_img_break);
+        tmp = ggml_concat(ctx0, tmp, tok, 1);
+        // flat view drops the trailing [IMG_BREAK] of the last row
+        cur = ggml_view_2d(ctx0, tmp,
+            n_embd_text, n_tokens_output,
+            ggml_row_size(tmp->type, n_embd_text), 0);
+    }
+
+    // build the graph
+    ggml_build_forward_expand(gf, cur);
+
+    return gf;
+}
diff --git a/llama.cpp/tools/mtmd/models/qwen2vl.cpp b/llama.cpp/tools/mtmd/models/qwen2vl.cpp
new file mode 100644
index 0000000..85f158b
--- /dev/null
+++ b/llama.cpp/tools/mtmd/models/qwen2vl.cpp
@@ -0,0 +1,183 @@
+#include "models.h"
+
+// Qwen2-VL / Qwen2.5-VL vision graph: dual patch conv (2x2 temporal merge),
+// M-RoPE positions, optional windowed attention (Qwen2.5-VL, every
+// n_wa_pattern-th layer uses full attention), and a GELU merger that folds
+// 4 patches into one output token.
+ggml_cgraph * clip_graph_qwen2vl::build() {
+    GGML_ASSERT(model.patch_bias == nullptr);
+    GGML_ASSERT(model.class_embedding == nullptr);
+
+    const int batch_size = 1;
+    const bool use_window_attn = hparams.n_wa_pattern > 0; // Qwen2.5-VL only
+    const int n_wa_pattern = hparams.n_wa_pattern;
+    const int n_pos = n_patches;
+    const int num_position_ids = n_pos * 4; // m-rope requires 4 dim per position
+
+    norm_type norm_t = proj_type == PROJECTOR_TYPE_QWEN25VL
+        ? NORM_TYPE_RMS // qwen 2.5 vl
+        : NORM_TYPE_NORMAL; // qwen 2 vl
+
+    int mrope_sections[4] = {d_head/4, d_head/4, d_head/4, d_head/4};
+
+    ggml_tensor * inp_raw = build_inp_raw();
+    ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
+
+    GGML_ASSERT(img.nx % (patch_size * 2) == 0);
+    GGML_ASSERT(img.ny % (patch_size * 2) == 0);
+
+    // second conv dimension (temporal pair); outputs are summed, then patches
+    // are regrouped so 2x2 spatial neighbors are adjacent in the sequence
+    {
+        auto inp_1 = ggml_conv_2d(ctx0, model.patch_embeddings_1, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
+        inp = ggml_add(ctx0, inp, inp_1);
+
+        inp = ggml_permute(ctx0, inp, 1, 2, 0, 3);  // [w, h, c, b] -> [c, w, h, b]
+        inp = ggml_cont_4d(
+            ctx0, inp,
+            n_embd * 2, n_patches_x / 2, n_patches_y, batch_size);
+        inp = ggml_reshape_4d(
+            ctx0, inp,
+            n_embd * 2, n_patches_x / 2, 2, batch_size * (n_patches_y / 2));
+        inp = ggml_permute(ctx0, inp, 0, 2, 1, 3);
+        inp = ggml_cont_3d(
+            ctx0, inp,
+            n_embd, n_patches_x * n_patches_y, batch_size);
+    }
+
+    ggml_tensor * inpL           = inp;
+    ggml_tensor * window_mask    = nullptr;
+    ggml_tensor * window_idx     = nullptr;
+    ggml_tensor * inv_window_idx = nullptr;
+
+    ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_position_ids);
+    ggml_set_name(positions, "positions");
+    ggml_set_input(positions);
+
+    // pre-layernorm
+    if (model.pre_ln_w) {
+        inpL = build_norm(inpL, model.pre_ln_w, model.pre_ln_b, norm_t, eps, -1);
+    }
+
+    if (use_window_attn) {
+        // handle window attention inputs: permutation that groups patches by
+        // window (inverse applied here, forward applied after the merger)
+        inv_window_idx = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos / 4);
+        ggml_set_name(inv_window_idx, "inv_window_idx");
+        ggml_set_input(inv_window_idx);
+        // mask for window attention
+        window_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_pos, n_pos);
+        ggml_set_name(window_mask, "window_mask");
+        ggml_set_input(window_mask);
+
+        // if flash attn is used, the mask must be cast to f16
+        if (flash_attn_type == CLIP_FLASH_ATTN_TYPE_ENABLED) {
+            window_mask = ggml_cast(ctx0, window_mask, GGML_TYPE_F16);
+        }
+
+        // inpL shape: [n_embd, n_patches_x * n_patches_y, batch_size]
+        GGML_ASSERT(batch_size == 1);
+        inpL = ggml_reshape_2d(ctx0, inpL, n_embd * 4, n_patches_x * n_patches_y * batch_size / 4);
+        inpL = ggml_get_rows(ctx0, inpL, inv_window_idx);
+        inpL = ggml_reshape_3d(ctx0, inpL, n_embd, n_patches_x * n_patches_y, batch_size);
+    }
+
+    // loop over layers
+    for (int il = 0; il < n_layer; il++) {
+        const auto & layer = model.layers[il];
+        // every n_wa_pattern-th layer uses full attention; the rest are windowed
+        const bool full_attn = use_window_attn ? (il + 1) % n_wa_pattern == 0 : true;
+
+        ggml_tensor * cur = inpL; // inpL = residual, cur = hidden_states
+
+        // layernorm1
+        cur = build_norm(cur, layer.ln_1_w, layer.ln_1_b, norm_t, eps, il);
+        cb(cur, "ln1", il);
+
+        // self-attention
+        {
+            ggml_tensor * Qcur = ggml_add(ctx0,
+                ggml_mul_mat(ctx0, layer.q_w, cur), layer.q_b);
+            ggml_tensor * Kcur = ggml_add(ctx0,
+                ggml_mul_mat(ctx0, layer.k_w, cur), layer.k_b);
+            ggml_tensor * Vcur = ggml_add(ctx0,
+                ggml_mul_mat(ctx0, layer.v_w, cur), layer.v_b);
+
+            Qcur = ggml_reshape_3d(ctx0, Qcur, d_head, n_head, n_patches);
+            Kcur = ggml_reshape_3d(ctx0, Kcur, d_head, n_head, n_patches);
+            Vcur = ggml_reshape_3d(ctx0, Vcur, d_head, n_head, n_patches);
+
+            cb(Qcur, "Qcur", il);
+            cb(Kcur, "Kcur", il);
+            cb(Vcur, "Vcur", il);
+
+            // apply M-RoPE
+            Qcur = ggml_rope_multi(
+                ctx0, Qcur, positions, nullptr,
+                d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, 10000, 1, 0, 1, 32, 1);
+            Kcur = ggml_rope_multi(
+                ctx0, Kcur, positions, nullptr,
+                d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, 10000, 1, 0, 1, 32, 1);
+
+            cb(Qcur, "Qcur_rope", il);
+            cb(Kcur, "Kcur_rope", il);
+
+            ggml_tensor * attn_mask = full_attn ? nullptr : window_mask;
+
+            cur = build_attn(layer.o_w, layer.o_b,
+                Qcur, Kcur, Vcur, attn_mask, kq_scale, il);
+            cb(cur, "attn_out", il);
+        }
+
+        // re-add the layer input, e.g., residual
+        cur = ggml_add(ctx0, cur, inpL);
+
+        inpL = cur; // inpL = residual, cur = hidden_states
+
+        cb(cur, "ffn_inp", il);
+
+        // layernorm2
+        cur = build_norm(cur, layer.ln_2_w, layer.ln_2_b, norm_t, eps, il);
+        cb(cur, "ffn_inp_normed", il);
+
+        // ffn
+        cur = build_ffn(cur,
+            layer.ff_up_w, layer.ff_up_b,
+            layer.ff_gate_w, layer.ff_gate_b,
+            layer.ff_down_w, layer.ff_down_b,
+            hparams.ffn_op, il);
+
+        cb(cur, "ffn_out", il);
+
+        // residual 2
+        cur = ggml_add(ctx0, inpL, cur);
+        cb(cur, "layer_out", il);
+
+        inpL = cur;
+    }
+
+    // post-layernorm
+    if (model.post_ln_w) {
+        inpL = build_norm(inpL, model.post_ln_w, model.post_ln_b, norm_t, eps, n_layer);
+    }
+
+    // multimodal projection: fold groups of 4 patches into one token
+    ggml_tensor * embeddings = inpL;
+    embeddings = ggml_reshape_3d(ctx0, embeddings, n_embd * 4, n_pos / 4, batch_size);
+    embeddings = build_ffn(embeddings,
+        model.mm_0_w, model.mm_0_b,
+        nullptr, nullptr,
+        model.mm_1_w, model.mm_1_b,
+        FFN_GELU,
+        -1);
+
+    if (use_window_attn) {
+        // undo the window permutation so tokens come out in raster order
+        window_idx = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos / 4);
+        ggml_set_name(window_idx, "window_idx");
+        ggml_set_input(window_idx);
+
+        // embeddings shape: [n_embd, n_patches_x * n_patches_y, batch_size]
+        GGML_ASSERT(batch_size == 1);
+        embeddings = ggml_reshape_2d(ctx0, embeddings, hparams.projection_dim, n_patches_x * n_patches_y / 4);
+        embeddings = ggml_get_rows(ctx0, embeddings, window_idx);
+        embeddings = ggml_reshape_3d(ctx0, embeddings, hparams.projection_dim, n_patches_x * n_patches_y / 4, batch_size);
+    }
+
+    // build the graph
+    ggml_build_forward_expand(gf, embeddings);
+
+    return gf;
+}
diff --git a/llama.cpp/tools/mtmd/models/qwen3vl.cpp b/llama.cpp/tools/mtmd/models/qwen3vl.cpp
new file mode 100644
index 0000000..5ecb10f
--- /dev/null
+++ b/llama.cpp/tools/mtmd/models/qwen3vl.cpp
@@ -0,0 +1,193 @@
+#include "models.h"
+
+// Qwen3-VL vision graph: dual patch conv with learned (resized) absolute
+// position embeddings plus M-RoPE, fused QKV projection per layer, and
+// "deepstack" side features from selected layers concatenated onto the final
+// projected embeddings along the feature dimension.
+ggml_cgraph * clip_graph_qwen3vl::build() {
+    GGML_ASSERT(model.patch_bias != nullptr);
+    GGML_ASSERT(model.position_embeddings != nullptr);
+    GGML_ASSERT(model.class_embedding == nullptr);
+
+    const int batch_size = 1;
+    const int n_pos = n_patches;
+    const int num_position_ids = n_pos * 4; // m-rope requires 4 dim per position
+
+    norm_type norm_t = NORM_TYPE_NORMAL;
+
+    int mrope_sections[4] = {d_head/4, d_head/4, d_head/4, d_head/4};
+
+    ggml_tensor * inp_raw = build_inp_raw();
+    ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
+
+    GGML_ASSERT(img.nx % (patch_size * 2) == 0);
+    GGML_ASSERT(img.ny % (patch_size * 2) == 0);
+
+    // second conv dimension (temporal pair); outputs are summed, then patches
+    // are regrouped so 2x2 spatial neighbors are adjacent in the sequence
+    {
+        auto inp_1 = ggml_conv_2d(ctx0, model.patch_embeddings_1, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
+        inp = ggml_add(ctx0, inp, inp_1);
+
+        inp = ggml_permute(ctx0, inp, 1, 2, 0, 3);  // [w, h, c, b] -> [c, w, h, b]
+        inp = ggml_cont_4d(
+            ctx0, inp,
+            n_embd * 2, n_patches_x / 2, n_patches_y, batch_size);
+        inp = ggml_reshape_4d(
+            ctx0, inp,
+            n_embd * 2, n_patches_x / 2, 2, batch_size * (n_patches_y / 2));
+        inp = ggml_permute(ctx0, inp, 0, 2, 1, 3);
+        inp = ggml_cont_3d(
+            ctx0, inp,
+            n_embd, n_patches_x * n_patches_y, batch_size);
+    }
+
+    // add patch bias
+    if (model.patch_bias != nullptr) {
+        inp = ggml_add(ctx0, inp, model.patch_bias);
+        cb(inp, "patch_bias", -1);
+    }
+
+    // calculate absolute position embedding and apply
+    // (reordered with the same 2x2 regrouping as the patches above so
+    // embedding rows line up with their patches)
+    ggml_tensor * learned_pos_embd = resize_position_embeddings();
+    learned_pos_embd = ggml_cont_4d(
+        ctx0, learned_pos_embd,
+        n_embd * 2, n_patches_x / 2, n_patches_y, batch_size);
+    learned_pos_embd = ggml_reshape_4d(
+        ctx0, learned_pos_embd,
+        n_embd * 2, n_patches_x / 2, 2, batch_size * (n_patches_y / 2));
+    learned_pos_embd = ggml_permute(ctx0, learned_pos_embd, 0, 2, 1, 3);
+    learned_pos_embd = ggml_cont_3d(
+        ctx0, learned_pos_embd,
+        n_embd, n_patches_x * n_patches_y, batch_size);
+    inp = ggml_add(ctx0, inp, learned_pos_embd);
+    cb(inp, "inp_pos_emb", -1);
+
+    ggml_tensor * inpL = inp;
+
+    ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_position_ids);
+    ggml_set_name(positions, "positions");
+    ggml_set_input(positions);
+
+    // pre-layernorm
+    if (model.pre_ln_w) {
+        inpL = build_norm(inpL, model.pre_ln_w, model.pre_ln_b, norm_t, eps, -1);
+    }
+
+    // deepstack features (stack along the feature dimension), [n_embd * len(deepstack_layers), n_patches_x * n_patches_y, batch_size]
+    ggml_tensor * deepstack_features = nullptr;
+    const int merge_factor = hparams.n_merge > 0 ? hparams.n_merge * hparams.n_merge : 4; // default 2x2=4 for qwen3vl
+
+    // loop over layers
+    for (int il = 0; il < n_layer; il++) {
+        auto & layer = model.layers[il];
+
+        ggml_tensor * cur = inpL; // inpL = residual, cur = hidden_states
+
+        // layernorm1
+        cur = build_norm(cur, layer.ln_1_w, layer.ln_1_b, norm_t, eps, il);
+        cb(cur, "ln1", il);
+
+        // self-attention: fused QKV projection, then strided views split out
+        // Q/K/V without copying
+        {
+            cur = ggml_mul_mat(ctx0, layer.qkv_w, cur);
+            cur = ggml_add(ctx0, cur, layer.qkv_b);
+
+            ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, d_head, n_head, n_pos,
+                /* nb1 */ ggml_row_size(cur->type, d_head),
+                /* nb2 */ cur->nb[1],
+                /* offset */ 0);
+
+            ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, d_head, n_head, n_pos,
+                /* nb1 */ ggml_row_size(cur->type, d_head),
+                /* nb2 */ cur->nb[1],
+                /* offset */ ggml_row_size(cur->type, n_embd));
+
+            ggml_tensor * Vcur = ggml_view_3d(ctx0, cur, d_head, n_head, n_pos,
+                /* nb1 */ ggml_row_size(cur->type, d_head),
+                /* nb2 */ cur->nb[1],
+                /* offset */ ggml_row_size(cur->type, 2 * n_embd));
+
+            cb(Qcur, "Qcur", il);
+            cb(Kcur, "Kcur", il);
+            cb(Vcur, "Vcur", il);
+
+            // apply M-RoPE
+            Qcur = ggml_rope_multi(
+                ctx0, Qcur, positions, nullptr,
+                d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, 10000, 1, 0, 1, 32, 1);
+            Kcur = ggml_rope_multi(
+                ctx0, Kcur, positions, nullptr,
+                d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, 10000, 1, 0, 1, 32, 1);
+
+            cb(Qcur, "Qcur_rope", il);
+            cb(Kcur, "Kcur_rope", il);
+
+            cur = build_attn(layer.o_w, layer.o_b,
+                Qcur, Kcur, Vcur, nullptr, kq_scale, il);
+            cb(cur, "attn_out", il);
+        }
+
+        // re-add the layer input, e.g., residual
+        cur = ggml_add(ctx0, cur, inpL);
+
+        inpL = cur; // inpL = residual, cur = hidden_states
+
+        cb(cur, "ffn_inp", il);
+
+        // layernorm2
+        cur = build_norm(cur, layer.ln_2_w, layer.ln_2_b, norm_t, eps, il);
+        cb(cur, "ffn_inp_normed", il);
+
+        // ffn
+        cur = build_ffn(cur,
+            layer.ff_up_w, layer.ff_up_b,
+            layer.ff_gate_w, layer.ff_gate_b,
+            layer.ff_down_w, layer.ff_down_b,
+            hparams.ffn_op, il);
+
+        cb(cur, "ffn_out", il);
+
+        // residual 2
+        cur = ggml_add(ctx0, inpL, cur);
+        cb(cur, "layer_out", il);
+
+        // deepstack: selected layers emit an extra merged+projected feature
+        // that is accumulated and appended to the final output
+        if (layer.has_deepstack()) {
+            ggml_tensor * feat = ggml_reshape_3d(ctx0, cur, n_embd * merge_factor, n_pos / merge_factor, batch_size);
+            feat = build_norm(feat, layer.deepstack_norm_w, layer.deepstack_norm_b, norm_t, eps, il);
+            feat = build_ffn(feat,
+                layer.deepstack_fc1_w, layer.deepstack_fc1_b,
+                nullptr, nullptr,
+                layer.deepstack_fc2_w, layer.deepstack_fc2_b,
+                ffn_op_type::FFN_GELU, il);
+
+            if(!deepstack_features) {
+                deepstack_features = feat;
+            } else {
+                // concat along the feature dimension
+                deepstack_features = ggml_concat(ctx0, deepstack_features, feat, 0);
+            }
+        }
+
+        inpL = cur;
+    }
+
+    // post-layernorm
+    if (model.post_ln_w) {
+        inpL = build_norm(inpL, model.post_ln_w, model.post_ln_b, norm_t, eps, n_layer);
+    }
+
+    // multimodal projection: fold groups of 4 patches into one token
+    ggml_tensor * embeddings = inpL;
+    embeddings = ggml_reshape_3d(ctx0, embeddings, n_embd * 4, n_pos / 4, batch_size);
+
+    embeddings = build_ffn(embeddings,
+        model.mm_0_w, model.mm_0_b,
+        nullptr, nullptr,
+        model.mm_1_w, model.mm_1_b,
+        ffn_op_type::FFN_GELU, -1);
+
+    if (deepstack_features) {
+        embeddings = ggml_concat(ctx0, embeddings, deepstack_features, 0);
+    } // concat along the feature dimension
+
+    // build the graph
+    ggml_build_forward_expand(gf, embeddings);
+
+    return gf;
+}
diff --git a/llama.cpp/tools/mtmd/models/siglip.cpp b/llama.cpp/tools/mtmd/models/siglip.cpp
new file mode 100644
index 0000000..b866a11
--- /dev/null
+++ b/llama.cpp/tools/mtmd/models/siglip.cpp
@@ -0,0 +1,86 @@
+#include "models.h"
+
+// Build the compute graph for a SigLIP-style vision tower followed by one of
+// several projector heads (GEMMA3, IDEFICS3, LFM2, JANUS_PRO).
+ggml_cgraph * clip_graph_siglip::build() {
+ ggml_tensor * inp = build_inp();
+
+ // LFM2 supports variable input resolutions, so its learned position
+ // embeddings are resized to the current grid; others use them as-is.
+ ggml_tensor * learned_pos_embd = model.position_embeddings;
+ if (proj_type == PROJECTOR_TYPE_LFM2) {
+ learned_pos_embd = resize_position_embeddings();
+ }
+
+ // shared ViT encoder; projector-specific handling follows
+ ggml_tensor * cur = build_vit(
+ inp, n_patches,
+ NORM_TYPE_NORMAL,
+ hparams.ffn_op,
+ learned_pos_embd,
+ nullptr);
+
+ if (proj_type == PROJECTOR_TYPE_GEMMA3) {
+ const int batch_size = 1;
+ // Gemma3 assumes a square patch grid (pool below relies on it)
+ GGML_ASSERT(n_patches_x == n_patches_y);
+ const int patches_per_image = n_patches_x;
+ const int kernel_size = hparams.n_merge;
+
+ // [n_embd, n_patches] -> [W, H, n_embd, batch] for 2D pooling
+ cur = ggml_transpose(ctx0, cur);
+ cur = ggml_cont_4d(ctx0, cur, patches_per_image, patches_per_image, n_embd, batch_size);
+
+ // doing a pool2d to reduce the number of output tokens
+ cur = ggml_pool_2d(ctx0, cur, GGML_OP_POOL_AVG, kernel_size, kernel_size, kernel_size, kernel_size, 0, 0);
+ cur = ggml_reshape_3d(ctx0, cur, cur->ne[0] * cur->ne[0], n_embd, batch_size);
+ cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur));
+
+ // apply norm before projection (RMS norm + learned scale)
+ cur = ggml_rms_norm(ctx0, cur, eps);
+ cur = ggml_mul(ctx0, cur, model.mm_soft_emb_norm_w);
+
+ // apply projection
+ cur = ggml_mul_mat(ctx0,
+ ggml_cont(ctx0, ggml_transpose(ctx0, model.mm_input_proj_w)),
+ cur);
+
+ } else if (proj_type == PROJECTOR_TYPE_IDEFICS3) {
+ // pixel_shuffle
+ // https://github.com/huggingface/transformers/blob/0a950e0bbe1ed58d5401a6b547af19f15f0c195e/src/transformers/models/idefics3/modeling_idefics3.py#L578
+ const int scale_factor = model.hparams.n_merge;
+ cur = build_patch_merge_permute(cur, scale_factor);
+ cur = ggml_mul_mat(ctx0, model.projection, cur);
+
+ } else if (proj_type == PROJECTOR_TYPE_LFM2) {
+ // pixel unshuffle block
+ const int scale_factor = model.hparams.n_merge;
+ cur = build_patch_merge_permute(cur, scale_factor);
+
+ // projection, in LFM2-VL input norm is optional
+ if (model.mm_input_norm_w) {
+ cur = ggml_norm(ctx0, cur, 1e-5); // default nn.LayerNorm
+ cur = ggml_mul(ctx0, cur, model.mm_input_norm_w);
+ }
+
+ if (model.mm_input_norm_b) {
+ cur = ggml_add(ctx0, cur, model.mm_input_norm_b);
+ }
+
+ // 2-layer MLP projector
+ cur = build_ffn(cur,
+ model.mm_1_w, model.mm_1_b,
+ nullptr, nullptr,
+ model.mm_2_w, model.mm_2_b,
+ FFN_GELU,
+ -1);
+
+ } else if (proj_type == PROJECTOR_TYPE_JANUS_PRO) {
+ // 2-layer MLP projector using the model's configured activation
+ cur = build_ffn(cur,
+ model.mm_0_w, model.mm_0_b,
+ nullptr, nullptr,
+ model.mm_1_w, model.mm_1_b,
+ hparams.ffn_op,
+ -1);
+
+ } else {
+ GGML_ABORT("SigLIP: Unsupported projector type");
+ }
+
+ // build the graph
+ ggml_build_forward_expand(gf, cur);
+
+ return gf;
+}
diff --git a/llama.cpp/tools/mtmd/models/whisper-enc.cpp b/llama.cpp/tools/mtmd/models/whisper-enc.cpp
new file mode 100644
index 0000000..2f2b127
--- /dev/null
+++ b/llama.cpp/tools/mtmd/models/whisper-enc.cpp
@@ -0,0 +1,115 @@
+#include "models.h"
+
+// Build the compute graph for a Whisper-style audio encoder (conv1d front-end
+// + transformer) followed by a projector (ULTRAVOX, QWEN2A, VOXTRAL,
+// MUSIC_FLAMINGO, GLMA).
+ggml_cgraph * clip_graph_whisper_enc::build() {
+ const int n_frames = img.nx;
+ // the second conv has stride 2, halving the time resolution
+ const int n_pos = n_frames / 2;
+ GGML_ASSERT(model.position_embeddings->ne[1] >= n_pos);
+
+ ggml_tensor * inp = build_inp_raw(1);
+
+ // conv1d block
+ {
+ // convolution + gelu
+ ggml_tensor * cur = ggml_conv_1d_ph(ctx0, model.conv1d_1_w, inp, 1, 1);
+ cur = ggml_add(ctx0, cur, model.conv1d_1_b);
+
+ cur = ggml_gelu_erf(ctx0, cur);
+
+ // stride-2 conv: downsamples time by 2 (matches n_pos above)
+ cur = ggml_conv_1d_ph(ctx0, model.conv1d_2_w, cur, 2, 1);
+ cur = ggml_add(ctx0, cur, model.conv1d_2_b);
+
+ cur = ggml_gelu_erf(ctx0, cur);
+ // transpose
+ inp = ggml_cont(ctx0, ggml_transpose(ctx0, cur));
+ cb(inp, "after_conv1d", -1);
+ }
+
+ // sanity check (only check one layer, but it should be the same for all)
+ GGML_ASSERT(model.layers[0].ln_1_w && model.layers[0].ln_1_b);
+ GGML_ASSERT(model.layers[0].ln_2_w && model.layers[0].ln_2_b);
+ GGML_ASSERT(model.layers[0].q_b);
+ GGML_ASSERT(model.layers[0].v_b);
+ GGML_ASSERT(!model.layers[0].k_b); // no bias for k
+
+ // take only the first n_pos rows of the learned position embeddings
+ ggml_tensor * pos_embd_selected = ggml_view_2d(
+ ctx0, model.position_embeddings,
+ model.position_embeddings->ne[0], n_pos,
+ model.position_embeddings->nb[1], 0
+ );
+ ggml_tensor * cur = build_vit(
+ inp, n_pos,
+ NORM_TYPE_NORMAL,
+ hparams.ffn_op,
+ pos_embd_selected,
+ nullptr);
+
+ cb(cur, "after_transformer", -1);
+
+ if (model.audio_has_stack_frames()) {
+ // StackAudioFrames
+ // https://huggingface.co/fixie-ai/ultravox-v0_5-llama-3_2-1b/blob/main/ultravox_model.py
+ cur = build_stack(cur, hparams.proj_stack_factor, n_embd);
+ cb(cur, "after_stacked", -1);
+ }
+
+ if (proj_type == PROJECTOR_TYPE_ULTRAVOX) {
+ // UltravoxProjector
+ // pre-norm
+ cur = ggml_rms_norm(ctx0, cur, 1e-6);
+ cur = ggml_mul(ctx0, cur, model.mm_norm_pre_w);
+
+ // ffn in
+ cur = ggml_mul_mat(ctx0, model.mm_1_w, cur);
+
+ // swiglu
+ // see SwiGLU in ultravox_model.py, the second half passed through is silu, not the first half
+ cur = ggml_swiglu_swapped(ctx0, cur);
+
+ // mid-norm
+ cur = ggml_rms_norm(ctx0, cur, 1e-6);
+ cur = ggml_mul(ctx0, cur, model.mm_norm_mid_w);
+
+ // ffn out
+ cur = ggml_mul_mat(ctx0, model.mm_2_w, cur);
+
+ } else if (proj_type == PROJECTOR_TYPE_QWEN2A) {
+ // projector: single linear layer
+ cur = ggml_mul_mat(ctx0, model.mm_fc_w, cur);
+ cur = ggml_add(ctx0, cur, model.mm_fc_b);
+
+ } else if (proj_type == PROJECTOR_TYPE_VOXTRAL) {
+ // projector
+ // NOTE(review): identical to the MUSIC_FLAMINGO branch below — confirm
+ // whether they are meant to diverge or could share one branch
+ cur = build_ffn(cur,
+ model.mm_1_w, model.mm_1_b,
+ nullptr, nullptr,
+ model.mm_2_w, model.mm_2_b,
+ FFN_GELU_ERF,
+ -1);
+
+ } else if (proj_type == PROJECTOR_TYPE_MUSIC_FLAMINGO) {
+ // projector
+ cur = build_ffn(cur,
+ model.mm_1_w, model.mm_1_b,
+ nullptr, nullptr,
+ model.mm_2_w, model.mm_2_b,
+ FFN_GELU_ERF,
+ -1);
+
+ } else if (proj_type == PROJECTOR_TYPE_GLMA) {
+ // LayerNorm -> frame stacking -> FFN, then wrap with begin/end-of-image
+ // (boi/eoi) embeddings along the token dimension
+ cur = ggml_norm(ctx0, cur, hparams.eps);
+ cur = ggml_mul(ctx0, cur, model.mm_norm_pre_w);
+ cur = ggml_add(ctx0, cur, model.mm_norm_pre_b);
+ cur = build_stack(cur, hparams.proj_stack_factor, n_embd);
+ // NOTE(review): layer index 0 is passed here while other projectors use -1 — confirm intentional
+ cur = build_ffn(cur, model.mm_1_w, model.mm_1_b, nullptr, nullptr, model.mm_2_w, model.mm_2_b, hparams.ffn_op, 0);
+ cur = ggml_concat(ctx0, model.mm_boi, cur, 1);
+ cur = ggml_concat(ctx0, cur, model.mm_eoi, 1);
+ } else {
+ GGML_ABORT("%s: unknown projector type", __func__);
+ }
+
+ cb(cur, "projected", -1);
+
+ ggml_build_forward_expand(gf, cur);
+
+ return gf;
+}
diff --git a/llama.cpp/tools/mtmd/models/youtuvl.cpp b/llama.cpp/tools/mtmd/models/youtuvl.cpp
new file mode 100644
index 0000000..ffbf2be
--- /dev/null
+++ b/llama.cpp/tools/mtmd/models/youtuvl.cpp
@@ -0,0 +1,179 @@
+#include "models.h"
+
+// Build the compute graph for the Youtu-VL vision encoder: linearized conv3d
+// patch embedding, optional windowed attention (Qwen2-VL style), M-RoPE
+// positions, and a 2x2 patch-merger MLP head.
+ggml_cgraph * clip_graph_youtuvl::build() {
+ GGML_ASSERT(model.class_embedding == nullptr);
+ const int batch_size = 1;
+ // window attention is active when some layers are marked as full-attention
+ const bool use_window_attn = !hparams.wa_layer_indexes.empty();
+ const int n_pos = n_patches;
+ const int num_position_ids = n_pos * 4; // 4 position ids per token for M-RoPE
+ const int m = 2; // spatial merge size (2x2 patch blocks)
+ const int Wp = n_patches_x;
+ const int Hp = n_patches_y;
+ const int Hm = Hp / m;
+ const int Wm = Wp / m;
+ norm_type norm_t = NORM_TYPE_NORMAL;
+
+ // M-RoPE: head dim split into 4 equal sections
+ int mrope_sections[4] = {d_head/4, d_head/4, d_head/4, d_head/4};
+
+ ggml_tensor * inp = build_inp_raw();
+
+ // change conv3d to linear
+ // reshape and permute to get patches, permute from (patch_size, m, Wm, patch_size, m, Hm, C) to (C, patch_size, patch_size, m, m, Wm, Hm)
+ {
+ inp = ggml_reshape_4d(
+ ctx0, inp,
+ Wm * m * patch_size, m * patch_size, Hm, 3);
+ inp = ggml_permute(ctx0, inp, 1, 2, 3, 0);
+ inp = ggml_cont_4d(
+ ctx0, inp,
+ m * patch_size * 3, Wm, m * patch_size, Hm);
+
+ inp = ggml_permute(ctx0, inp, 0, 2, 1, 3);
+ inp = ggml_cont_4d(
+ ctx0, inp,
+ m * patch_size * 3, patch_size, m, Hm * Wm);
+
+ inp = ggml_permute(ctx0, inp, 1, 0, 2, 3);
+ inp = ggml_cont_4d(
+ ctx0, inp,
+ patch_size, 3, patch_size, Hm * Wm * m * m);
+
+ inp = ggml_permute(ctx0, inp, 2, 0, 1, 3);
+ inp = ggml_cont_3d(
+ ctx0, inp,
+ 3*patch_size* patch_size, Hm * Wm * m * m, 1);
+ }
+ // patch embedding as a matmul over flattened patches
+ inp = ggml_mul_mat(ctx0, model.patch_embeddings_0, inp);
+
+ if (model.patch_bias) {
+ inp = ggml_add(ctx0, inp, model.patch_bias);
+ }
+
+ inp = ggml_reshape_2d(ctx0, inp, n_embd, n_patches);
+
+ ggml_tensor * inpL = inp;
+ ggml_tensor * window_mask = nullptr;
+ ggml_tensor * window_idx = nullptr;
+ ggml_tensor * inv_window_idx = nullptr;
+
+ // M-RoPE position ids, filled in by the caller at eval time
+ ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_position_ids);
+ ggml_set_name(positions, "positions");
+ ggml_set_input(positions);
+
+ // pre-layernorm
+ if (model.pre_ln_w) {
+ inpL = build_norm(inpL, model.pre_ln_w, model.pre_ln_b, norm_t, eps, -1);
+ }
+ if (use_window_attn) {
+ // permutation that groups tokens by attention window (input tensor)
+ inv_window_idx = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos / 4);
+ ggml_set_name(inv_window_idx, "inv_window_idx");
+ ggml_set_input(inv_window_idx);
+ // mask for window attention
+ window_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_pos, n_pos);
+ ggml_set_name(window_mask, "window_mask");
+ ggml_set_input(window_mask);
+
+ // if flash attn is used, we need to pad the mask and cast to f16
+ if (flash_attn_type == CLIP_FLASH_ATTN_TYPE_ENABLED) {
+ window_mask = ggml_cast(ctx0, window_mask, GGML_TYPE_F16);
+ }
+
+ // inpL shape: [n_embd, n_patches_x * n_patches_y, batch_size]
+ GGML_ASSERT(batch_size == 1);
+ // reorder tokens into window order (4 tokens, i.e. one 2x2 block, at a time)
+ inpL = ggml_reshape_2d(ctx0, inpL, n_embd * 4, n_patches_x * n_patches_y * batch_size / 4);
+ inpL = ggml_get_rows(ctx0, inpL, inv_window_idx);
+ inpL = ggml_reshape_3d(ctx0, inpL, n_embd, n_patches_x * n_patches_y, batch_size);
+ }
+
+ // loop over layers
+ for (int il = 0; il < n_layer; il++) {
+ const auto & layer = model.layers[il];
+ // layers listed in wa_layer_indexes use full attention; others are windowed
+ const bool full_attn = use_window_attn ? hparams.wa_layer_indexes.count(il) > 0 : true;
+
+ ggml_tensor * cur = inpL; // inpL = residual, cur = hidden_states
+
+ // layernorm1
+ cur = build_norm(cur, layer.ln_1_w, layer.ln_1_b, norm_t, eps, il);
+ // self-attention
+ {
+ ggml_tensor * Qcur = ggml_add(ctx0,
+ ggml_mul_mat(ctx0, layer.q_w, cur), layer.q_b);
+ ggml_tensor * Kcur = ggml_add(ctx0,
+ ggml_mul_mat(ctx0, layer.k_w, cur), layer.k_b);
+ ggml_tensor * Vcur = ggml_add(ctx0,
+ ggml_mul_mat(ctx0, layer.v_w, cur), layer.v_b);
+
+ Qcur = ggml_reshape_3d(ctx0, Qcur, d_head, n_head, n_patches);
+ Kcur = ggml_reshape_3d(ctx0, Kcur, d_head, n_head, n_patches);
+ Vcur = ggml_reshape_3d(ctx0, Vcur, d_head, n_head, n_patches);
+
+ // multi-section rotary embedding (vision variant) on Q and K
+ Qcur = ggml_rope_multi(
+ ctx0, Qcur, positions, nullptr,
+ d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, 10000, 1, 0, 1, 32, 1);
+ Kcur = ggml_rope_multi(
+ ctx0, Kcur, positions, nullptr,
+ d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, 10000, 1, 0, 1, 32, 1);
+
+ // full-attention layers get no mask; windowed layers get window_mask
+ ggml_tensor * attn_mask = full_attn ? nullptr : window_mask;
+
+ cur = build_attn(layer.o_w, layer.o_b,
+ Qcur, Kcur, Vcur, attn_mask, kq_scale, il);
+ }
+ // re-add the layer input, e.g., residual
+ cur = ggml_add(ctx0, cur, inpL);
+
+ inpL = cur; // inpL = residual, cur = hidden_states
+
+ // layernorm2
+ cur = build_norm(cur, layer.ln_2_w, layer.ln_2_b, norm_t, eps, il);
+
+ // ffn
+ cur = build_ffn(cur,
+ layer.ff_up_w, layer.ff_up_b,
+ nullptr, nullptr,
+ layer.ff_down_w, layer.ff_down_b,
+ hparams.ffn_op, il);
+
+ // residual 2
+ cur = ggml_add(ctx0, inpL, cur);
+
+ inpL = cur;
+ }
+
+ ggml_tensor * embeddings = inpL;
+ if (use_window_attn) {
+ // undo the window reordering so tokens are back in raster order
+ const int spatial_merge_unit = 4;
+ window_idx = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos / spatial_merge_unit);
+ ggml_set_name(window_idx, "window_idx");
+ ggml_set_input(window_idx);
+ GGML_ASSERT(batch_size == 1);
+ embeddings = ggml_reshape_2d(ctx0, embeddings, n_embd * spatial_merge_unit, n_patches / spatial_merge_unit);
+ embeddings = ggml_get_rows(ctx0, embeddings, window_idx);
+ embeddings = ggml_reshape_3d(ctx0, embeddings, n_embd, n_patches, batch_size);
+ cb(embeddings, "window_order_restored", -1);
+ }
+
+ // post-layernorm (part of Siglip2VisionTransformer, applied after encoder)
+ if (model.post_ln_w) {
+ embeddings = build_norm(embeddings, model.post_ln_w, model.post_ln_b, norm_t, eps, n_layer);
+ }
+
+ // Now apply merger (VLPatchMerger):
+ // 1. Apply RMS norm (ln_q in VLPatchMerger)
+ embeddings = build_norm(embeddings, model.mm_input_norm_w, nullptr, NORM_TYPE_RMS, 1e-6, -1);
+ cb(embeddings, "merger_normed", -1);
+
+ // 2. First reshape for spatial merge (merge 2x2 patches)
+ embeddings = ggml_reshape_3d(ctx0, embeddings, n_embd * 4, n_pos / 4, batch_size);
+ cb(embeddings, "merger_reshaped", -1);
+
+ // 3. 2-layer MLP projection to the LLM embedding space
+ embeddings = build_ffn(embeddings,
+ model.mm_0_w, model.mm_0_b,
+ nullptr, nullptr,
+ model.mm_1_w, model.mm_1_b,
+ FFN_GELU,
+ -1);
+ ggml_build_forward_expand(gf, embeddings);
+
+ return gf;
+}
diff --git a/llama.cpp/tools/mtmd/mtmd-audio.cpp b/llama.cpp/tools/mtmd/mtmd-audio.cpp
new file mode 100644
index 0000000..e8eef03
--- /dev/null
+++ b/llama.cpp/tools/mtmd/mtmd-audio.cpp
@@ -0,0 +1,730 @@
+#include "mtmd-audio.h"
+
+#define _USE_MATH_DEFINES // for M_PI
+#include <cmath>
+#include <cstdint>
+#include <cstring>
+#include <thread>
+#include <vector>
+#include <fstream>
+#include <algorithm>
+
+// some of the code here is copied from whisper.cpp
+
+constexpr bool DEBUG = false;
+
+// Precompute one full period of sin/cos sampled at n points. The DFT/FFT
+// kernels below index into these tables instead of calling sinf/cosf per
+// element; the table size must therefore be a multiple of every FFT size used.
+void mtmd_audio_cache::fill_sin_cos_table(int n) {
+ sin_vals.resize(n);
+ cos_vals.resize(n);
+ for (int i = 0; i < n; i++) {
+ double theta = (2 * M_PI * i) / n;
+ sin_vals[i] = sinf(theta); // stored as float; double theta keeps the argument accurate
+ cos_vals[i] = cosf(theta);
+ }
+}
+
+// Precompute a Hann window of the given length.
+// periodic=true  -> denominator is `length`   (DFT-even window, used for STFT)
+// periodic=false -> denominator is `length-1` (symmetric window)
+void mtmd_audio_cache::fill_hann_window(int length, bool periodic) {
+ hann_window.resize(length);
+ int offset = -1;
+ if (periodic) {
+ offset = 0;
+ }
+ for (int i = 0; i < length; i++) {
+ hann_window[i] = 0.5 * (1.0 - cosf((2.0 * M_PI * i) / (length + offset)));
+ }
+}
+
+// Build a triangular mel filterbank matrix [n_mel x (n_fft/2 + 1)] using the
+// Slaney mel scale (librosa's default), storing it into `filters`.
+// fmax <= 0 selects the Nyquist frequency (sample_rate / 2).
+void mtmd_audio_cache::fill_mel_filterbank_matrix(int n_mel,
+ int n_fft,
+ int sample_rate,
+ float fmin,
+ float fmax,
+ bool slaney_area_norm,
+ float scale) {
+ GGML_ASSERT(n_mel > 0 && n_fft > 1);
+ if (fmax <= 0.0f) {
+ fmax = 0.5f * sample_rate;
+ }
+
+ // Slaney scale (matches librosa default)
+ // linear below 1 kHz, logarithmic above
+ const double min_log_hz = 1000.0;
+ const double lin_slope = 3 / 200.;
+ const double min_log_mel = min_log_hz * lin_slope;
+ const double log_step = log(6.4) / 27.0;
+ auto hz_to_mel = [min_log_hz, lin_slope, log_step, min_log_mel](const double f_hz) -> double {
+ return (f_hz < min_log_hz) ? f_hz * lin_slope : min_log_mel + log(f_hz / min_log_hz) / log_step;
+ };
+ auto mel_to_hz = [min_log_hz, lin_slope, log_step, min_log_mel](const double m) -> double {
+ return (m < min_log_mel) ? m / lin_slope : min_log_hz * exp((m - min_log_mel) * log_step);
+ };
+
+ // infer N_fft from n_fft_bins
+ const double bin_hz_step = double(sample_rate) / double(n_fft);
+
+ // mel grid: n_mel + 2 edges (left edge, n_mel centers, right edge)
+ const double m_lo = hz_to_mel(fmin);
+ const double m_hi = hz_to_mel(fmax);
+ std::vector<double> mel_pts(n_mel + 2);
+ for (int i = 0; i < n_mel + 2; ++i) {
+ mel_pts[i] = m_lo + (m_hi - m_lo) * (double(i) / (n_mel + 1));
+ }
+
+ // convert to Hz
+ std::vector<double> hz_pts(n_mel + 2);
+ for (int i = 0; i < n_mel + 2; ++i) {
+ hz_pts[i] = mel_to_hz(mel_pts[i]);
+ }
+
+ const int n_fft_bins = n_fft / 2 + 1;
+
+ // filterbank: triangle per mel band, optional Slaney area normalization
+ std::vector<float> out(n_mel * n_fft_bins, 0);
+ for (int m = 0; m < n_mel; ++m) {
+ const double f_left = hz_pts[m];
+ const double f_center = hz_pts[m + 1];
+ const double f_right = hz_pts[m + 2];
+
+ // guard against zero-width triangles from degenerate mel points
+ const double denom_l = std::max(1e-30, f_center - f_left);
+ const double denom_r = std::max(1e-30, f_right - f_center);
+ const double enorm = slaney_area_norm ? (2.0 / std::max(1e-30, f_right - f_left)) : 1.0;
+
+ for (int k = 0; k < n_fft_bins; ++k) {
+ const double f = k * bin_hz_step;
+ double w = 0.0;
+ if (f >= f_left && f <= f_center) {
+ w = (f - f_left) / denom_l;
+ } else if (f > f_center && f <= f_right) {
+ w = (f_right - f) / denom_r;
+ }
+ out[size_t(m) * size_t(n_fft_bins) + size_t(k)] = float(w * enorm * scale);
+ }
+ }
+
+ filters.n_mel = n_mel;
+ filters.n_fft = n_fft;
+ filters.data = std::move(out);
+
+ if (DEBUG) { // debug
+ for (size_t i = 0; i < filters.data.size(); ++i) {
+ if (filters.data[i] != 0.0f) {
+ printf("filters[%zu] = %f\n", i, filters.data[i] * 1000.0f);
+ }
+ }
+ }
+}
+
+// Unified DFT implementation for both forward and inverse transforms
+// Template parameters:
+//   Inverse: false = DFT with exp(-2πi·k·n/N), no scaling
+//            true  = IDFT with exp(+2πi·k·n/N), scales by 1/N
+//   RealInput: true  = input is real-valued (stride 1), avoids imaginary computations
+//              false = input is complex-valued (interleaved real/imag, stride 2)
+// O(N^2); used as the fallback for odd N in fft_impl. Output is always
+// interleaved complex (2*N floats). cache.sin_vals.size() must be a multiple
+// of N so the precomputed table can be strided.
+template <bool Inverse, bool RealInput>
+static void dft_impl(const mtmd_audio_cache & cache, const float * in, int N, float * out) {
+ const int n_sin_cos_vals = cache.sin_vals.size();
+ const int sin_cos_step = n_sin_cos_vals / N;
+
+ constexpr float sign = Inverse ? 1.0f : -1.0f;
+ const float scale = Inverse ? (1.0f / N) : 1.0f;
+
+ for (int k = 0; k < N; k++) {
+ float re = 0;
+ float im = 0;
+
+ for (int n = 0; n < N; n++) {
+ // look up exp(sign*2πi·k·n/N) from the shared table
+ int idx = (k * n * sin_cos_step) % n_sin_cos_vals;
+ float cos_val = cache.cos_vals[idx];
+ float sin_val = cache.sin_vals[idx];
+
+ if constexpr (RealInput) {
+ // Real input: in_im = 0, simplifies to:
+ //   re += in_re * cos_val
+ //   im += sign * in_re * sin_val
+ float in_re = in[n];
+ re += in_re * cos_val;
+ im += sign * in_re * sin_val;
+ } else {
+ float in_re = in[n * 2 + 0];
+ float in_im = in[n * 2 + 1];
+ // (a + bi) * (cos + sign*i*sin) = (a*cos - sign*b*sin) + (sign*a*sin + b*cos)i
+ re += in_re * cos_val - sign * in_im * sin_val;
+ im += sign * in_re * sin_val + in_im * cos_val;
+ }
+ }
+
+ out[k * 2 + 0] = re * scale;
+ out[k * 2 + 1] = im * scale;
+ }
+}
+
+// Cooley-Tukey FFT/IFFT unified implementation
+// Template parameters:
+//   Inverse: false = FFT with exp(-2πi·k/N), no scaling
+//            true  = IFFT with exp(+2πi·k/N), scales by 0.5 at each level
+//   RealInput: true  = input is real-valued (stride 1)
+//              false = input is complex-valued (interleaved real/imag, stride 2)
+// Scratch-space contract: `in` is used as scratch for the even/odd split
+// (needs room past index N), and the recursion stores sub-FFT results past
+// out[2*N] — call sites allocate several times N (see fft_in/fft_out sizing
+// in log_mel_spectrogram_worker_thread and the istft constructor).
+template <bool Inverse, bool RealInput>
+static void fft_impl(const mtmd_audio_cache & cache, float * in, int N, float * out) {
+ const int n_sin_cos_vals = cache.sin_vals.size();
+
+ // base case: single element, output as complex
+ if (N == 1) {
+ out[0] = in[0];
+ if constexpr (RealInput) {
+ out[1] = 0.0f;
+ } else {
+ out[1] = in[1];
+ }
+ return;
+ }
+
+ const int half_N = N / 2;
+ if (N - half_N * 2 == 1) {
+ // Odd N: fall back to DFT
+ dft_impl<Inverse, RealInput>(cache, in, N, out);
+ return;
+ }
+
+ // Split into even and odd
+ if constexpr (RealInput) {
+ // Real input: stride is 1, copy only real values
+ float * even = in + N; // scratch region just past the input
+ for (int i = 0; i < half_N; ++i) {
+ even[i] = in[2 * i];
+ }
+ float * even_fft = out + 2 * N;
+ fft_impl<Inverse, true>(cache, even, half_N, even_fft);
+
+ // odd half reuses the same scratch once the even sub-FFT is done
+ float * odd = even;
+ for (int i = 0; i < half_N; ++i) {
+ odd[i] = in[2 * i + 1];
+ }
+ float * odd_fft = even_fft + N;
+ fft_impl<Inverse, true>(cache, odd, half_N, odd_fft);
+ } else {
+ // Complex input: stride is 2, copy complex pairs
+ float * even = in + N * 2;
+ for (int i = 0; i < half_N; ++i) {
+ even[i * 2 + 0] = in[2 * i * 2 + 0];
+ even[i * 2 + 1] = in[2 * i * 2 + 1];
+ }
+ float * even_fft = out + 2 * N;
+ fft_impl<Inverse, false>(cache, even, half_N, even_fft);
+
+ float * odd = even;
+ for (int i = 0; i < half_N; ++i) {
+ odd[i * 2 + 0] = in[(2 * i + 1) * 2 + 0];
+ odd[i * 2 + 1] = in[(2 * i + 1) * 2 + 1];
+ }
+ float * odd_fft = even_fft + N;
+ fft_impl<Inverse, false>(cache, odd, half_N, odd_fft);
+ }
+
+ float * even_fft = out + 2 * N;
+ float * odd_fft = even_fft + N;
+
+ const int sin_cos_step = n_sin_cos_vals / N;
+
+ constexpr float sign = Inverse ? 1.0f : -1.0f;
+ constexpr float scale = Inverse ? 0.5f : 1.0f; // 0.5 per level compounds to 1/N overall
+
+ // butterfly combine: X[k] = E[k] + w^k O[k], X[k+N/2] = E[k] - w^k O[k]
+ for (int k = 0; k < half_N; k++) {
+ int idx = k * sin_cos_step; // t = 2*M_PI*k/N
+ float re = cache.cos_vals[idx];
+ float im = sign * cache.sin_vals[idx];
+
+ float re_odd = odd_fft[2 * k + 0];
+ float im_odd = odd_fft[2 * k + 1];
+
+ out[2 * k + 0] = scale * (even_fft[2 * k + 0] + re * re_odd - im * im_odd);
+ out[2 * k + 1] = scale * (even_fft[2 * k + 1] + re * im_odd + im * re_odd);
+
+ out[2 * (k + half_N) + 0] = scale * (even_fft[2 * k + 0] - re * re_odd + im * im_odd);
+ out[2 * (k + half_N) + 1] = scale * (even_fft[2 * k + 1] - re * im_odd - im * re_odd);
+ }
+}
+
+// Forward FFT for real input (used by mel spectrogram).
+// `in` must have room for N extra scratch floats; `out` receives 2*N floats
+// (interleaved complex) and must also hold the recursion's scratch (see fft_impl).
+static void fft(const mtmd_audio_cache & cache, float * in, int N, float * out) {
+ fft_impl<false, true>(cache, in, N, out);
+}
+
+// Inverse FFT for complex input (used by the streaming ISTFT).
+// Same scratch-space requirements as fft_impl, with complex (2*N float) input.
+static void ifft(const mtmd_audio_cache & cache, float * in, int N, float * out) {
+ fft_impl<true, false>(cache, in, N, out);
+}
+
+// Parameters controlling one log-mel spectrogram computation; the defaults
+// match the Whisper-style pipeline, the conformer path overrides them.
+struct filter_params {
+ int32_t n_mel; // number of mel bands
+ int32_t n_fft_bins; // FFT bins = N_fft/2 + 1
+ int32_t hann_window_size; // analysis window length (may be < frame size)
+ int32_t hop_length; // frame step in samples
+ int32_t sample_rate;
+ bool center_padding = false; // true: reflect-free center pad; false: whisper-style 30s pad
+ float preemph = 0.f; // pre-emphasis coefficient; 0 disables
+ bool use_natural_log = false; // ln vs log10 for the mel energies
+ bool norm_per_feature = false; // per-mel-band mean/std normalization vs global clamping
+};
+
+// Compute the log-mel rows for frames i = ith, ith+n_threads, ... (strided
+// work split across threads). Writes into out.data; frames past the available
+// samples are filled with the log of silence.
+static void log_mel_spectrogram_worker_thread(int ith,
+ const float * hann,
+ const std::vector<float> & samples,
+ int n_samples,
+ int frame_size,
+ int frame_step,
+ int n_threads,
+ const filter_params & params,
+ const mtmd_audio_cache & cache,
+ mtmd_audio_mel & out) {
+ // fft_in has frame_size extra floats of scratch, fft_out holds the
+ // interleaved complex result plus recursion scratch (see fft_impl)
+ std::vector<float> fft_in(frame_size * 2, 0.0);
+ std::vector<float> fft_out(frame_size * 2 * 2 * 2);
+
+ int n_fft_bins = params.n_fft_bins;
+ int i = ith;
+
+ const auto & filters = cache.filters;
+
+ // make sure n_fft == 1 + (WHISPER_N_FFT / 2), bin_0 to bin_nyquist
+ GGML_ASSERT(n_fft_bins == 1 + (frame_size / 2));
+ GGML_ASSERT(cache.sin_vals.size() == cache.cos_vals.size());
+ // calculate FFT only when fft_in are not all zero
+ for (; i < std::min(n_samples / frame_step + 1, out.n_len); i += n_threads) {
+ const int offset = i * frame_step;
+
+ // apply Hann window (~10% faster)
+ for (int j = 0; j < std::min(frame_size, n_samples - offset); j++) {
+ fft_in[j] = hann[j] * samples[offset + j];
+ }
+
+ // fill the rest with zeros
+ if (n_samples - offset < frame_size) {
+ std::fill(fft_in.begin() + (n_samples - offset), fft_in.end(), 0.0);
+ }
+
+ // FFT
+ fft(cache, fft_in.data(), frame_size, fft_out.data());
+
+ // Calculate modulus^2 of complex numbers
+ // (in-place: writes fft_out[j] while reading fft_out[2j], fft_out[2j+1],
+ // which is safe because 2j >= j+1 for j >= 1)
+ // Use pow(fft_out[2 * j + 0], 2) + pow(fft_out[2 * j + 1], 2) causes inference quality problem? Interesting.
+ for (int j = 0; j < n_fft_bins; j++) {
+ fft_out[j] = (fft_out[2 * j + 0] * fft_out[2 * j + 0] + fft_out[2 * j + 1] * fft_out[2 * j + 1]);
+ }
+
+ // mel spectrogram: dot product of the power spectrum with each mel filter
+ for (int j = 0; j < out.n_mel; j++) {
+ double sum = 0.0;
+ // unroll loop (suggested by GH user @lunixbochs)
+ int k = 0;
+ for (k = 0; k < n_fft_bins - 3; k += 4) {
+ size_t idx = size_t(j) * size_t(n_fft_bins) + size_t(k);
+ sum +=
+ fft_out[k + 0] * filters.data[idx + 0] +
+ fft_out[k + 1] * filters.data[idx + 1] +
+ fft_out[k + 2] * filters.data[idx + 2] +
+ fft_out[k + 3] * filters.data[idx + 3];
+ }
+ // handle n_fft remainder
+ for (; k < n_fft_bins; k++) {
+ sum += fft_out[k] * filters.data[j * n_fft_bins + k];
+ }
+ // ln with a small epsilon (conformer path) or clamped log10 (whisper path)
+ sum = params.use_natural_log
+ ? log(sum + 5.960464477539063e-08)
+ : log10(std::max(sum, 1e-10));
+ out.data[j * out.n_len + i] = sum;
+ }
+ }
+
+ // Otherwise fft_out are all zero
+ double sum = params.use_natural_log ? log(1e-10) : log10(1e-10);
+ for (; i < out.n_len; i += n_threads) {
+ for (int j = 0; j < out.n_mel; j++) {
+ out.data[j * out.n_len + i] = sum;
+ }
+ }
+}
+
+// ref: https://github.com/openai/whisper/blob/main/whisper/audio.py#L110-L157
+// Compute a log-mel spectrogram into `out`. Padding, log base and
+// normalization are selected via `params` (whisper-style vs conformer-style).
+// Returns false on invalid/too-short input.
+static bool log_mel_spectrogram(
+ const float * samples,
+ const int n_samples_in,
+ const int n_threads,
+ const filter_params & params,
+ const mtmd_audio_cache & cache,
+ mtmd_audio_mel & out) {
+ //const int64_t t_start_us = ggml_time_us();
+
+ out.n_len_org = n_samples_in;
+ int n_samples = n_samples_in;
+
+ // Hann window
+ const float * hann = cache.hann_window.data();
+ // frame size is recovered from the bin count (n_fft_bins = frame_size/2 + 1)
+ const int frame_size = (params.n_fft_bins - 1) * 2;
+ const int frame_step = params.hop_length;
+
+ // Padding
+ std::vector<float> samples_padded;
+ if (params.center_padding) {
+ // symmetric zero padding of half a frame on both sides (conformer path)
+ const auto pad_amount = frame_size / 2;
+ samples_padded = std::vector<float>(n_samples + 2 * pad_amount, 0);
+ std::copy(samples, samples + n_samples, samples_padded.data() + pad_amount);
+ samples = samples_padded.data();
+ n_samples = samples_padded.size();
+ } else {
+ // existing padding logic (whisper path)
+ int64_t stage_1_pad = params.sample_rate * 30;
+ int64_t stage_2_pad = frame_size / 2;
+ samples_padded.resize(n_samples + stage_1_pad + stage_2_pad * 2);
+ std::copy(samples, samples + n_samples, samples_padded.begin() + stage_2_pad);
+ // pad 30 seconds of zeros at the end of audio (480,000 samples) + reflective pad 200 samples at the end of audio
+ std::fill(samples_padded.begin() + n_samples + stage_2_pad, samples_padded.begin() + n_samples + stage_1_pad + 2 * stage_2_pad, 0);
+ // reflective pad 200 samples at the beginning of audio
+ if (n_samples < stage_2_pad + 1) {
+ // TODO: Handle short audio differently or return error
+ return false;
+ }
+ std::reverse_copy(samples + 1, samples + 1 + stage_2_pad, samples_padded.begin());
+ }
+
+ // preemphasis: y[i] = x[i] - 0.97 * x[i-1], applied only to the unpadded span
+ // NOTE(review): params.preemph's value is ignored here; 0.97 is hardcoded — confirm intentional
+ if (params.preemph) {
+ const int pad_amount = frame_size / 2;
+ const float preemph = 0.97f;
+ float prev = samples_padded[pad_amount];
+ for (int i = pad_amount + 1; i + pad_amount < n_samples; ++i) {
+ float cur = samples_padded[i];
+ samples_padded[i] = cur - preemph * prev;
+ prev = cur;
+ }
+ }
+
+ // pad hann window if it's smaller than frame_size
+ // TODO: probably unnecessary here? (or better doing it in g_cache?)
+ std::vector<float> hann_window_padded;
+ if (params.hann_window_size < frame_size) {
+ hann_window_padded.resize(frame_size);
+ const int padding = (frame_size - params.hann_window_size) / 2;
+ std::copy(hann, hann + params.hann_window_size, &hann_window_padded[padding]);
+ hann = hann_window_padded.data();
+ }
+
+
+ out.n_mel = params.n_mel;
+ out.n_len = (n_samples - frame_size) / frame_step + 1;
+ // TODO: handle these checks better
+ if (out.n_mel > 0 && (unsigned long)out.n_len > SIZE_MAX / out.n_mel) {
+ LOG_ERR("%s: size overflow\n", __func__);
+ return false;
+ }
+ if (n_samples < frame_size) {
+ LOG_ERR("%s: not enough samples after padding\n", __func__);
+ return false;
+ }
+ out.data.resize(out.n_mel * out.n_len);
+
+ {
+ // strided work split: worker iw handles frames iw, iw+n_threads, ...
+ std::vector<std::thread> workers(n_threads - 1);
+ for (int iw = 0; iw < n_threads - 1; ++iw) {
+ workers[iw] =
+ std::thread(log_mel_spectrogram_worker_thread, iw + 1, hann, std::cref(samples_padded), n_samples,
+ frame_size, frame_step, n_threads, std::cref(params), std::cref(cache), std::ref(out));
+ }
+
+ // main thread
+ log_mel_spectrogram_worker_thread(0, hann, samples_padded, n_samples, frame_size, frame_step, n_threads, params,
+ cache, out);
+ for (int iw = 0; iw < n_threads - 1; ++iw) {
+ workers[iw].join();
+ }
+ }
+
+ // frames covering real (unpadded) audio only
+ const int effective_n_len = n_samples_in / frame_step;
+ if (params.norm_per_feature) {
+ // per-mel-band standardization over the real frames; padded tail zeroed
+ for (int i = 0; i < out.n_mel; i++) {
+ double mean = 0;
+ for (int j = 0; j < effective_n_len; ++j) {
+ mean += out.data[i * out.n_len + j];
+ }
+ mean /= effective_n_len;
+
+ double var = 0.0;
+ for (int j = 0; j < effective_n_len; ++j) {
+ const double value = out.data[i * out.n_len + j] - mean;
+ var += value * value;
+ }
+ var /= effective_n_len - 1; // unbiased
+ const double mstd = std::sqrt(var + 1e-5);
+
+ for (int j = 0; j < effective_n_len; ++j) {
+ auto &value = out.data[i * out.n_len + j];
+ value = (value - mean) / mstd;
+ }
+
+ // pad the rest with zeros
+ for (int j = effective_n_len; j < out.n_len; ++j) {
+ out.data[i * out.n_len + j] = 0.0;
+ }
+ }
+ } else {
+ // clamping and normalization (whisper-style: clamp to max-8, map to ~[0,1])
+ double mmax = -1e20;
+ for (int i = 0; i < out.n_mel*out.n_len; i++) {
+ if (out.data[i] > mmax) {
+ mmax = out.data[i];
+ }
+ }
+
+ mmax -= 8.0;
+
+ for (int i = 0; i < out.n_mel*out.n_len; i++) {
+ if (out.data[i] < mmax) {
+ out.data[i] = mmax;
+ }
+ out.data[i] = (out.data[i] + 4.0)/4.0;
+ }
+ }
+
+ // Dump log_mel_spectrogram
+ if (DEBUG) {
+ std::ofstream outFile("log_mel_spectrogram.json");
+ outFile << "[";
+ for (uint64_t i = 0; i < out.data.size() - 1; i++) {
+ outFile << out.data[i] << ", ";
+ }
+ outFile << out.data[out.data.size() - 1] << "]";
+ outFile.close();
+ }
+
+ return true;
+}
+
+//
+// mtmd_audio_preprocessor_whisper
+//
+
+// One-time setup of the trig tables, Hann window and mel filterbank from the
+// model hyperparameters. Must be called before preprocess(); not thread-safe.
+void mtmd_audio_preprocessor_whisper::initialize() {
+ cache.fill_sin_cos_table(hparams.audio_n_fft);
+ cache.fill_hann_window(hparams.audio_window_len, true);
+ cache.fill_mel_filterbank_matrix(hparams.n_mel_bins, hparams.audio_n_fft, hparams.audio_sample_rate);
+}
+
+// Convert raw PCM samples into one or more 3000-frame log-mel chunks
+// (whisper-style padding/normalization). Returns false on empty input or
+// spectrogram failure.
+bool mtmd_audio_preprocessor_whisper::preprocess(const float * samples,
+ size_t n_samples,
+ std::vector<mtmd_audio_mel> & output) {
+ if (n_samples == 0) {
+ // empty audio
+ return false;
+ }
+
+ std::vector<float> smpl;
+ // if input is too short, pad with zeros
+ // this is to avoid potential issues with stage1/2 padding in log_mel_spectrogram
+ // TODO: maybe handle this better
+ size_t min_samples = (size_t) hparams.audio_sample_rate * (hparams.audio_chunk_len + 1); // +1 second margin
+ if (n_samples < min_samples) {
+ smpl.resize(min_samples, 0.0f);
+ std::memcpy(smpl.data(), samples, n_samples * sizeof(float));
+ samples = smpl.data();
+ n_samples = smpl.size();
+ }
+
+ // whisper-style settings: 30s zero pad, log10, global clamp normalization
+ filter_params params;
+ params.n_mel = hparams.n_mel_bins;
+ params.n_fft_bins = 1 + (hparams.audio_n_fft / 2);
+ params.hann_window_size = hparams.audio_window_len;
+ params.hop_length = hparams.audio_hop_len;
+ params.sample_rate = hparams.audio_sample_rate;
+ params.center_padding = false;
+ params.preemph = 0.0f; // disabled
+ params.use_natural_log = false;
+ params.norm_per_feature = false;
+
+ // make sure the cache is initialized
+ GGML_ASSERT(!cache.sin_vals.empty());
+ GGML_ASSERT(!cache.cos_vals.empty());
+ GGML_ASSERT(!cache.filters.data.empty());
+
+ mtmd_audio_mel out_full;
+ bool ok = log_mel_spectrogram(samples, n_samples,
+ 4, // n_threads
+ params, cache, out_full);
+ if (!ok) {
+ return false;
+ }
+
+ // because the cgraph in clip.cpp only accepts 3000 frames each, we need to split the mel
+ // we always expect the mel to have 3000 silent frames at the end
+ if (DEBUG) {
+ printf("output: n_mel = %d, n_len = %d\n", out_full.n_mel, out_full.n_len);
+ }
+ const size_t frames_per_chunk = 3000;
+ // NOTE(review): strict '>' rejects exactly-3000-frame output; the 30s pad
+ // in log_mel_spectrogram should always make n_len larger — confirm
+ GGML_ASSERT((size_t) out_full.n_len > frames_per_chunk);
+ for (size_t off = 0; off < (size_t) out_full.n_len; off += frames_per_chunk) {
+ int n_len = std::min(frames_per_chunk, (size_t) out_full.n_len - off);
+ if ((size_t) n_len < frames_per_chunk) {
+ break; // last uncomplete chunk will always be a padded chunk, safe to ignore
+ }
+
+ // copy this chunk's frame columns for every mel row
+ mtmd_audio_mel out_chunk;
+ out_chunk.n_len = n_len;
+ out_chunk.n_mel = out_full.n_mel;
+ out_chunk.n_len_org = out_full.n_mel; // unused
+ out_chunk.data.reserve(out_chunk.n_mel * out_chunk.n_len);
+
+ for (int i = 0; i < out_full.n_mel; i++) {
+ auto src = out_full.data.begin() + i * out_full.n_len + off;
+ out_chunk.data.insert(out_chunk.data.end(), src, src + frames_per_chunk);
+ }
+
+ output.push_back(std::move(out_chunk));
+ }
+
+ return true;
+}
+
+//
+// mtmd_audio_preprocessor_conformer
+//
+
+// One-time setup for the conformer path; same cache contents as the whisper
+// preprocessor, behavior differences come from filter_params in preprocess().
+void mtmd_audio_preprocessor_conformer::initialize() {
+ cache.fill_sin_cos_table(hparams.audio_n_fft);
+ cache.fill_hann_window(hparams.audio_window_len, true);
+ cache.fill_mel_filterbank_matrix(hparams.n_mel_bins, hparams.audio_n_fft, hparams.audio_sample_rate);
+}
+
+// Convert raw PCM samples into a single log-mel spectrogram using
+// conformer-style settings (center padding, pre-emphasis, natural log,
+// per-feature normalization). No chunking, unlike the whisper preprocessor.
+bool mtmd_audio_preprocessor_conformer::preprocess(const float * samples,
+ size_t n_samples,
+ std::vector<mtmd_audio_mel> & output) {
+ // empty audio
+ if (n_samples == 0) {
+ return false;
+ }
+
+ filter_params params;
+ params.n_mel = hparams.n_mel_bins;
+ params.n_fft_bins = 1 + (hparams.audio_n_fft / 2);
+ params.hann_window_size = hparams.audio_window_len;
+ params.hop_length = hparams.audio_hop_len;
+ params.sample_rate = hparams.audio_sample_rate;
+ params.center_padding = true;
+ params.preemph = 0.97f;
+ params.use_natural_log = true;
+ params.norm_per_feature = true;
+
+ // make sure the cache is initialized
+ GGML_ASSERT(!cache.sin_vals.empty());
+ GGML_ASSERT(!cache.cos_vals.empty());
+ GGML_ASSERT(!cache.filters.data.empty());
+
+ mtmd_audio_mel out_full;
+ bool ok = log_mel_spectrogram(samples, n_samples,
+ 4, // n_threads
+ params, cache, out_full);
+ if (!ok) {
+ return false;
+ }
+
+ output.push_back(std::move(out_full));
+ return true;
+}
+
+//
+// mtmd_audio_streaming_istft implementation
+//
+
+// Streaming inverse-STFT with Hann-window overlap-add. One hop of samples is
+// produced per process_frame() call; the first (n_fft - hop)/2 samples are
+// dropped to undo the analysis-time center padding.
+mtmd_audio_streaming_istft::mtmd_audio_streaming_istft(int n_fft, int hop_length) :
+ n_fft(n_fft),
+ hop_length(hop_length),
+ n_fft_bins(n_fft / 2 + 1),
+ overlap_buffer(n_fft, 0.0f),
+ window_sum_buffer(n_fft, 0.0f),
+ padding_to_remove((n_fft - hop_length) / 2),
+ ifft_in(n_fft * 2 * 4, 0.0f), // extra space for recursive IFFT
+ ifft_out(n_fft * 2 * 4, 0.0f) {
+ cache.fill_sin_cos_table(n_fft);
+ cache.fill_hann_window(n_fft, true);
+}
+
+// Reset streaming state (overlap-add buffers and initial-padding counter) so a
+// new stream can be processed; the precomputed trig/window cache is kept.
+void mtmd_audio_streaming_istft::reset() {
+ std::fill(overlap_buffer.begin(), overlap_buffer.end(), 0.0f);
+ std::fill(window_sum_buffer.begin(), window_sum_buffer.end(), 0.0f);
+ padding_to_remove = (n_fft - hop_length) / 2;
+}
+
+// Consume one spectral frame (n_fft_bins interleaved complex values, the
+// non-negative frequencies) and emit up to hop_length time-domain samples via
+// windowed overlap-add. Early output is shortened while initial padding is
+// being discarded.
+std::vector<float> mtmd_audio_streaming_istft::process_frame(const float * frame_spectrum) {
+ std::vector<float> output(hop_length);
+
+ // copy frequencies
+ for (int j = 0; j < n_fft_bins; j++) {
+ ifft_in[j * 2 + 0] = frame_spectrum[j * 2 + 0];
+ ifft_in[j * 2 + 1] = frame_spectrum[j * 2 + 1];
+ }
+
+ // mirror negative frequencies (Hermitian symmetry of a real signal)
+ for (int j = 1; j < n_fft_bins - 1; j++) {
+ int mirror_idx = n_fft - j;
+ ifft_in[mirror_idx * 2 + 0] = ifft_in[j * 2 + 0];
+ ifft_in[mirror_idx * 2 + 1] = -ifft_in[j * 2 + 1]; // conjugate
+ }
+
+ ifft(cache, ifft_in.data(), n_fft, ifft_out.data());
+
+ // update window sum and overlap buffer
+ // (window_sum accumulates squared-window weights for COLA normalization)
+ for (int j = 0; j < n_fft; j++) {
+ window_sum_buffer[j] += cache.hann_window[j] * cache.hann_window[j];
+ overlap_buffer[j] += ifft_out[j * 2] * cache.hann_window[j];
+ }
+
+ // extract hop_length samples with normalization
+ for (int i = 0; i < hop_length; i++) {
+ if (window_sum_buffer[i] > 1e-8f) {
+ output[i] = overlap_buffer[i] / window_sum_buffer[i];
+ } else {
+ output[i] = overlap_buffer[i];
+ }
+ }
+
+ // shift buffers left by hop_length
+ std::copy(overlap_buffer.begin() + hop_length, overlap_buffer.end(), overlap_buffer.begin());
+ std::fill(overlap_buffer.end() - hop_length, overlap_buffer.end(), 0.0f);
+
+ std::copy(window_sum_buffer.begin() + hop_length, window_sum_buffer.end(), window_sum_buffer.begin());
+ std::fill(window_sum_buffer.end() - hop_length, window_sum_buffer.end(), 0.0f);
+
+ // Remove padding if needed (only affects the first frame or two)
+ int to_remove = std::min(padding_to_remove, (int) output.size());
+ padding_to_remove -= to_remove;
+ output.erase(output.begin(), output.begin() + to_remove);
+
+ return output;
+}
+
+// Drain the remaining (n_fft - hop_length) overlap-add samples after the last
+// frame has been processed. Call once at end of stream.
+std::vector<float> mtmd_audio_streaming_istft::flush() {
+ std::vector<float> output;
+
+ // Extract remaining samples from overlap buffer
+ // Continue until we've extracted all meaningful samples
+ int remaining = n_fft - hop_length;
+ while (remaining > 0) {
+ int chunk_size = std::min(remaining, hop_length);
+
+ // same normalized extraction as process_frame
+ for (int i = 0; i < chunk_size; i++) {
+ float sample;
+ if (window_sum_buffer[i] > 1e-8f) {
+ sample = overlap_buffer[i] / window_sum_buffer[i];
+ } else {
+ sample = overlap_buffer[i];
+ }
+ output.push_back(sample);
+ }
+
+ // Shift buffers
+ std::copy(overlap_buffer.begin() + chunk_size, overlap_buffer.end(), overlap_buffer.begin());
+ std::fill(overlap_buffer.end() - chunk_size, overlap_buffer.end(), 0.0f);
+
+ std::copy(window_sum_buffer.begin() + chunk_size, window_sum_buffer.end(), window_sum_buffer.begin());
+ std::fill(window_sum_buffer.end() - chunk_size, window_sum_buffer.end(), 0.0f);
+
+ remaining -= chunk_size;
+ }
+
+ return output;
+}
diff --git a/llama.cpp/tools/mtmd/mtmd-audio.h b/llama.cpp/tools/mtmd/mtmd-audio.h
new file mode 100644
index 0000000..016c739
--- /dev/null
+++ b/llama.cpp/tools/mtmd/mtmd-audio.h
@@ -0,0 +1,113 @@
+#pragma once
+
+#include "ggml.h"
+#include "clip-model.h"
+
+#include <cstdint>
+#include <vector>
+#include <string>
+
+#define MTMD_INTERNAL_HEADER
+
+struct mtmd_audio_mel {
+ int n_len;
+ int n_len_org;
+ int n_mel;
+
+ std::vector<float> data;
+};
+
+struct mtmd_audio_mel_filters {
+ int32_t n_mel;
+ int32_t n_fft;
+
+ std::vector<float> data;
+};
+
+// cache for audio processing, each processor instance owns its own cache
+struct mtmd_audio_cache {
+ std::vector<float> sin_vals;
+ std::vector<float> cos_vals;
+
+ std::vector<float> hann_window;
+
+ mtmd_audio_mel_filters filters;
+
+ void fill_sin_cos_table(int n);
+
+ void fill_hann_window(int length, bool periodic);
+
+ // Build mel filterbank matrix [n_mel × n_fft_bins] at runtime.
+ // n_fft_bins must be (N_fft / 2 + 1). Example: if N_fft=512 -> n_fft_bins=257.
+ void fill_mel_filterbank_matrix(int n_mel,
+ int n_fft,
+ int sample_rate, // e.g. 16000
+ float fmin = 0.0f, // e.g. 0.0
+ float fmax = -1.0f, // e.g. sr/2; pass -1 for auto
+ bool slaney_area_norm = true,
+ float scale = 1.0f // optional extra scaling
+ );
+};
+
+struct mtmd_audio_preprocessor {
+ const clip_hparams & hparams;
+
+ mtmd_audio_preprocessor(const clip_ctx * ctx): hparams(*clip_get_hparams(ctx)) {}
+
+ virtual ~mtmd_audio_preprocessor() = default;
+ virtual void initialize() = 0; // NOT thread-safe
+ virtual bool preprocess(const float * samples, size_t n_samples, std::vector<mtmd_audio_mel> & output) = 0;
+};
+
+struct mtmd_audio_preprocessor_whisper : mtmd_audio_preprocessor {
+ mtmd_audio_preprocessor_whisper(const clip_ctx * ctx) : mtmd_audio_preprocessor(ctx) {}
+ void initialize() override;
+ bool preprocess(const float * samples, size_t n_samples, std::vector<mtmd_audio_mel> & output) override;
+
+ private:
+ mtmd_audio_cache cache;
+};
+
+struct mtmd_audio_preprocessor_conformer : mtmd_audio_preprocessor {
+ mtmd_audio_preprocessor_conformer(const clip_ctx * ctx) : mtmd_audio_preprocessor(ctx) {}
+ void initialize() override;
+ bool preprocess(const float * samples, size_t n_samples, std::vector<mtmd_audio_mel> & output) override;
+
+ private:
+ mtmd_audio_cache cache;
+};
+
+//
+// streaming ISTFT - converts spectrogram frames back to audio one frame at a time
+//
+struct mtmd_audio_streaming_istft {
+ mtmd_audio_streaming_istft(int n_fft, int hop_length);
+
+ // reset streaming state
+ void reset();
+
+ // process a single STFT frame (streaming)
+ // frame_spectrum: [n_fft_bins x 2] interleaved real/imag
+ // returns: up to hop_length samples
+ std::vector<float> process_frame(const float * frame_spectrum);
+
+ // flush remaining samples at end of stream
+ std::vector<float> flush();
+
+ private:
+ int n_fft;
+ int hop_length;
+ int n_fft_bins;
+
+ // Own cache for output processing
+ mtmd_audio_cache cache;
+
+ // Streaming state
+ std::vector<float> overlap_buffer;
+ std::vector<float> window_sum_buffer;
+ int padding_to_remove;
+
+ // Working buffers for IFFT
+ std::vector<float> ifft_in;
+ std::vector<float> ifft_out;
+};
diff --git a/llama.cpp/tools/mtmd/mtmd-cli.cpp b/llama.cpp/tools/mtmd/mtmd-cli.cpp
new file mode 100644
index 0000000..054c7fa
--- /dev/null
+++ b/llama.cpp/tools/mtmd/mtmd-cli.cpp
@@ -0,0 +1,437 @@
+#include "arg.h"
+#include "debug.h"
+#include "log.h"
+#include "common.h"
+#include "sampling.h"
+#include "llama.h"
+#include "ggml.h"
+#include "console.h"
+#include "chat.h"
+#include "mtmd.h"
+#include "mtmd-helper.h"
+
+#include <vector>
+#include <limits.h>
+#include <cinttypes>
+
+#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
+#include <signal.h>
+#include <unistd.h>
+#elif defined (_WIN32)
+#define WIN32_LEAN_AND_MEAN
+#ifndef NOMINMAX
+#define NOMINMAX
+#endif
+#include <windows.h>
+#include <signal.h>
+#endif
+
+// volatile, because of signal being an interrupt
+static volatile bool g_is_generating = false;
+static volatile bool g_is_interrupted = false;
+
+/**
+ * Please note that this is NOT a production-ready stuff.
+ * It is a playground for trying multimodal support in llama.cpp.
+ * For contributors: please keep this code simple and easy to understand.
+ */
+
+static void show_additional_info(int /*argc*/, char ** argv) {
+ LOG(
+ "Experimental CLI for multimodal\n\n"
+ "Usage: %s [options] -m <model> --mmproj <mmproj> --image <image> --audio <audio> -p <prompt>\n\n"
+ " -m and --mmproj are required\n"
+ " -hf user/repo can replace both -m and --mmproj in most cases\n"
+ " --image, --audio and -p are optional, if NOT provided, the CLI will run in chat mode\n"
+ " to disable using GPU for mmproj model, add --no-mmproj-offload\n",
+ argv[0]
+ );
+}
+
+#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
+static void sigint_handler(int signo) {
+ if (signo == SIGINT) {
+ if (g_is_generating) {
+ g_is_generating = false;
+ } else {
+ console::cleanup();
+ if (g_is_interrupted) {
+ _exit(1);
+ }
+ g_is_interrupted = true;
+ }
+ }
+}
+#endif
+
+struct mtmd_cli_context {
+ mtmd::context_ptr ctx_vision;
+ common_init_result_ptr llama_init;
+
+ llama_model * model;
+ llama_context * lctx;
+ const llama_vocab * vocab;
+ common_sampler * smpl;
+ llama_batch batch;
+ int n_batch;
+
+ mtmd::bitmaps bitmaps;
+
+ // chat template
+ common_chat_templates_ptr tmpls;
+ std::vector<common_chat_msg> chat_history;
+ bool use_jinja = false;
+ // TODO: support for --system-prompt with /clear command
+
+ // support for legacy templates (models not having EOT token)
+ llama_tokens antiprompt_tokens;
+
+ int n_threads = 1;
+ llama_pos n_past = 0;
+
+ base_callback_data cb_data;
+
+ mtmd_cli_context(common_params & params) : llama_init(common_init_from_params(params)) {
+ model = llama_init->model();
+ lctx = llama_init->context();
+ vocab = llama_model_get_vocab(model);
+ smpl = common_sampler_init(model, params.sampling);
+ n_threads = params.cpuparams.n_threads;
+ batch = llama_batch_init(1, 0, 1); // batch for next token generation
+ n_batch = params.n_batch;
+
+ if (!model || !lctx) {
+ exit(1);
+ }
+
+ if (!llama_model_chat_template(model, nullptr) && params.chat_template.empty()) {
+ LOG_ERR("Model does not have chat template.\n");
+ LOG_ERR(" For old llava models, you may need to use '--chat-template vicuna'\n");
+ LOG_ERR(" For MobileVLM models, use '--chat-template deepseek'\n");
+ LOG_ERR(" For Mistral Small 3.1, use '--chat-template mistral-v7'\n");
+ exit(1);
+ }
+
+ tmpls = common_chat_templates_init(model, params.chat_template);
+ use_jinja = params.use_jinja;
+ chat_history.clear();
+ LOG_INF("%s: chat template example:\n%s\n", __func__, common_chat_format_example(tmpls.get(), params.use_jinja, params.default_template_kwargs).c_str());
+
+ init_vision_context(params);
+
+ // load antiprompt tokens for legacy templates
+ if (params.chat_template == "vicuna") {
+ antiprompt_tokens = common_tokenize(lctx, "ASSISTANT:", false, true);
+ } else if (params.chat_template == "deepseek") {
+ antiprompt_tokens = common_tokenize(lctx, "###", false, true);
+ }
+ }
+
+ ~mtmd_cli_context() {
+ llama_batch_free(batch);
+ common_sampler_free(smpl);
+ }
+
+ void init_vision_context(common_params & params) {
+ const char * clip_path = params.mmproj.path.c_str();
+ mtmd_context_params mparams = mtmd_context_params_default();
+ mparams.use_gpu = params.mmproj_use_gpu;
+ mparams.print_timings = true;
+ mparams.n_threads = params.cpuparams.n_threads;
+ mparams.flash_attn_type = params.flash_attn_type;
+ mparams.warmup = params.warmup;
+ mparams.image_min_tokens = params.image_min_tokens;
+ mparams.image_max_tokens = params.image_max_tokens;
+ if (std::getenv("MTMD_DEBUG_GRAPH") != nullptr) {
+ mparams.cb_eval_user_data = &cb_data;
+ mparams.cb_eval = common_debug_cb_eval<false>;
+ }
+ ctx_vision.reset(mtmd_init_from_file(clip_path, model, mparams));
+ if (!ctx_vision.get()) {
+ LOG_ERR("Failed to load vision model from %s\n", clip_path);
+ exit(1);
+ }
+ }
+
+ bool check_antiprompt(const llama_tokens & generated_tokens) {
+ if (antiprompt_tokens.empty() || generated_tokens.size() < antiprompt_tokens.size()) {
+ return false;
+ }
+ return std::equal(
+ generated_tokens.end() - antiprompt_tokens.size(),
+ generated_tokens.end(),
+ antiprompt_tokens.begin()
+ );
+ }
+
+ bool load_media(const std::string & fname) {
+ mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_file(ctx_vision.get(), fname.c_str()));
+ if (!bmp.ptr) {
+ return false;
+ }
+ bitmaps.entries.push_back(std::move(bmp));
+ return true;
+ }
+};
+
+static int generate_response(mtmd_cli_context & ctx, int n_predict) {
+ llama_tokens generated_tokens;
+ for (int i = 0; i < n_predict; i++) {
+ if (i > n_predict || !g_is_generating || g_is_interrupted) {
+ LOG("\n");
+ break;
+ }
+
+ llama_token token_id = common_sampler_sample(ctx.smpl, ctx.lctx, -1);
+ generated_tokens.push_back(token_id);
+ common_sampler_accept(ctx.smpl, token_id, true);
+
+ if (llama_vocab_is_eog(ctx.vocab, token_id) || ctx.check_antiprompt(generated_tokens)) {
+ LOG("\n");
+ break; // end of generation
+ }
+
+ LOG("%s", common_token_to_piece(ctx.lctx, token_id).c_str());
+ fflush(stdout);
+
+ if (g_is_interrupted) {
+ LOG("\n");
+ break;
+ }
+
+ // eval the token
+ common_batch_clear(ctx.batch);
+ common_batch_add(ctx.batch, token_id, ctx.n_past++, {0}, true);
+ if (llama_decode(ctx.lctx, ctx.batch)) {
+ LOG_ERR("failed to decode token\n");
+ return 1;
+ }
+ }
+
+ std::string generated_text = common_detokenize(ctx.lctx, generated_tokens);
+ common_chat_msg msg;
+ msg.role = "assistant";
+ msg.content = generated_text;
+ ctx.chat_history.push_back(std::move(msg));
+
+ return 0;
+}
+
+static std::string chat_add_and_format(mtmd_cli_context & ctx, common_chat_msg & new_msg) {
+ LOG_DBG("chat_add_and_format: new_msg.role='%s', new_msg.content='%s'\n",
+ new_msg.role.c_str(), new_msg.content.c_str());
+ auto formatted = common_chat_format_single(ctx.tmpls.get(), ctx.chat_history,
+ new_msg, new_msg.role == "user",
+ ctx.use_jinja);
+ ctx.chat_history.push_back(new_msg);
+ return formatted;
+}
+
+static int eval_message(mtmd_cli_context & ctx, common_chat_msg & msg) {
+ bool add_bos = ctx.chat_history.empty();
+ auto formatted_chat = chat_add_and_format(ctx, msg);
+ LOG_DBG("formatted_chat.prompt: %s\n", formatted_chat.c_str());
+
+ mtmd_input_text text;
+ text.text = formatted_chat.c_str();
+ text.add_special = add_bos;
+ text.parse_special = true;
+
+ if (g_is_interrupted) return 0;
+
+ mtmd::input_chunks chunks(mtmd_input_chunks_init());
+ auto bitmaps_c_ptr = ctx.bitmaps.c_ptr();
+ int32_t res = mtmd_tokenize(ctx.ctx_vision.get(),
+ chunks.ptr.get(), // output
+ &text, // text
+ bitmaps_c_ptr.data(),
+ bitmaps_c_ptr.size());
+ if (res != 0) {
+ LOG_ERR("Unable to tokenize prompt, res = %d\n", res);
+ return 1;
+ }
+
+ ctx.bitmaps.entries.clear();
+
+ llama_pos new_n_past;
+ if (mtmd_helper_eval_chunks(ctx.ctx_vision.get(),
+ ctx.lctx, // lctx
+ chunks.ptr.get(), // chunks
+ ctx.n_past, // n_past
+ 0, // seq_id
+ ctx.n_batch, // n_batch
+ true, // logits_last
+ &new_n_past)) {
+ LOG_ERR("Unable to eval prompt\n");
+ return 1;
+ }
+
+ ctx.n_past = new_n_past;
+
+ LOG("\n");
+
+ return 0;
+}
+
+int main(int argc, char ** argv) {
+ ggml_time_init();
+
+ common_params params;
+
+ if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_MTMD, show_additional_info)) {
+ return 1;
+ }
+
+ common_init();
+ mtmd_helper_log_set(common_log_default_callback, nullptr);
+
+ if (params.mmproj.path.empty()) {
+ show_additional_info(argc, argv);
+ LOG_ERR("ERR: Missing --mmproj argument\n");
+ return 1;
+ }
+
+ mtmd_cli_context ctx(params);
+ LOG_INF("%s: loading model: %s\n", __func__, params.model.path.c_str());
+
+ bool is_single_turn = !params.prompt.empty() && !params.image.empty();
+
+ int n_predict = params.n_predict < 0 ? INT_MAX : params.n_predict;
+
+ // Ctrl+C handling
+ {
+#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
+ struct sigaction sigint_action;
+ sigint_action.sa_handler = sigint_handler;
+ sigemptyset (&sigint_action.sa_mask);
+ sigint_action.sa_flags = 0;
+ sigaction(SIGINT, &sigint_action, NULL);
+#elif defined (_WIN32)
+ auto console_ctrl_handler = +[](DWORD ctrl_type) -> BOOL {
+ return (ctrl_type == CTRL_C_EVENT) ? (sigint_handler(SIGINT), true) : false;
+ };
+ SetConsoleCtrlHandler(reinterpret_cast<PHANDLER_ROUTINE>(console_ctrl_handler), true);
+#endif
+ }
+
+ if (g_is_interrupted) return 130;
+
+ auto eval_system_prompt_if_present = [&] {
+ if (params.system_prompt.empty()) {
+ return 0;
+ }
+
+ common_chat_msg msg;
+ msg.role = "system";
+ msg.content = params.system_prompt;
+ return eval_message(ctx, msg);
+ };
+
+ LOG_WRN("WARN: This is an experimental CLI for testing multimodal capability.\n");
+ LOG_WRN(" For normal use cases, please use the standard llama-cli\n");
+
+ if (eval_system_prompt_if_present()) {
+ return 1;
+ }
+
+ if (is_single_turn) {
+ g_is_generating = true;
+ if (params.prompt.find(mtmd_default_marker()) == std::string::npos) {
+ for (size_t i = 0; i < params.image.size(); i++) {
+ // most models require the marker before each image
+ // ref: https://github.com/ggml-org/llama.cpp/pull/17616
+ params.prompt = mtmd_default_marker() + params.prompt;
+ }
+ }
+
+ common_chat_msg msg;
+ msg.role = "user";
+ msg.content = params.prompt;
+ for (const auto & image : params.image) {
+ if (!ctx.load_media(image)) {
+ return 1; // error is already printed by libmtmd
+ }
+ }
+ if (eval_message(ctx, msg)) {
+ return 1;
+ }
+ if (!g_is_interrupted && generate_response(ctx, n_predict)) {
+ return 1;
+ }
+
+ } else {
+ LOG("\n Running in chat mode, available commands:");
+ if (mtmd_support_vision(ctx.ctx_vision.get())) {
+ LOG("\n /image <path> load an image");
+ }
+ if (mtmd_support_audio(ctx.ctx_vision.get())) {
+ LOG("\n /audio <path> load an audio");
+ }
+ LOG("\n /clear clear the chat history");
+ LOG("\n /quit or /exit exit the program");
+ LOG("\n");
+
+ std::string content;
+
+ while (!g_is_interrupted) {
+ g_is_generating = false;
+ LOG("\n> ");
+ console::set_display(DISPLAY_TYPE_USER_INPUT);
+ std::string line;
+ console::readline(line, false);
+ if (g_is_interrupted) break;
+ console::set_display(DISPLAY_TYPE_RESET);
+ line = string_strip(line);
+ if (line.empty()) {
+ continue;
+ }
+ if (line == "/quit" || line == "/exit") {
+ break;
+ }
+ if (line == "/clear") {
+ ctx.n_past = 0;
+ ctx.chat_history.clear();
+ llama_memory_clear(llama_get_memory(ctx.lctx), true);
+ if (eval_system_prompt_if_present()) {
+ return 1;
+ }
+ LOG("Chat history cleared\n\n");
+ continue;
+ }
+ g_is_generating = true;
+ bool is_image = line == "/image" || line.find("/image ") == 0;
+ bool is_audio = line == "/audio" || line.find("/audio ") == 0;
+ if (is_image || is_audio) {
+ if (line.size() < 8) {
+ LOG_ERR("ERR: Missing media filename\n");
+ continue;
+ }
+ std::string media_path = line.substr(7);
+ if (ctx.load_media(media_path)) {
+ LOG("%s %s loaded\n", media_path.c_str(), is_image ? "image" : "audio");
+ content += mtmd_default_marker();
+ }
+ // else, error is already printed by libmtmd
+ continue;
+ } else {
+ content += line;
+ }
+ common_chat_msg msg;
+ msg.role = "user";
+ msg.content = content;
+ int ret = eval_message(ctx, msg);
+ if (ret) {
+ return 1;
+ }
+ if (g_is_interrupted) break;
+ if (generate_response(ctx, n_predict)) {
+ return 1;
+ }
+ content.clear();
+ }
+ }
+ if (g_is_interrupted) LOG("\nInterrupted by user\n");
+ LOG("\n\n");
+ llama_perf_context_print(ctx.lctx);
+ return g_is_interrupted ? 130 : 0;
+}
diff --git a/llama.cpp/tools/mtmd/mtmd-helper.cpp b/llama.cpp/tools/mtmd/mtmd-helper.cpp
new file mode 100644
index 0000000..902a4b4
--- /dev/null
+++ b/llama.cpp/tools/mtmd/mtmd-helper.cpp
@@ -0,0 +1,521 @@
+// fix problem with std::min and std::max
+#if defined(_WIN32)
+#define WIN32_LEAN_AND_MEAN
+#ifndef NOMINMAX
+# define NOMINMAX
+#endif
+#include <windows.h>
+#endif
+
+#include "mtmd.h"
+#include "mtmd-helper.h"
+#include "llama.h"
+
+#include <algorithm>
+#include <cinttypes>
+#include <vector>
+
+//#define MTMD_AUDIO_DEBUG
+
+#define MINIAUDIO_IMPLEMENTATION
+#ifndef MTMD_AUDIO_DEBUG
+# define MA_NO_ENCODING
+#endif
+#define MA_NO_DEVICE_IO
+#define MA_NO_RESOURCE_MANAGER
+#define MA_NO_NODE_GRAPH
+#define MA_NO_ENGINE
+#define MA_NO_GENERATION
+#define MA_API static
+#include "miniaudio/miniaudio.h"
+
+#define STB_IMAGE_IMPLEMENTATION
+#include "stb/stb_image.h"
+
+#ifdef MTMD_INTERNAL_HEADER
+#error "mtmd-helper is a public library outside of mtmd. it must not include internal headers"
+#endif
+
+//
+// internal logging functions
+//
+
+struct mtmd_helper_logger {
+ ggml_log_callback default_callback = [](ggml_log_level level, const char * text, void * user_data) {
+ (void) level;
+ (void) user_data;
+ fputs(text, stderr);
+ fflush(stderr);
+ };
+
+ ggml_log_callback log_callback = default_callback;
+ void * log_callback_user_data;
+
+ void log_v(enum ggml_log_level level, const char * format, va_list args) {
+ if (format == NULL) {
+ return;
+ }
+ va_list args_copy;
+ va_copy(args_copy, args);
+ char buffer[128];
+ int len = vsnprintf(buffer, 128, format, args);
+ if (len < 128) {
+ log_callback(level, buffer, log_callback_user_data);
+ } else {
+ char * buffer2 = (char *) calloc(len + 1, sizeof(char));
+ vsnprintf(buffer2, len + 1, format, args_copy);
+ buffer2[len] = 0;
+ log_callback(level, buffer2, log_callback_user_data);
+ free(buffer2);
+ }
+ va_end(args_copy);
+ }
+
+ void log(enum ggml_log_level level, const char * format, ...) {
+ va_list args;
+ va_start(args, format);
+ log_v(level, format, args);
+ va_end(args);
+ }
+} g_logger;
+
+#define LOG_INF(...) g_logger.log(GGML_LOG_LEVEL_INFO, __VA_ARGS__)
+#define LOG_WRN(...) g_logger.log(GGML_LOG_LEVEL_WARN, __VA_ARGS__)
+#define LOG_ERR(...) g_logger.log(GGML_LOG_LEVEL_ERROR, __VA_ARGS__)
+
+void mtmd_helper_log_set(ggml_log_callback log_callback, void * user_data) {
+ if (log_callback == nullptr) {
+ log_callback = g_logger.default_callback;
+ }
+ g_logger.log_callback = log_callback;
+ g_logger.log_callback_user_data = user_data;
+ mtmd_log_set(log_callback, user_data);
+}
+
+//
+// helper functions
+//
+
+size_t mtmd_helper_get_n_tokens(const mtmd_input_chunks * chunks) {
+ size_t n_tokens = 0;
+ for (size_t i = 0; i < mtmd_input_chunks_size(chunks); i++) {
+ auto chunk = mtmd_input_chunks_get(chunks, i);
+ n_tokens += mtmd_input_chunk_get_n_tokens(chunk);
+ }
+ return n_tokens;
+}
+
+llama_pos mtmd_helper_get_n_pos(const mtmd_input_chunks * chunks) {
+ llama_pos n_pos = 0;
+ for (size_t i = 0; i < mtmd_input_chunks_size(chunks); i++) {
+ auto chunk = mtmd_input_chunks_get(chunks, i);
+ n_pos += mtmd_input_chunk_get_n_pos(chunk);
+ }
+ return n_pos;
+}
+
+// helper struct to make working with embd batch easier
+// note: this will be removed after llama_batch_ext refactoring
+struct decode_embd_batch {
+ int n_pos_per_embd;
+ int n_mmproj_embd;
+ std::vector<llama_pos> pos;
+ std::vector<llama_pos> pos_view; // used by mrope
+ std::vector<int32_t> n_seq_id;
+ std::vector<llama_seq_id> seq_id_0;
+ std::vector<llama_seq_id *> seq_ids;
+ std::vector<int8_t> logits;
+ llama_batch batch;
+ decode_embd_batch(float * embd, int32_t n_tokens, int n_pos_per_embd, int n_mmproj_embd) : n_pos_per_embd(n_pos_per_embd), n_mmproj_embd(n_mmproj_embd) {
+ pos .resize(n_tokens * n_pos_per_embd);
+ n_seq_id.resize(n_tokens);
+ seq_ids .resize(n_tokens + 1);
+ logits .resize(n_tokens);
+ seq_id_0.resize(1);
+ seq_ids [n_tokens] = nullptr;
+ batch = {
+ /*n_tokens =*/ n_tokens,
+ /*tokens =*/ nullptr,
+ /*embd =*/ embd,
+ /*pos =*/ pos.data(),
+ /*n_seq_id =*/ n_seq_id.data(),
+ /*seq_id =*/ seq_ids.data(),
+ /*logits =*/ logits.data(),
+ };
+ }
+
+ void set_position_normal(llama_pos pos_0, llama_seq_id seq_id) {
+ seq_id_0[0] = seq_id;
+ for (int i = 0; i < batch.n_tokens; i++) {
+ batch.pos [i] = pos_0 + i;
+ batch.n_seq_id[i] = 1;
+ batch.seq_id [i] = seq_id_0.data();
+ batch.logits [i] = false;
+ }
+ }
+
+ // M-RoPE for image
+ void set_position_mrope_2d(llama_pos pos_0, int nx, int ny, llama_seq_id seq_id) {
+ GGML_ASSERT(n_pos_per_embd == 4);
+ seq_id_0[0] = seq_id;
+ for (int y = 0; y < ny; y++) {
+ for (int x = 0; x < nx; x++) {
+ int i = y * nx + x;
+ pos[i ] = pos_0;
+ pos[i + batch.n_tokens ] = pos_0 + y;
+ pos[i + batch.n_tokens * 2] = pos_0 + x;
+ pos[i + batch.n_tokens * 3] = 0; // last pos dim is unused
+ }
+ }
+ for (int i = 0; i < batch.n_tokens; i++) {
+ batch.n_seq_id[i] = 1;
+ batch.seq_id [i] = seq_id_0.data();
+ batch.logits [i] = false;
+ }
+ }
+
+ // M-RoPE for audio
+ void set_position_mrope_1d(llama_pos pos_0, llama_seq_id seq_id) {
+ GGML_ASSERT(n_pos_per_embd == 4);
+ seq_id_0[0] = seq_id;
+ for (int i = 0; i < batch.n_tokens; i++) {
+ pos[i ] = pos_0 + i;
+ pos[i + batch.n_tokens ] = pos_0 + i;
+ pos[i + batch.n_tokens * 2] = pos_0 + i;
+ pos[i + batch.n_tokens * 3] = 0; // last pos dim is unused
+ }
+ for (int i = 0; i < batch.n_tokens; i++) {
+ batch.n_seq_id[i] = 1;
+ batch.seq_id [i] = seq_id_0.data();
+ batch.logits [i] = false;
+ }
+ }
+
+ llama_batch get_view(int offset, int n_tokens) {
+ llama_pos * pos_ptr;
+ pos_view.clear();
+ pos_view.reserve(n_tokens * n_pos_per_embd);
+ if (n_pos_per_embd > 1) {
+ // mrope
+ // for example, with layout of src: 1234...1234...1234...1234...
+ // offset 2 will give us dst: 34...34...34...34...
+ for (int i = 0; i < n_pos_per_embd; i++) {
+ // assume n_tokens is less than or equal to batch.n_tokens
+ // batch.n_tokens is number of **total** tokens
+ // n_tokens is number of viewed token
+ size_t src_idx = i * batch.n_tokens + offset;
+ pos_view.insert(pos_view.end(),
+ pos.data() + src_idx,
+ pos.data() + src_idx + n_tokens);
+ }
+ pos_ptr = pos_view.data();
+ } else {
+ // normal
+ pos_ptr = pos.data() + offset;
+ }
+ return {
+ /*n_tokens =*/ n_tokens,
+ /*tokens =*/ nullptr,
+ /*embd =*/ batch.embd + offset * n_mmproj_embd,
+ /*pos =*/ pos_ptr,
+ /*n_seq_id =*/ batch.n_seq_id + offset,
+ /*seq_id =*/ batch.seq_id + offset,
+ /*logits =*/ batch.logits + offset,
+ };
+ }
+};
+
+// Helper function for decoding an image whose embeddings have already been calculated
+int32_t mtmd_helper_decode_image_chunk(
+ mtmd_context * ctx,
+ struct llama_context * lctx,
+ const mtmd_input_chunk * chunk,
+ float * encoded_embd,
+ llama_pos n_past,
+ llama_seq_id seq_id,
+ int32_t n_batch,
+ llama_pos * new_n_past) {
+ auto chunk_type = mtmd_input_chunk_get_type(chunk);
+ const char * name = chunk_type == MTMD_INPUT_CHUNK_TYPE_IMAGE ? "image" : "audio";
+ if (chunk_type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
+ LOG_ERR("failed to decode chunk: input chunk not of image/audio type\n");
+ return -1;
+ }
+
+ const llama_model * model = llama_get_model(lctx);
+ int n_mmproj_embd = llama_model_n_embd_inp(model);
+ int n_pos_per_embd = mtmd_decode_use_mrope(ctx) ? 4 : 1;
+
+ int32_t n_tokens = mtmd_input_chunk_get_n_tokens(chunk);
+ int32_t i_batch = 0;
+ int32_t n_img_batches = GGML_PAD(n_tokens, n_batch) / n_batch;
+ decode_embd_batch batch_embd(encoded_embd, n_tokens, n_pos_per_embd, n_mmproj_embd);
+
+ if (mtmd_decode_use_mrope(ctx)) {
+ if (chunk_type == MTMD_INPUT_CHUNK_TYPE_IMAGE) {
+ const auto image_tokens = mtmd_input_chunk_get_tokens_image(chunk);
+ if (!image_tokens) {
+ LOG_ERR("failed to decode chunk: image tokens are null\n");
+ return -1;
+ }
+ const int nx = mtmd_image_tokens_get_nx(image_tokens);
+ const int ny = mtmd_image_tokens_get_ny(image_tokens);
+ batch_embd.set_position_mrope_2d(n_past, nx, ny, seq_id);
+ } else if (chunk_type == MTMD_INPUT_CHUNK_TYPE_AUDIO) {
+ batch_embd.set_position_mrope_1d(n_past, seq_id);
+ } else {
+ GGML_ABORT("invalid chunk type for M-RoPE");
+ }
+ } else {
+ batch_embd.set_position_normal(n_past, seq_id);
+ }
+
+ if (mtmd_decode_use_non_causal(ctx)) {
+ llama_set_causal_attn(lctx, false);
+ // TODO @ngxson : need to make sure only one image is processed at a time, and n_ubatch must be enough to hold the image
+ }
+
+ while (i_batch < n_img_batches) { // split into batches
+ int pos_offset = i_batch*n_batch;
+ int n_tokens_batch = std::min(n_batch, n_tokens - pos_offset);
+ llama_batch batch_embd_view = batch_embd.get_view(pos_offset, n_tokens_batch);
+
+ LOG_INF("decoding %s batch %d/%d, n_tokens_batch = %d\n", name, i_batch+1, n_img_batches, n_tokens_batch);
+
+ int64_t t1 = ggml_time_ms();
+ int32_t ret = llama_decode(lctx, batch_embd_view);
+ if (ret != 0) {
+ LOG_ERR("failed to decode %s\n", name);
+ llama_set_causal_attn(lctx, true); // restore causal attn
+ return ret;
+ }
+
+ LOG_INF("%s decoded (batch %d/%d) in %" PRId64 " ms\n", name, i_batch+1, n_img_batches, ggml_time_ms() - t1);
+
+ i_batch++;
+ }
+
+ n_past += mtmd_input_chunk_get_n_pos(chunk);
+ *new_n_past = n_past;
+
+ if (mtmd_decode_use_non_causal(ctx)) {
+ llama_set_causal_attn(lctx, true);
+ }
+ return 0;
+}
+
+int32_t mtmd_helper_eval_chunk_single(mtmd_context * ctx,
+ struct llama_context * lctx,
+ const mtmd_input_chunk * chunk,
+ llama_pos n_past,
+ llama_seq_id seq_id,
+ int32_t n_batch,
+ bool logits_last,
+ llama_pos * new_n_past) {
+ int32_t ret;
+ llama_batch text_batch = llama_batch_init(n_batch, 0, 1);
+ auto chunk_type = mtmd_input_chunk_get_type(chunk);
+
+ if (chunk_type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
+ size_t n_tokens;
+ const auto tokens = mtmd_input_chunk_get_tokens_text(chunk, &n_tokens);
+ // LOG_INF("decoding text chunk, n_tokens = %zu\n", n_tokens);
+ size_t i = 0;
+ while (i < n_tokens) { // split into batches
+ text_batch.n_tokens = 0; // clear the batch
+ for (; i < n_tokens && text_batch.n_tokens < n_batch; i++) {
+ int32_t j = text_batch.n_tokens;
+ text_batch.token [j] = tokens[i];
+ text_batch.pos [j] = n_past++;
+ text_batch.n_seq_id[j] = 1;
+ text_batch.seq_id [j][0] = seq_id;
+ text_batch.logits [j] = false;
+
+ text_batch.n_tokens++;
+ }
+ bool is_last_token = (i == n_tokens);
+ if (logits_last && is_last_token) {
+ text_batch.logits[text_batch.n_tokens - 1] = true;
+ }
+ ret = llama_decode(lctx, text_batch);
+ if (ret != 0) {
+ LOG_ERR("failed to decode text\n");
+ llama_batch_free(text_batch);
+ return ret;
+ }
+ *new_n_past += text_batch.n_tokens;
+ }
+
+ } else if (chunk_type == MTMD_INPUT_CHUNK_TYPE_IMAGE || chunk_type == MTMD_INPUT_CHUNK_TYPE_AUDIO) {
+ const char * name = chunk_type == MTMD_INPUT_CHUNK_TYPE_IMAGE ? "image" : "audio";
+ int64_t t0 = ggml_time_ms();
+
+ LOG_INF("encoding %s slice...\n", name);
+
+ ret = mtmd_encode_chunk(ctx, chunk);
+ if (ret != 0) {
+ LOG_ERR("failed to encode %s slice\n", name);
+ llama_batch_free(text_batch);
+ return ret;
+ }
+
+ LOG_INF("%s slice encoded in %" PRId64 " ms\n", name, ggml_time_ms() - t0);
+
+ float * embd = mtmd_get_output_embd(ctx);
+ ret = mtmd_helper_decode_image_chunk(ctx, lctx, chunk, embd, n_past, seq_id, n_batch, new_n_past);
+ if (ret != 0) {
+ LOG_ERR("failed to decode %s\n", name);
+ llama_batch_free(text_batch);
+ return ret;
+ }
+ } else {
+ GGML_ABORT("chunk type not supported");
+ }
+
+ llama_batch_free(text_batch);
+ return 0;
+}
+
+int32_t mtmd_helper_eval_chunks(mtmd_context * ctx,
+ struct llama_context * lctx,
+ const mtmd_input_chunks * chunks,
+ llama_pos n_past,
+ llama_seq_id seq_id,
+ int32_t n_batch,
+ bool logits_last,
+ llama_pos * new_n_past) {
+ size_t n_chunks = mtmd_input_chunks_size(chunks);
+ if (n_chunks == 0) {
+ LOG_WRN("no chunks to eval\n");
+ return 0;
+ }
+
+ for (size_t i = 0; i < n_chunks; i++) {
+ bool chunk_logits_last = (i == n_chunks - 1) && logits_last;
+ auto chunk = mtmd_input_chunks_get(chunks, i);
+
+ int32_t res = mtmd_helper_eval_chunk_single(ctx, lctx, chunk, n_past, seq_id, n_batch, chunk_logits_last, &n_past);
+ if (res != 0) {
+ LOG_ERR("failed to eval chunk %zu\n", i);
+ return res;
+ }
+ *new_n_past = n_past;
+ }
+
+ return 0;
+}
+
+namespace audio_helpers {
+
+static bool is_audio_file(const char * buf, size_t len) {
+ if (len < 12) {
+ return false;
+ }
+
+ // RIFF ref: https://en.wikipedia.org/wiki/Resource_Interchange_File_Format
+ // WAV ref: https://www.mmsp.ece.mcgill.ca/Documents/AudioFormats/WAVE/WAVE.html
+ bool is_wav = memcmp(buf, "RIFF", 4) == 0 && memcmp(buf + 8, "WAVE", 4) == 0;
+ bool is_mp3 = len >= 3 && (
+ memcmp(buf, "ID3", 3) == 0 ||
+ // Check for MPEG sync word (simplified check)
+ ((unsigned char)buf[0] == 0xFF && ((unsigned char)buf[1] & 0xE0) == 0xE0)
+ );
+ bool is_flac = memcmp(buf, "fLaC", 4) == 0;
+
+ return is_wav || is_mp3 || is_flac;
+}
+
+// returns true if the buffer is a valid audio file
+static bool decode_audio_from_buf(const unsigned char * buf_in, size_t len, int target_sampler_rate, std::vector<float> & pcmf32_mono) {
+ ma_result result;
+ const int channels = 1;
+ ma_decoder_config decoder_config = ma_decoder_config_init(ma_format_f32, channels, target_sampler_rate);
+ ma_decoder decoder;
+
+ result = ma_decoder_init_memory(buf_in, len, &decoder_config, &decoder);
+ if (result != MA_SUCCESS) {
+ return false;
+ }
+
+ ma_uint64 frame_count;
+ ma_uint64 frames_read;
+ result = ma_decoder_get_length_in_pcm_frames(&decoder, &frame_count);
+ if (result != MA_SUCCESS) {
+ ma_decoder_uninit(&decoder);
+ return false;
+ }
+
+ pcmf32_mono.resize(frame_count);
+ result = ma_decoder_read_pcm_frames(&decoder, pcmf32_mono.data(), frame_count, &frames_read);
+ if (result != MA_SUCCESS) {
+ ma_decoder_uninit(&decoder);
+ return false;
+ }
+
+#ifdef MTMD_AUDIO_DEBUG
+ // save audio to wav file
+ ma_encoder_config config = ma_encoder_config_init(ma_encoding_format_wav, ma_format_f32, 1, target_sampler_rate);
+ ma_encoder encoder;
+ ma_encoder_init_file("output.wav", &config, &encoder);
+ ma_encoder_write_pcm_frames(&encoder, pcmf32_mono.data(), pcmf32_mono.size(), &frames_read);
+ ma_encoder_uninit(&encoder);
+#endif
+
+ ma_decoder_uninit(&decoder);
+ return true;
+}
+
+} // namespace audio_helpers
+
+mtmd_bitmap * mtmd_helper_bitmap_init_from_buf(mtmd_context * ctx, const unsigned char * buf, size_t len) {
+ if (audio_helpers::is_audio_file((const char *)buf, len)) {
+ std::vector<float> pcmf32;
+ int bitrate = mtmd_get_audio_bitrate(ctx);
+ if (bitrate < 0) {
+ LOG_ERR("This model does not support audio input\n");
+ return nullptr;
+ }
+ if (!audio_helpers::decode_audio_from_buf(buf, len, bitrate, pcmf32)) {
+ LOG_ERR("Unable to read WAV audio file from buffer\n");
+ return nullptr;
+ }
+ return mtmd_bitmap_init_from_audio(pcmf32.size(), pcmf32.data());
+ }
+
+ // otherwise, we assume it's an image
+ mtmd_bitmap * result = nullptr;
+ {
+ int nx, ny, nc;
+ auto * data = stbi_load_from_memory(buf, len, &nx, &ny, &nc, 3);
+ if (!data) {
+ LOG_ERR("%s: failed to decode image bytes\n", __func__);
+ return nullptr;
+ }
+ result = mtmd_bitmap_init(nx, ny, data);
+ stbi_image_free(data);
+ }
+ return result;
+}
+
+mtmd_bitmap * mtmd_helper_bitmap_init_from_file(mtmd_context * ctx, const char * fname) {
+ std::vector<unsigned char> buf;
+ FILE * f = fopen(fname, "rb");
+ if (!f) {
+ LOG_ERR("Unable to open file %s: %s\n", fname, strerror(errno));
+ return nullptr;
+ }
+
+ fseek(f, 0, SEEK_END);
+ long file_size = ftell(f);
+ fseek(f, 0, SEEK_SET);
+ buf.resize(file_size);
+
+ size_t n_read = fread(buf.data(), 1, file_size, f);
+ fclose(f);
+ if (n_read != (size_t)file_size) {
+ LOG_ERR("Failed to read entire file %s", fname);
+ return nullptr;
+ }
+
+ return mtmd_helper_bitmap_init_from_buf(ctx, buf.data(), buf.size());
+}
diff --git a/llama.cpp/tools/mtmd/mtmd-helper.h b/llama.cpp/tools/mtmd/mtmd-helper.h
new file mode 100644
index 0000000..5036b92
--- /dev/null
+++ b/llama.cpp/tools/mtmd/mtmd-helper.h
@@ -0,0 +1,96 @@
+#ifndef MTMD_HELPER_H
+#define MTMD_HELPER_H
+
+#include "ggml.h"
+#include "llama.h"
+#include "mtmd.h"
+
+#include <stddef.h>
+#include <stdint.h>
+#include <stdbool.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+//
+// libmtmd helper functions
+//
+// Please note that these helpers are not guaranteed to be stable.
+// BREAKING CHANGES are expected.
+//
+
+// Set callback for all future logging events.
+// If this is not called, or NULL is supplied, everything is output on stderr.
+// Note: this also call mtmd_log_set() internally
+MTMD_API void mtmd_helper_log_set(ggml_log_callback log_callback, void * user_data);
+
+// helper function to construct a mtmd_bitmap from a file
+// it calls mtmd_helper_bitmap_init_from_buf() internally
+// returns nullptr on failure
+// this function is thread-safe
+MTMD_API mtmd_bitmap * mtmd_helper_bitmap_init_from_file(mtmd_context * ctx, const char * fname);
+
+// helper function to construct a mtmd_bitmap from a buffer containing a file
+// supported formats:
+//     image: formats supported by stb_image: jpg, png, bmp, gif, etc.
+//     audio: formats supported by miniaudio: wav, mp3, flac
+// note: audio files will be auto-detected based on magic bytes
+// returns nullptr on failure
+// this function is thread-safe
+MTMD_API mtmd_bitmap * mtmd_helper_bitmap_init_from_buf(mtmd_context * ctx, const unsigned char * buf, size_t len);
+
+// helper to count the total number of tokens from a list of chunks, useful to keep track of KV cache
+MTMD_API size_t mtmd_helper_get_n_tokens(const mtmd_input_chunks * chunks);
+
+// helper to count the total position of tokens from a list of chunks, useful to keep track of n_past
+// normally, n_pos is equal to n_tokens, but for M-RoPE it is different
+MTMD_API llama_pos mtmd_helper_get_n_pos(const mtmd_input_chunks * chunks);
+
+// helper function that automatically:
+// 1. run llama_decode() on text chunks
+// 2. run mtmd_encode() on image chunks, then mtmd_get_output_embd() and then llama_decode()
+// if any of the mtmd_encode() or llama_decode() calls return non-zero, stop and forward the error
+// otherwise, returns 0 on success
+// this function is NOT thread-safe
+MTMD_API int32_t mtmd_helper_eval_chunks(mtmd_context * ctx,
+                                         struct llama_context * lctx,
+                                         const mtmd_input_chunks * chunks,
+                                         llama_pos n_past,
+                                         llama_seq_id seq_id,
+                                         int32_t n_batch,
+                                         bool logits_last,
+                                         llama_pos * new_n_past);
+
+// works like mtmd_helper_eval_chunks(), but only for a single chunk
+// this function is NOT thread-safe
+MTMD_API int32_t mtmd_helper_eval_chunk_single(mtmd_context * ctx,
+                                               struct llama_context * lctx,
+                                               const mtmd_input_chunk * chunk,
+                                               llama_pos n_past,
+                                               llama_seq_id seq_id,
+                                               int32_t n_batch,
+                                               bool logits_last,
+                                               llama_pos * new_n_past);
+
+// helper function to decode an image whose embeddings have already been calculated
+// this helper will handle batching and pre/post decoding setup (for ex. gemma 3 requires non-causal attention)
+// ret 0 on success, -1 on chunk not being a valid image chunk, 1 on decode failure
+MTMD_API int32_t mtmd_helper_decode_image_chunk(mtmd_context * ctx,
+                                                struct llama_context * lctx,
+                                                const mtmd_input_chunk * chunk,
+                                                float * encoded_embd,
+                                                llama_pos n_past,
+                                                llama_seq_id seq_id,
+                                                int32_t n_batch,
+                                                llama_pos * new_n_past);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+//
+// C++ wrappers
+//
+// (none defined in this header yet)
+
+#endif
diff --git a/llama.cpp/tools/mtmd/mtmd.cpp b/llama.cpp/tools/mtmd/mtmd.cpp
new file mode 100644
index 0000000..b763627
--- /dev/null
+++ b/llama.cpp/tools/mtmd/mtmd.cpp
@@ -0,0 +1,1151 @@
+#include "clip.h"
+#include "clip-impl.h"
+#include "mtmd.h"
+#include "mtmd-audio.h"
+
+#include "llama.h"
+
+// fix problem with std::min and std::max
+#if defined(_WIN32)
+#define WIN32_LEAN_AND_MEAN
+#ifndef NOMINMAX
+# define NOMINMAX
+#endif
+#include <windows.h>
+#endif
+
+#include <algorithm>
+#include <cerrno>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <vector>
+
+// represents raw image data, layout is RGBRGBRGB...
+// length of data must be nx * ny * 3
+// when is_audio is true, data instead holds raw float PCM samples
+// (see mtmd_bitmap_init_from_audio: nx = n_samples, ny = 1)
+struct mtmd_bitmap {
+    uint32_t nx;
+    uint32_t ny;
+    std::vector<unsigned char> data;
+    std::string id; // optional user-defined id, for ex: can be set to image hash, useful for KV cache tracking
+    bool is_audio = false; // true if the bitmap is audio
+};
+
+// tokenized representation of one image (or one slice of an image)
+struct mtmd_image_tokens {
+    uint32_t nx; // number of tokens in x direction
+    uint32_t ny; // number of tokens in y direction
+    bool use_mrope_pos = false; // use M-RoPE position counting (the whole image is 1 temporal position)
+    uint32_t n_tokens() const { return nx * ny; }
+    clip_image_f32_batch batch_f32; // preprocessed image patches
+    std::string id; // optional user-defined ID, useful for KV cache tracking
+
+    mtmd_image_tokens clone() {
+        // NOTE: positional aggregate init — argument order must match the
+        // member declaration order above
+        return mtmd_image_tokens{
+            nx,
+            ny,
+            use_mrope_pos,
+            batch_f32.clone(),
+            id
+        };
+    }
+};
+using mtmd_image_tokens_ptr = std::unique_ptr<mtmd_image_tokens>;
+
+// tokenized representation of one audio chunk (mel spectrogram)
+struct mtmd_audio_tokens {
+    uint32_t n_tokens; // number of tokens
+    clip_image_f32_batch batch_f32; // preprocessed image patches
+    std::string id; // optional user-defined ID, useful for KV cache tracking
+
+    mtmd_audio_tokens clone() {
+        // NOTE: positional aggregate init — argument order must match the
+        // member declaration order above
+        return mtmd_audio_tokens{
+            n_tokens,
+            batch_f32.clone(),
+            id
+        };
+    }
+};
+using mtmd_audio_tokens_ptr = std::unique_ptr<mtmd_audio_tokens>;
+
+// a chunk holds either text tokens or exactly one media item; which of the
+// tokens_* members is populated is determined by `type` (the others are null/empty)
+struct mtmd_input_chunk {
+    mtmd_input_chunk_type type;
+    std::vector<llama_token> tokens_text;
+    mtmd_image_tokens_ptr tokens_image;
+    mtmd_audio_tokens_ptr tokens_audio;
+};
+
+// ordered list of chunks produced by mtmd_tokenize()
+struct mtmd_input_chunks {
+    std::vector<mtmd_input_chunk> entries;
+};
+
+// slice template, used by some llava-uhd models to correctly place the special tokens around image embeddings
+// models not having it (llava-1.6) will process embeddings without any special tokens in-between
+enum mtmd_slice_tmpl {
+    MTMD_SLICE_TMPL_NONE, // no special tokens between slices
+    MTMD_SLICE_TMPL_MINICPMV_2_5,
+    MTMD_SLICE_TMPL_MINICPMV_2_6,
+    MTMD_SLICE_TMPL_LLAMA4,
+    MTMD_SLICE_TMPL_IDEFICS3,
+    MTMD_SLICE_TMPL_LFM2,
+};
+
+// default placeholder string inserted into the prompt where media embeddings go
+const char * mtmd_default_marker() {
+    return "<__media__>";
+}
+
+// translate the llama flash-attention setting into the equivalent clip setting
+static clip_flash_attn_type mtmd_get_clip_flash_attn_type(enum llama_flash_attn_type flash_attn_type) {
+    if (flash_attn_type == LLAMA_FLASH_ATTN_TYPE_DISABLED) {
+        return CLIP_FLASH_ATTN_TYPE_DISABLED;
+    }
+    if (flash_attn_type == LLAMA_FLASH_ATTN_TYPE_ENABLED) {
+        return CLIP_FLASH_ATTN_TYPE_ENABLED;
+    }
+    // LLAMA_FLASH_ATTN_TYPE_AUTO (and any unrecognized value) maps to AUTO
+    return CLIP_FLASH_ATTN_TYPE_AUTO;
+}
+
+// default parameters for mtmd_init_from_file()
+// NOTE: positional aggregate init — the /* name */ comments must stay in sync
+// with the field order of mtmd_context_params in mtmd.h
+mtmd_context_params mtmd_context_params_default() {
+    mtmd_context_params params {
+        /* use_gpu */ true,
+        /* print_timings */ true,
+        /* n_threads */ 4,
+        /* image_marker */ MTMD_DEFAULT_IMAGE_MARKER,
+        /* media_marker */ mtmd_default_marker(),
+        /* flash_attn_type */ LLAMA_FLASH_ATTN_TYPE_AUTO,
+        /* warmup */ true,
+        /* image_min_tokens */ -1,
+        /* image_max_tokens */ -1,
+        /* cb_eval */ nullptr,
+        /* cb_eval_user_data */ nullptr,
+    };
+    return params;
+}
+
+// central multimodal context: owns the vision/audio CLIP contexts and the
+// per-projector special-token configuration; throws std::runtime_error on
+// any initialization failure (caught by mtmd_init_from_file)
+struct mtmd_context {
+    struct clip_ctx * ctx_v; // vision
+    struct clip_ctx * ctx_a; // audio
+    const struct llama_model * text_model;
+    std::vector<float> image_embd_v; // image embedding vector
+
+    bool print_timings;
+    int n_threads;
+    std::string media_marker;
+    const int n_embd_text;
+
+    // these are not token, but strings used to mark the beginning and end of image/audio embeddings
+    std::string img_beg;
+    std::string img_end;
+    std::string aud_beg;
+    std::string aud_end;
+
+    // for llava-uhd style models, we need special tokens in-between slices
+    // minicpmv calls them "slices", llama 4 calls them "tiles"
+    mtmd_slice_tmpl slice_tmpl    = MTMD_SLICE_TMPL_NONE;
+    std::vector<llama_token> tok_ov_img_start;  // overview image
+    std::vector<llama_token> tok_ov_img_end;    // overview image
+    std::vector<llama_token> tok_slices_start;  // start of all slices
+    std::vector<llama_token> tok_slices_end;    // end of all slices
+    std::vector<llama_token> tok_sli_img_start; // single slice start
+    std::vector<llama_token> tok_sli_img_end;   // single slice end
+    std::vector<llama_token> tok_sli_img_mid;   // between 2 slices
+    std::vector<llama_token> tok_row_end;       // end of row
+    bool tok_row_end_trail = false;
+    bool ov_img_first      = false;
+
+    // string template for slice image delimiters with row/col (idefics3)
+    std::string sli_img_start_tmpl;
+
+    std::unique_ptr<mtmd_audio_preprocessor> audio_preproc;
+
+    // TODO @ngxson : add timings
+
+    mtmd_context(const char * mmproj_fname,
+                   const llama_model * text_model,
+                   const mtmd_context_params & ctx_params) :
+        text_model   (text_model),
+        print_timings(ctx_params.print_timings),
+        n_threads    (ctx_params.n_threads),
+        media_marker (ctx_params.media_marker),
+        n_embd_text  (llama_model_n_embd_inp(text_model))
+    {
+        if (std::string(ctx_params.image_marker) != MTMD_DEFAULT_IMAGE_MARKER) {
+            throw std::runtime_error("custom image_marker is not supported anymore, use media_marker instead");
+        }
+
+        if (media_marker.empty()) {
+            throw std::runtime_error("media_marker must not be empty");
+        }
+
+        clip_context_params ctx_clip_params {
+            /* use_gpu */ ctx_params.use_gpu,
+            // BUGFIX: previously hardcoded to CLIP_FLASH_ATTN_TYPE_AUTO, which
+            // silently ignored the caller's flash_attn_type setting and left
+            // mtmd_get_clip_flash_attn_type() unused
+            /* flash_attn_type */ mtmd_get_clip_flash_attn_type(ctx_params.flash_attn_type),
+            /* image_min_tokens */ ctx_params.image_min_tokens,
+            /* image_max_tokens */ ctx_params.image_max_tokens,
+            /* warmup */ ctx_params.warmup,
+            /* cb_eval */ ctx_params.cb_eval,
+            /* cb_eval_user_data */ ctx_params.cb_eval_user_data,
+        };
+
+        auto res = clip_init(mmproj_fname, ctx_clip_params);
+        ctx_v = res.ctx_v;
+        ctx_a = res.ctx_a;
+        if (!ctx_v && !ctx_a) {
+            throw std::runtime_error(string_format("Failed to load CLIP model from %s\n", mmproj_fname));
+        }
+
+        // if both vision and audio mmproj are present, we need to validate their n_embd
+        if (ctx_v && ctx_a) {
+            int n_embd_v = clip_n_mmproj_embd(ctx_v);
+            int n_embd_a = clip_n_mmproj_embd(ctx_a);
+            if (n_embd_v != n_embd_a) {
+                throw std::runtime_error(string_format(
+                    "mismatch between vision and audio mmproj (n_embd_v = %d, n_embd_a = %d)\n",
+                    n_embd_v, n_embd_a));
+            }
+        }
+
+        // since we already validate n_embd of vision and audio mmproj,
+        // we can safely assume that they are the same
+        int n_embd_clip = clip_n_mmproj_embd(ctx_v ? ctx_v : ctx_a);
+        if (n_embd_text != n_embd_clip) {
+            throw std::runtime_error(string_format(
+                "mismatch between text model (n_embd = %d) and mmproj (n_embd = %d)\n"
+                "hint: you may be using wrong mmproj\n",
+                n_embd_text, n_embd_clip));
+        }
+        if (ctx_v) {
+            init_vision();
+        }
+        if (ctx_a) {
+            init_audio();
+        }
+    }
+
+    // configure per-projector slice templates and image begin/end marker strings
+    void init_vision() {
+        GGML_ASSERT(ctx_v != nullptr);
+
+        projector_type proj = clip_get_projector_type(ctx_v);
+        int minicpmv_version = clip_is_minicpmv(ctx_v);
+        if (minicpmv_version == 2) {
+            // minicpmv 2.5 format:
+            // <image> (overview) </image><slice><image> (slice) </image><image> (slice) </image>\n ... </slice>
+            slice_tmpl        = MTMD_SLICE_TMPL_MINICPMV_2_5;
+            tok_ov_img_start  = {lookup_token("<image>")};
+            tok_ov_img_end    = {lookup_token("</image>")};
+            tok_slices_start  = {lookup_token("<slice>")};
+            tok_slices_end    = {lookup_token("</slice>")};
+            tok_sli_img_start = tok_ov_img_start;
+            tok_sli_img_end   = tok_ov_img_end;
+            tok_row_end       = {lookup_token("\n")};
+            tok_row_end_trail = false; // no trailing end-of-row token
+            ov_img_first      = true;
+
+        } else if (minicpmv_version == 3 || minicpmv_version == 4 || minicpmv_version == 5 || minicpmv_version == 6 || minicpmv_version == 100045) {
+            // minicpmv 2.6 format:
+            // <image> (overview) </image><slice> (slice) </slice><slice> (slice) </slice>\n ...
+            slice_tmpl        = MTMD_SLICE_TMPL_MINICPMV_2_6;
+            tok_ov_img_start  = {lookup_token("<image>")};
+            tok_ov_img_end    = {lookup_token("</image>")};
+            tok_sli_img_start = {lookup_token("<slice>")};
+            tok_sli_img_end   = {lookup_token("</slice>")};
+            tok_row_end       = {lookup_token("\n")};
+            tok_row_end_trail = false; // no trailing end-of-row token
+            ov_img_first      = true;
+
+        } else if (minicpmv_version != 0) {
+            GGML_ASSERT(false && "unsupported minicpmv version");
+        } else if (proj == PROJECTOR_TYPE_LLAMA4) {
+            // llama 4 format:
+            // <|image_start|>
+            //     (slice) <|tile_x_separator|> (slice) <|tile_x_separator|> ... <|tile_y_separator|>
+            //     (slice) <|tile_x_separator|> (slice) <|tile_x_separator|> ... <|tile_y_separator|>
+            //     ... <|tile_y_separator|>   <-- trailing end-of-row token
+            //     <|image|> (overview)       <-- overview image is last
+            // <|image_end|>
+            slice_tmpl        = MTMD_SLICE_TMPL_LLAMA4;
+            tok_ov_img_start  = {lookup_token("<|image|>")};
+            tok_sli_img_mid   = {lookup_token("<|tile_x_separator|>")};
+            tok_row_end       = {lookup_token("<|tile_y_separator|>")};
+            tok_row_end_trail = true; // add trailing end-of-row token
+            ov_img_first      = false; // overview image is last
+        }
+
+        // set boi/eoi
+        if (proj == PROJECTOR_TYPE_GEMMA3 || proj == PROJECTOR_TYPE_GEMMA3NV) {
+            // <start_of_image> ... (image embeddings) ... <end_of_image>
+            img_beg = "<start_of_image>";
+            img_end = "<end_of_image>";
+
+        } else if (proj == PROJECTOR_TYPE_IDEFICS3) {
+            // https://github.com/huggingface/transformers/blob/a42ba80fa520c784c8f11a973ca9034e5f859b79/src/transformers/models/idefics3/processing_idefics3.py#L192-L215
+            slice_tmpl = MTMD_SLICE_TMPL_IDEFICS3;
+            tok_ov_img_start = {lookup_token("\n\n"), lookup_token("<fake_token_around_image>"), lookup_token("<global-img>")};
+            tok_ov_img_end = {lookup_token("<fake_token_around_image>")};
+            tok_row_end = {lookup_token("\n")};
+            sli_img_start_tmpl = "<fake_token_around_image><row_%d_col_%d>";
+
+        } else if (proj == PROJECTOR_TYPE_PIXTRAL) {
+            // https://github.com/huggingface/transformers/blob/1cd110c6cb6a6237614130c470e9a902dbc1a4bd/docs/source/en/model_doc/pixtral.md
+            img_end = "[IMG_END]";
+
+        } else if (proj == PROJECTOR_TYPE_QWEN2VL || proj == PROJECTOR_TYPE_QWEN25VL || proj == PROJECTOR_TYPE_QWEN3VL || proj == PROJECTOR_TYPE_YOUTUVL) {
+            // <|vision_start|> ... (image embeddings) ... <|vision_end|>
+            img_beg = "<|vision_start|>";
+            img_end = "<|vision_end|>";
+
+        } else if (proj == PROJECTOR_TYPE_LLAMA4) {
+            // (more details in mtmd_context constructor)
+            img_beg = "<|image_start|>";
+            img_end = "<|image_end|>";
+            LOG_WRN("%s: llama 4 vision is known to have degraded quality:\n"
+                    "    https://github.com/ggml-org/llama.cpp/pull/13282\n", __func__);
+
+        } else if (proj == PROJECTOR_TYPE_INTERNVL) {
+            // <img> ... (image embeddings) ... </img>
+            img_beg = "<img>";
+            img_end = "</img>";
+
+        } else if (proj == PROJECTOR_TYPE_LIGHTONOCR) {
+            // <|im_start|> ... (image embeddings) ... <|im_end|>
+            img_beg = "<|im_start|>";
+            img_end = "<|im_end|>";
+
+        } else if (proj == PROJECTOR_TYPE_LFM2) {
+            // multi-tile:
+            //   <|image_start|>
+            //   <|img_row_1_col_1|> (tile) <|img_row_1_col_2|> (tile) ...
+            //   <|img_thumbnail|> (thumbnail)
+            //   <|image_end|>
+            // single-tile:
+            //   <|image_start|> (image) <|image_end|>
+            img_beg = "<|image_start|>";
+            img_end = "<|image_end|>";
+            slice_tmpl = MTMD_SLICE_TMPL_LFM2;
+            sli_img_start_tmpl = "<|img_row_%d_col_%d|>";
+            tok_ov_img_start = {lookup_token("<|img_thumbnail|>")};
+            ov_img_first = false;
+        } else if (proj == PROJECTOR_TYPE_GLM4V) {
+            img_beg = "<|begin_of_image|>";
+            img_end = "<|end_of_image|>";
+
+        }
+    }
+
+    // select the audio preprocessor and audio begin/end marker strings
+    void init_audio() {
+        GGML_ASSERT(ctx_a != nullptr);
+        projector_type proj = clip_get_projector_type(ctx_a);
+
+        LOG_WRN("%s: audio input is in experimental stage and may have reduced quality:\n"
+                "    https://github.com/ggml-org/llama.cpp/discussions/13759\n", __func__);
+
+        // set preprocessor
+        switch (proj) {
+            case PROJECTOR_TYPE_QWEN2A:
+            case PROJECTOR_TYPE_QWEN25O:
+            case PROJECTOR_TYPE_ULTRAVOX:
+            case PROJECTOR_TYPE_VOXTRAL:
+            case PROJECTOR_TYPE_GLMA:
+            case PROJECTOR_TYPE_MUSIC_FLAMINGO:
+                audio_preproc = std::make_unique<mtmd_audio_preprocessor_whisper>(ctx_a);
+                break;
+            case PROJECTOR_TYPE_LFM2A:
+                audio_preproc = std::make_unique<mtmd_audio_preprocessor_conformer>(ctx_a);
+                break;
+            default:
+                GGML_ABORT("unsupported audio projector type");
+        }
+
+        // initialize audio preprocessor
+        audio_preproc->initialize();
+
+        // set special tokens
+        if (proj == PROJECTOR_TYPE_QWEN2A) {
+            // <|audio_bos|> ... (embeddings) ... <|audio_eos|>
+            aud_beg = "<|audio_bos|>";
+            aud_end = "<|audio_eos|>";
+
+        } else if (proj == PROJECTOR_TYPE_ULTRAVOX) {
+            // [BEGIN_AUDIO] ... (embeddings) ...
+            aud_beg = "[BEGIN_AUDIO]";
+
+        } else if (proj == PROJECTOR_TYPE_MUSIC_FLAMINGO) {
+            // <sound> ... (embeddings) ...
+            aud_beg = "<sound>";
+        }
+    }
+
+    // get clip ctx based on chunk type
+    clip_ctx * get_clip_ctx(const mtmd_input_chunk * chunk) const {
+        if (chunk->type == MTMD_INPUT_CHUNK_TYPE_IMAGE) {
+            return ctx_v;
+        } else if (chunk->type == MTMD_INPUT_CHUNK_TYPE_AUDIO) {
+            return ctx_a;
+        }
+        GGML_ABORT("unknown chunk type");
+    }
+
+    projector_type proj_type_v() const {
+        return ctx_v ? clip_get_projector_type(ctx_v) : PROJECTOR_TYPE_UNKNOWN;
+    }
+
+    projector_type proj_type_a() const {
+        return ctx_a ? clip_get_projector_type(ctx_a) : PROJECTOR_TYPE_UNKNOWN;
+    }
+
+    ~mtmd_context() {
+        clip_free(ctx_a);
+        clip_free(ctx_v);
+    }
+
+private:
+    // linear scan over the vocab for the token whose piece equals token_text;
+    // returns LLAMA_TOKEN_NULL when not found (callers treat that as "no such token")
+    llama_token lookup_token(const std::string & token_text) {
+        const llama_vocab * vocab = llama_model_get_vocab(text_model);
+        const int n_vocab = llama_vocab_n_tokens(vocab);
+        for (int i = 0; i < n_vocab; i++) {
+            if (token_to_piece(vocab, i, true) == token_text) {
+                return i;
+            }
+        }
+        return LLAMA_TOKEN_NULL;
+    }
+
+    // render a token id to its text piece; retries with a larger buffer when
+    // llama_token_to_piece reports the required size as a negative count
+    std::string token_to_piece(const llama_vocab * vocab, llama_token token, bool special) {
+        std::string piece;
+        piece.resize(piece.capacity()); // using string internal cache, 15 bytes + '\n'
+        const int n_chars = llama_token_to_piece(vocab, token, &piece[0], piece.size(), 0, special);
+        if (n_chars < 0) {
+            piece.resize(-n_chars);
+            int check = llama_token_to_piece(vocab, token, &piece[0], piece.size(), 0, special);
+            GGML_ASSERT(check == -n_chars);
+        } else {
+            piece.resize(n_chars);
+        }
+        return piece;
+    }
+};
+
+// create a multimodal context from a mmproj GGUF file; the constructor throws
+// on any failure, which is converted here into a logged error + nullptr return
+mtmd_context * mtmd_init_from_file(const char * mmproj_fname,
+        const struct llama_model * text_model,
+        const struct mtmd_context_params ctx_params) {
+    try {
+        return new mtmd_context(mmproj_fname, text_model, ctx_params);
+    } catch (const std::exception & e) {
+        LOG_ERR("%s: error: %s\n", __func__, e.what());
+        return nullptr;
+    }
+}
+
+// destroy a context created by mtmd_init_from_file(); safe to call with
+// nullptr (delete on a null pointer is a no-op)
+void mtmd_free(mtmd_context * ctx) {
+    delete ctx;
+}
+
+// one-shot tokenizer: splits the input text on the media marker, tokenizes the
+// text parts, and preprocesses each bitmap into image/audio chunks, emitting
+// any projector-specific special tokens around and between slices
+struct mtmd_tokenizer {
+    mtmd_context * ctx;
+    std::vector<const mtmd_bitmap *> bitmaps;
+
+    std::string input_text;
+    bool add_special;
+    bool parse_special;
+    const llama_vocab * vocab;
+
+    mtmd_input_chunks cur;
+
+    mtmd_tokenizer(mtmd_context * ctx,
+            const mtmd_input_text * text,
+            const mtmd_bitmap ** bitmaps,
+            size_t n_bitmaps) : ctx(ctx), bitmaps(bitmaps, bitmaps + n_bitmaps) {
+        add_special   = text->add_special;
+        parse_special = text->parse_special;
+        input_text    = text->text;
+        vocab         = llama_model_get_vocab(ctx->text_model);
+
+        // for compatibility, we convert image marker to media marker
+        string_replace_all(input_text, MTMD_DEFAULT_IMAGE_MARKER, ctx->media_marker);
+    }
+
+    // returns 0 on success, 1 on marker/bitmap count mismatch,
+    // or the non-zero error forwarded from add_media()
+    int32_t tokenize(mtmd_input_chunks * output) {
+        cur.entries.clear();
+        std::vector<std::string> parts = split_text(input_text, ctx->media_marker);
+        size_t i_bm = 0; // index of the current bitmap
+        for (auto & part : parts) {
+            if (part == ctx->media_marker) {
+                // this is a marker, we should add the next bitmap
+                if (i_bm >= bitmaps.size()) {
+                    LOG_ERR("%s: error: number of bitmaps (%zu) does not match number of markers (%zu)\n",
+                            __func__, bitmaps.size(), parts.size() - 1);
+                    return 1;
+                }
+                const mtmd_bitmap * bitmap = bitmaps[i_bm++];
+                int32_t res = add_media(bitmap);
+                if (res != 0) {
+                    return res;
+                }
+            } else {
+                // this is a text part, we should add it as text
+                add_text(part, parse_special);
+            }
+        }
+
+        if (add_special && llama_vocab_get_add_bos(vocab)) {
+            // if first chunk is text, we add BOS token to first text chunk
+            // otherwise, create a new text chunk with BOS token
+            if (!cur.entries.empty() && cur.entries[0].type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
+                // add BOS token to the beginning of first text chunk
+                cur.entries[0].tokens_text.insert(cur.entries[0].tokens_text.begin(), llama_vocab_bos(vocab));
+            } else {
+                // create a new text chunk with BOS token at the beginning
+                mtmd_input_chunk bos_chunk{
+                    MTMD_INPUT_CHUNK_TYPE_TEXT,
+                    {llama_vocab_bos(vocab)},
+                    nullptr, // image tokens
+                    nullptr, // audio tokens
+                };
+                cur.entries.insert(cur.entries.begin(), std::move(bos_chunk));
+            }
+        }
+
+        if (add_special && llama_vocab_get_add_eos(vocab)) {
+            // if last chunk is text, we add EOS token to it
+            add_text({llama_vocab_eos(vocab)});
+        }
+
+        if (i_bm != bitmaps.size()) {
+            LOG_ERR("%s: error: number of bitmaps (%zu) does not match number of markers (%zu)\n",
+                    __func__, bitmaps.size(), parts.size() - 1);
+            return 1;
+        }
+
+        *output = std::move(cur);
+
+        return 0;
+    }
+
+    // tokenize a text fragment and append it to the chunk list
+    void add_text(const std::string & txt, bool parse_special) {
+        LOG_DBG("%s: %s\n", __func__, txt.c_str());
+        auto tokens = mtmd_tokenize_text_internal(vocab, txt, /* add_special */ false, parse_special);
+        add_text(tokens);
+    }
+
+    // append tokens, merging into the previous chunk when it is also text
+    void add_text(const std::vector<llama_token> & tokens) {
+        if (tokens.empty()) {
+            return;
+        }
+        // if last entry is also a text chunk, add tokens to it instead of creating new chunk
+        if (!cur.entries.empty() && cur.entries.back().type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
+            cur.entries.back().tokens_text.insert(
+                cur.entries.back().tokens_text.end(),
+                tokens.begin(),
+                tokens.end());
+        } else {
+            mtmd_input_chunk chunk{
+                MTMD_INPUT_CHUNK_TYPE_TEXT,
+                tokens,
+                nullptr, // image tokens
+                nullptr, // audio tokens
+            };
+            cur.entries.emplace_back(std::move(chunk));
+        }
+    }
+
+    // preprocess one bitmap (image or audio) into one or more media chunks,
+    // surrounded by the projector's marker strings / slice tokens
+    // returns 0 on success, 2 on unsupported modality or preprocessing failure
+    int32_t add_media(const mtmd_bitmap * bitmap) {
+        if (!bitmap->is_audio) {
+            // handle image
+
+            if (!ctx->ctx_v) {
+                LOG_ERR("%s: error: model does not support vision input\n", __func__);
+                return 2;
+            }
+
+            if (!ctx->img_beg.empty()) {
+                add_text(ctx->img_beg, true); // add image begin token
+            }
+
+            // convert mtmd_bitmap to clip_image_u8
+            clip_image_u8_ptr img_u8(clip_image_u8_init());
+            img_u8->nx = bitmap->nx;
+            img_u8->ny = bitmap->ny;
+            img_u8->buf.resize(bitmap->data.size());
+            std::memcpy(img_u8->buf.data(), bitmap->data.data(), img_u8->nx * img_u8->ny * 3);
+
+            // preprocess image
+            clip_image_f32_batch batch_f32;
+            bool ok = clip_image_preprocess(ctx->ctx_v, img_u8.get(), &batch_f32);
+            if (!ok) {
+                LOG_ERR("Unable to preprocess image\n");
+                return 2;
+            }
+
+            // handle llava-uhd style preprocessing
+            const bool has_tiling_grid = batch_f32.grid_x > 0 && batch_f32.grid_y > 0;
+            if (
+                ctx->slice_tmpl == MTMD_SLICE_TMPL_MINICPMV_2_5
+                || ctx->slice_tmpl == MTMD_SLICE_TMPL_MINICPMV_2_6
+                || ctx->slice_tmpl == MTMD_SLICE_TMPL_LLAMA4
+                || ctx->slice_tmpl == MTMD_SLICE_TMPL_IDEFICS3
+                || (ctx->slice_tmpl == MTMD_SLICE_TMPL_LFM2 && has_tiling_grid)
+            ) {
+                const int n_col = batch_f32.grid_x;
+                const int n_row = batch_f32.grid_y;
+                // split batch into chunks of single images
+                // NOTE: batch_f32 will be invalidated after this call
+                auto chunks = split_batch_to_chunk(std::move(batch_f32), bitmap->id);
+                GGML_ASSERT(chunks.size() > 0);
+
+                auto ov_chunk = std::move(chunks.front());
+                chunks.erase(chunks.begin());
+
+                // add overview image (first)
+                if (ctx->ov_img_first) {
+                    add_text(ctx->tok_ov_img_start);
+                    cur.entries.emplace_back(std::move(ov_chunk));
+                    add_text(ctx->tok_ov_img_end);
+                }
+
+                // add slices (or tiles)
+                if (!chunks.empty()) {
+                    GGML_ASSERT((int)chunks.size() == n_row * n_col);
+                    add_text(ctx->tok_slices_start);
+                    for (int y = 0; y < n_row; y++) {
+                        for (int x = 0; x < n_col; x++) {
+                            const bool is_last_in_row = (x == n_col - 1);
+                            if (!ctx->tok_sli_img_start.empty()) {
+                                add_text(ctx->tok_sli_img_start);
+                            } else if (!ctx->sli_img_start_tmpl.empty()) {
+                                // If using a template to preceed a slice image
+                                const size_t sz = std::snprintf(nullptr, 0, ctx->sli_img_start_tmpl.c_str(), y+1, x+1) + 1;
+                                std::unique_ptr<char[]> buf(new char[sz]);
+                                std::snprintf(buf.get(), sz, ctx->sli_img_start_tmpl.c_str(), y+1, x+1);
+                                add_text(std::string(buf.get(), buf.get() + sz - 1), true);
+                            }
+                            cur.entries.emplace_back(std::move(chunks[y * n_col + x]));
+                            add_text(ctx->tok_sli_img_end);
+                            if (!is_last_in_row) {
+                                add_text(ctx->tok_sli_img_mid);
+                            }
+                        }
+                        if ((y != n_row - 1 || ctx->tok_row_end_trail)) {
+                            add_text(ctx->tok_row_end);
+                        }
+                    }
+                    add_text(ctx->tok_slices_end);
+                }
+
+                // add overview image (last)
+                if (!ctx->ov_img_first) {
+                    add_text(ctx->tok_ov_img_start);
+                    cur.entries.emplace_back(std::move(ov_chunk));
+                    add_text(ctx->tok_ov_img_end);
+                }
+
+            } else {
+                size_t n_tokens = 0;
+                for (const auto & entry : batch_f32.entries) {
+                    n_tokens += clip_n_output_tokens(ctx->ctx_v, entry.get());
+                }
+
+                mtmd_image_tokens_ptr image_tokens(new mtmd_image_tokens);
+                if (mtmd_decode_use_mrope(ctx)) {
+                    // for Qwen2VL, we need this information for M-RoPE decoding positions
+                    image_tokens->nx = clip_n_output_tokens_x(ctx->ctx_v, batch_f32.entries[0].get());
+                    image_tokens->ny = clip_n_output_tokens_y(ctx->ctx_v, batch_f32.entries[0].get());
+                    image_tokens->use_mrope_pos = true;
+                } else {
+                    // other models, we only need the total number of tokens
+                    image_tokens->nx = n_tokens;
+                    image_tokens->ny = 1;
+                }
+                image_tokens->batch_f32 = std::move(batch_f32);
+                image_tokens->id = bitmap->id; // optional
+
+                LOG_DBG("image_tokens->nx = %d\n", image_tokens->nx);
+                LOG_DBG("image_tokens->ny = %d\n", image_tokens->ny);
+                LOG_DBG("batch_f32 size = %d\n", (int)image_tokens->batch_f32.entries.size());
+
+                mtmd_input_chunk chunk{
+                    MTMD_INPUT_CHUNK_TYPE_IMAGE,
+                    {}, // text tokens
+                    std::move(image_tokens),
+                    nullptr, // audio tokens
+                };
+                cur.entries.emplace_back(std::move(chunk));
+            }
+
+            if (!ctx->img_end.empty()) {
+                add_text(ctx->img_end, true); // add image end token
+            }
+
+        } else {
+            // handle audio
+
+            if (!ctx->ctx_a) {
+                LOG_ERR("%s: error: model does not support audio input\n", __func__);
+                return 2;
+            }
+
+            if (bitmap->data.size() == 0) {
+                LOG_ERR("%s: error: empty audio data\n", __func__);
+                return 2;
+            }
+
+            if (!ctx->aud_beg.empty()) {
+                add_text(ctx->aud_beg, true); // add audio begin token
+            }
+
+            // preprocess audio
+            std::vector<mtmd_audio_mel> mel_spec_chunks;
+            const float * samples = (const float *)bitmap->data.data();
+            size_t n_samples = bitmap->data.size() / sizeof(float);
+            bool ok = ctx->audio_preproc->preprocess(samples, n_samples, mel_spec_chunks);
+            if (!ok) {
+                LOG_ERR("Unable to preprocess audio\n");
+                return 2;
+            }
+
+            // consider each mel_spec as a separate audio chunk
+            // TODO: maybe support batching, but this may come with memory cost
+            for (auto & mel_spec : mel_spec_chunks) {
+                clip_image_f32_ptr mel_f32(clip_image_f32_init());
+                mel_f32->nx  = mel_spec.n_len;
+                mel_f32->ny  = mel_spec.n_mel;
+                mel_f32->buf = std::move(mel_spec.data);
+                size_t n_tokens = clip_n_output_tokens(ctx->ctx_a, mel_f32.get());
+
+                clip_image_f32_batch batch_f32;
+                batch_f32.is_audio = true;
+                batch_f32.entries.push_back(std::move(mel_f32));
+
+                mtmd_audio_tokens_ptr audio_tokens(new mtmd_audio_tokens);
+                audio_tokens->n_tokens = n_tokens;
+                audio_tokens->batch_f32 = std::move(batch_f32);
+                audio_tokens->id = bitmap->id; // optional
+
+                LOG_DBG("audio_tokens->n_tokens = %d\n", audio_tokens->n_tokens);
+
+                mtmd_input_chunk chunk{
+                    MTMD_INPUT_CHUNK_TYPE_AUDIO,
+                    {}, // text tokens
+                    nullptr, // image tokens
+                    std::move(audio_tokens),
+                };
+                cur.entries.emplace_back(std::move(chunk));
+            }
+
+            if (!ctx->aud_end.empty()) {
+                add_text(ctx->aud_end, true); // add audio end token
+            }
+        }
+
+        return 0;
+    }
+
+    // consume batch_f32, producing one single-image chunk per entry
+    std::vector<mtmd_input_chunk> split_batch_to_chunk(clip_image_f32_batch && batch_f32, const std::string & id) {
+        std::vector<mtmd_input_chunk> chunks;
+
+        for (auto & entry : batch_f32.entries) {
+            mtmd_image_tokens_ptr image_tokens(new mtmd_image_tokens);
+            image_tokens->nx = clip_n_output_tokens(ctx->ctx_v, entry.get());
+            image_tokens->ny = 1;
+            image_tokens->batch_f32.entries.push_back(std::move(entry));
+            image_tokens->id = id;
+
+            mtmd_input_chunk chunk{
+                MTMD_INPUT_CHUNK_TYPE_IMAGE,
+                {}, // text tokens
+                std::move(image_tokens),
+                nullptr, // audio tokens
+            };
+            chunks.emplace_back(std::move(chunk));
+        }
+
+        return chunks;
+    }
+
+    // for example: "a <__media__> b <__media__> c" --> "a", "<__media__>", "b", "<__media__>", "c"
+    static std::vector<std::string> split_text(const std::string & input, const std::string & delimiter) {
+        std::vector<std::string> result;
+        if (input.empty()) {
+            return result;
+        }
+        size_t start = 0;
+        size_t pos = 0;
+        while ((pos = input.find(delimiter, start)) != std::string::npos) {
+            if (pos > start) {
+                result.push_back(input.substr(start, pos - start));
+            }
+            result.push_back(delimiter);
+            start = pos + delimiter.length();
+        }
+        if (start < input.length()) {
+            result.push_back(input.substr(start));
+        }
+        return result;
+    }
+
+    // copied from common_tokenize
+    static std::vector<llama_token> mtmd_tokenize_text_internal(
+        const struct llama_vocab * vocab,
+               const std::string & text,
+                            bool   add_special,
+                            bool   parse_special) {
+        // upper limit for the number of tokens
+        int n_tokens = text.length() + 2 * add_special;
+        std::vector<llama_token> result(n_tokens);
+        n_tokens = llama_tokenize(vocab, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
+        if (n_tokens < 0) {
+            result.resize(-n_tokens);
+            int check = llama_tokenize(vocab, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
+            GGML_ASSERT(check == -n_tokens);
+        } else {
+            result.resize(n_tokens);
+        }
+        return result;
+    }
+};
+
+// public entry point: build a fresh tokenizer per call and run it
+// (see mtmd_tokenizer::tokenize for the return codes)
+int32_t mtmd_tokenize(mtmd_context * ctx,
+                      mtmd_input_chunks * output,
+                      const mtmd_input_text * text,
+                      const mtmd_bitmap ** bitmaps,
+                      size_t n_bitmaps) {
+    mtmd_tokenizer tokenizer(ctx, text, bitmaps, n_bitmaps);
+    return tokenizer.tokenize(output);
+}
+
+// encode a single chunk; text chunks are a no-op, image/audio chunks are
+// encoded into ctx->image_embd_v; returns 0 on success, 1 on failure
+int32_t mtmd_encode_chunk(mtmd_context * ctx, const mtmd_input_chunk * chunk) {
+    switch (chunk->type) {
+        case MTMD_INPUT_CHUNK_TYPE_TEXT: {
+            LOG_WRN("mtmd_encode_chunk has no effect for text chunks\n");
+            return 0;
+        }
+        case MTMD_INPUT_CHUNK_TYPE_IMAGE: {
+            if (!ctx->ctx_v) {
+                LOG_ERR("%s: model does not support vision input\n", __func__);
+                return 1;
+            }
+            return mtmd_encode(ctx, chunk->tokens_image.get());
+        }
+        case MTMD_INPUT_CHUNK_TYPE_AUDIO: {
+            if (!ctx->ctx_a) {
+                LOG_ERR("%s: model does not support audio input\n", __func__);
+                return 1;
+            }
+            // n_embd of the audio mmproj is validated against the text model at init
+            const int n_mmproj_embd = ctx->n_embd_text;
+            ctx->image_embd_v.resize(chunk->tokens_audio->n_tokens * n_mmproj_embd);
+            const bool ok = clip_image_batch_encode(
+                ctx->ctx_a,
+                ctx->n_threads,
+                &chunk->tokens_audio->batch_f32,
+                ctx->image_embd_v.data());
+            return ok ? 0 : 1;
+        }
+    }
+
+    LOG_ERR("%s: unknown chunk type %d\n", __func__, (int)chunk->type);
+    return 1;
+}
+
+// encode preprocessed image tokens into ctx->image_embd_v
+// returns 0 on success, 1 on failure (no vision ctx, or any encode failing)
+int32_t mtmd_encode(mtmd_context * ctx, const mtmd_image_tokens * image_tokens) {
+    clip_ctx * ctx_clip = ctx->ctx_v;
+    if (!ctx_clip) {
+        LOG_ERR("%s: this API does not support non-vision input, please use mtmd_encode_chunk instead\n", __func__);
+        return 1;
+    }
+    int n_mmproj_embd = clip_n_mmproj_embd(ctx_clip);
+    ctx->image_embd_v.resize(image_tokens->n_tokens() * n_mmproj_embd);
+    bool ok = false;
+
+    if (clip_is_llava(ctx_clip)
+            || clip_is_minicpmv(ctx_clip)
+            || clip_is_glm(ctx_clip)) {
+        // TODO @ngxson : llava does not support batched encoding ; this should be fixed inside clip_image_batch_encode()
+        const auto & entries = image_tokens->batch_f32.entries;
+        for (size_t i = 0; i < entries.size(); i++) {
+            int n_tokens_per_image = clip_n_output_tokens(ctx_clip, entries[i].get());
+            ok = clip_image_encode(
+                ctx_clip,
+                ctx->n_threads,
+                entries[i].get(),
+                ctx->image_embd_v.data() + i*n_mmproj_embd*n_tokens_per_image);
+            if (!ok) {
+                // BUGFIX: stop at the first failure; previously `ok` only
+                // reflected the last iteration, so a later success could mask
+                // an earlier failed encode and 0 was returned incorrectly
+                break;
+            }
+        }
+    } else {
+        ok = clip_image_batch_encode(
+            ctx_clip,
+            ctx->n_threads,
+            &image_tokens->batch_f32,
+            ctx->image_embd_v.data());
+    }
+
+    return ok ? 0 : 1;
+}
+
+// pointer to the embeddings produced by the last mtmd_encode*() call;
+// the buffer is owned by ctx and may be reallocated by the next encode call
+float * mtmd_get_output_embd(mtmd_context * ctx) {
+    return ctx->image_embd_v.data();
+}
+
+// whether image tokens must be decoded with non-causal attention
+// (currently only gemma 3 requires this)
+bool mtmd_decode_use_non_causal(mtmd_context * ctx) {
+    return ctx->proj_type_v() == PROJECTOR_TYPE_GEMMA3;
+}
+
+bool mtmd_decode_use_mrope(mtmd_context * ctx) {
+ switch (ctx->proj_type_v()) {
+ case PROJECTOR_TYPE_QWEN2VL:
+ case PROJECTOR_TYPE_QWEN25VL:
+ case PROJECTOR_TYPE_QWEN3VL:
+ case PROJECTOR_TYPE_GLM4V:
+ return true;
+ default:
+ return false;
+ }
+}
+
+bool mtmd_support_vision(mtmd_context * ctx) {
+ return ctx->ctx_v != nullptr;
+}
+
+bool mtmd_support_audio(mtmd_context * ctx) {
+ return ctx->ctx_a != nullptr;
+}
+
+int mtmd_get_audio_bitrate(mtmd_context * ctx) {
+ if (!ctx->ctx_a) {
+ return -1;
+ }
+ return clip_get_hparams(ctx->ctx_a)->audio_sample_rate;
+}
+
+//
+// public API functions
+//
+
+// mtmd_bitmap
+
+mtmd_bitmap * mtmd_bitmap_init(uint32_t nx,
+ uint32_t ny,
+ const unsigned char * data) {
+ mtmd_bitmap * bitmap = new mtmd_bitmap;
+ bitmap->nx = nx;
+ bitmap->ny = ny;
+ size_t data_size = (size_t)nx * ny * 3;
+ bitmap->data.resize(data_size);
+ std::memcpy(bitmap->data.data(), data, data_size);
+ return bitmap;
+}
+
+mtmd_bitmap * mtmd_bitmap_init_from_audio(size_t n_samples,
+ const float * data) {
+ mtmd_bitmap * bitmap = new mtmd_bitmap;
+ bitmap->nx = n_samples;
+ bitmap->ny = 1;
+ bitmap->is_audio = true;
+ size_t data_size = n_samples * sizeof(float);
+ bitmap->data.resize(data_size);
+ std::memcpy(bitmap->data.data(), data, data_size);
+ return bitmap;
+}
+
+uint32_t mtmd_bitmap_get_nx(const mtmd_bitmap * bitmap) {
+ return bitmap->nx;
+}
+
+uint32_t mtmd_bitmap_get_ny(const mtmd_bitmap * bitmap) {
+ return bitmap->ny;
+}
+
+const unsigned char * mtmd_bitmap_get_data(const mtmd_bitmap * bitmap) {
+ return bitmap->data.data();
+}
+
+size_t mtmd_bitmap_get_n_bytes(const mtmd_bitmap * bitmap) {
+ return bitmap->data.size();
+}
+
+bool mtmd_bitmap_is_audio(const mtmd_bitmap * bitmap) {
+ return bitmap->is_audio;
+}
+
+const char * mtmd_bitmap_get_id(const mtmd_bitmap * bitmap) {
+ return bitmap->id.c_str();
+}
+
+void mtmd_bitmap_set_id(mtmd_bitmap * bitmap, const char * id) {
+ if (id) {
+ bitmap->id = std::string(id);
+ } else {
+ bitmap->id.clear();
+ }
+}
+
+void mtmd_bitmap_free(mtmd_bitmap * bitmap) {
+ if (bitmap) {
+ delete bitmap;
+ }
+}
+
+// mtmd_input_chunks
+
+mtmd_input_chunks * mtmd_input_chunks_init() {
+ return new mtmd_input_chunks;
+}
+
+size_t mtmd_input_chunks_size(const mtmd_input_chunks * chunks) {
+ return chunks->entries.size();
+}
+
+const mtmd_input_chunk * mtmd_input_chunks_get(const mtmd_input_chunks * chunks, size_t idx) {
+ if (idx >= chunks->entries.size()) {
+ return nullptr;
+ }
+ return &chunks->entries[idx];
+}
+
+void mtmd_input_chunks_free(mtmd_input_chunks * chunks) {
+ if (chunks) {
+ delete chunks;
+ }
+}
+
+// mtmd_input_chunk
+
+enum mtmd_input_chunk_type mtmd_input_chunk_get_type(const mtmd_input_chunk * chunk) {
+ return chunk->type;
+}
+
+const llama_token * mtmd_input_chunk_get_tokens_text(const mtmd_input_chunk * chunk, size_t * n_tokens_output) {
+ if (chunk->type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
+ *n_tokens_output = chunk->tokens_text.size();
+ return chunk->tokens_text.data();
+ }
+ *n_tokens_output = 0;
+ return nullptr;
+}
+
+const mtmd_image_tokens * mtmd_input_chunk_get_tokens_image(const mtmd_input_chunk * chunk) {
+ if (chunk->type == MTMD_INPUT_CHUNK_TYPE_IMAGE) {
+ return chunk->tokens_image.get();
+ }
+ return nullptr;
+}
+
+size_t mtmd_input_chunk_get_n_tokens(const mtmd_input_chunk * chunk) {
+ if (chunk->type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
+ return chunk->tokens_text.size();
+ } else if (chunk->type == MTMD_INPUT_CHUNK_TYPE_IMAGE) {
+ return mtmd_image_tokens_get_n_tokens(chunk->tokens_image.get());
+ } else if (chunk->type == MTMD_INPUT_CHUNK_TYPE_AUDIO) {
+ return chunk->tokens_audio->n_tokens;
+ } else {
+ GGML_ABORT("invalid chunk type");
+ }
+}
+
+llama_pos mtmd_input_chunk_get_n_pos(const mtmd_input_chunk * chunk) {
+ if (chunk->type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
+ return chunk->tokens_text.size();
+ } else if (chunk->type == MTMD_INPUT_CHUNK_TYPE_IMAGE) {
+ return mtmd_image_tokens_get_n_pos(chunk->tokens_image.get());
+ } else if (chunk->type == MTMD_INPUT_CHUNK_TYPE_AUDIO) {
+ return chunk->tokens_audio->n_tokens;
+ } else {
+ GGML_ABORT("invalid chunk type");
+ }
+}
+
+const char * mtmd_input_chunk_get_id(const mtmd_input_chunk * chunk) {
+ if (chunk->type == MTMD_INPUT_CHUNK_TYPE_IMAGE) {
+ return chunk->tokens_image->id.c_str();
+ } else if (chunk->type == MTMD_INPUT_CHUNK_TYPE_AUDIO) {
+ return chunk->tokens_audio->id.c_str();
+ }
+ return nullptr;
+}
+
+mtmd_input_chunk * mtmd_input_chunk_copy(const mtmd_input_chunk * chunk) {
+ mtmd_input_chunk * copy = new mtmd_input_chunk{
+ chunk->type,
+ chunk->tokens_text,
+ nullptr,
+ nullptr,
+ };
+ if (chunk->tokens_image) {
+ // copy the image tokens
+ copy->tokens_image = mtmd_image_tokens_ptr(new mtmd_image_tokens());
+ *copy->tokens_image = chunk->tokens_image->clone();
+ }
+ if (chunk->tokens_audio) {
+ // copy the audio tokens
+ copy->tokens_audio = mtmd_audio_tokens_ptr(new mtmd_audio_tokens());
+ *copy->tokens_audio = chunk->tokens_audio->clone();
+ }
+ return copy;
+}
+
+void mtmd_input_chunk_free(mtmd_input_chunk * chunk) {
+ if (chunk) {
+ delete chunk;
+ }
+}
+
+// mtmd_image_tokens
+
+size_t mtmd_image_tokens_get_n_tokens(const mtmd_image_tokens * image_tokens) {
+ return image_tokens->n_tokens();
+}
+
+size_t mtmd_image_tokens_get_nx(const mtmd_image_tokens * image_tokens) {
+ return image_tokens->nx;
+}
+
+size_t mtmd_image_tokens_get_ny(const mtmd_image_tokens * image_tokens) {
+ return image_tokens->ny;
+}
+
+const char * mtmd_image_tokens_get_id(const mtmd_image_tokens * image_tokens) {
+ return image_tokens->id.c_str();
+}
+
+llama_pos mtmd_image_tokens_get_n_pos(const mtmd_image_tokens * image_tokens) {
+ if (image_tokens->use_mrope_pos) {
+ // for M-RoPE, temporal dimension = max(t,h,w)
+ // t is omitted as we don't support video input
+ return std::max(image_tokens->nx, image_tokens->ny);
+ }
+ return image_tokens->n_tokens();
+}
+
+// test function
+
+mtmd_input_chunks * mtmd_test_create_input_chunks() {
+ mtmd_input_chunks * chunks = mtmd_input_chunks_init();
+ if (!chunks) {
+ return nullptr;
+ }
+
+ // create a text chunk
+ std::vector<llama_token> tokens_text = { 1, 2, 3, 4, 5 };
+ mtmd_input_chunk chunk_text{
+ MTMD_INPUT_CHUNK_TYPE_TEXT,
+ std::move(tokens_text),
+ nullptr, // image tokens
+ nullptr, // audio tokens
+ };
+ chunks->entries.emplace_back(std::move(chunk_text));
+
+ // create an image chunk
+ mtmd_image_tokens_ptr image_tokens(new mtmd_image_tokens);
+ image_tokens->nx = 4;
+ image_tokens->ny = 4;
+ image_tokens->batch_f32.entries.resize(16);
+ image_tokens->id = "image_1";
+ mtmd_input_chunk chunk_image{
+ MTMD_INPUT_CHUNK_TYPE_IMAGE,
+ {}, // text tokens
+ std::move(image_tokens),
+ nullptr, // audio tokens
+ };
+ chunks->entries.emplace_back(std::move(chunk_image));
+
+ return chunks;
+}
+
+void mtmd_log_set(ggml_log_callback log_callback, void * user_data) {
+ g_logger_state.log_callback = log_callback ? log_callback : clip_log_callback_default;
+ g_logger_state.log_callback_user_data = user_data;
+}
diff --git a/llama.cpp/tools/mtmd/mtmd.h b/llama.cpp/tools/mtmd/mtmd.h
new file mode 100644
index 0000000..ef25d32
--- /dev/null
+++ b/llama.cpp/tools/mtmd/mtmd.h
@@ -0,0 +1,319 @@
+#ifndef MTMD_H
+#define MTMD_H
+
+#include "ggml.h"
+#include "llama.h"
+
+#include <stddef.h>
+#include <stdint.h>
+#include <stdbool.h>
+
+#ifdef __cplusplus
+#include <string>
+#include <vector>
+#include <cinttypes>
+#include <memory>
+#endif
+
+/**
+ * libmtmd: A library for multimodal support in llama.cpp.
+ *
+ * WARNING: This API is experimental and subject to many BREAKING CHANGES.
+ * Issues related to API usage may receive lower priority support.
+ *
+ * For the usage, see an example in mtmd-cli.cpp
+ *
+ * For contributors:
+ * - Make sure the C API is aligned with the libllama C API (as in llama.h)
+ * - Do not include model name (e.g., qwen, gemma) in the API, use generic terms instead
+ * - Keep the API minimal, do not expose internal details unless necessary
+ *
+ * IMPORTANT: The mtmd module does NOT accept pull requests that are fully or predominantly AI-generated.
+ * We encourage human contributors to ensure the quality and reliability of the codebase.
+ */
+
+#ifdef LLAMA_SHARED
+# if defined(_WIN32) && !defined(__MINGW32__)
+# ifdef LLAMA_BUILD
+# define MTMD_API __declspec(dllexport)
+# else
+# define MTMD_API __declspec(dllimport)
+# endif
+# else
+# define MTMD_API __attribute__ ((visibility ("default")))
+# endif
+#else
+# define MTMD_API
+#endif
+
+// deprecated marker, use mtmd_default_marker() instead
+#define MTMD_DEFAULT_IMAGE_MARKER "<__image__>"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+enum mtmd_input_chunk_type {
+ MTMD_INPUT_CHUNK_TYPE_TEXT,
+ MTMD_INPUT_CHUNK_TYPE_IMAGE,
+ MTMD_INPUT_CHUNK_TYPE_AUDIO,
+};
+
+// opaque types
+struct mtmd_context;
+struct mtmd_bitmap;
+struct mtmd_image_tokens;
+struct mtmd_input_chunk;
+struct mtmd_input_chunks;
+
+struct mtmd_input_text {
+ const char * text;
+ bool add_special;
+ bool parse_special;
+};
+
+//
+// C API
+//
+
+typedef struct mtmd_context mtmd_context;
+typedef struct mtmd_bitmap mtmd_bitmap;
+typedef struct mtmd_image_tokens mtmd_image_tokens;
+typedef struct mtmd_input_chunk mtmd_input_chunk;
+typedef struct mtmd_input_chunks mtmd_input_chunks;
+typedef struct mtmd_input_text mtmd_input_text;
+
+struct mtmd_context_params {
+ bool use_gpu;
+ bool print_timings;
+ int n_threads;
+ const char * image_marker; // deprecated, use media_marker instead
+ const char * media_marker;
+ enum llama_flash_attn_type flash_attn_type;
+ bool warmup; // whether to run a warmup encode pass after initialization
+
+ // limit number of image tokens, only for vision models with dynamic resolution
+ int image_min_tokens; // minimum number of tokens for image input (default: read from metadata)
+ int image_max_tokens; // maximum number of tokens for image input (default: read from metadata)
+
+ // callback function passed over to mtmd proper
+ ggml_backend_sched_eval_callback cb_eval;
+ void * cb_eval_user_data;
+};
+
+MTMD_API const char * mtmd_default_marker(void);
+
+MTMD_API struct mtmd_context_params mtmd_context_params_default(void);
+
+// initialize the mtmd context
+// return nullptr on failure
+MTMD_API mtmd_context * mtmd_init_from_file(const char * mmproj_fname,
+ const struct llama_model * text_model,
+ const struct mtmd_context_params ctx_params);
+
+MTMD_API void mtmd_free(mtmd_context * ctx);
+
+// whether we need to set non-causal mask before llama_decode
+MTMD_API bool mtmd_decode_use_non_causal(mtmd_context * ctx);
+
+// whether the current model use M-RoPE for llama_decode
+MTMD_API bool mtmd_decode_use_mrope(mtmd_context * ctx);
+
+// whether the current model supports vision input
+MTMD_API bool mtmd_support_vision(mtmd_context * ctx);
+
+// whether the current model supports audio input
+MTMD_API bool mtmd_support_audio(mtmd_context * ctx);
+
+// get audio sample rate in Hz, for example 16000 for Whisper
+// return -1 if audio is not supported
+MTMD_API int mtmd_get_audio_bitrate(mtmd_context * ctx);
+
+// mtmd_bitmap
+//
+// if bitmap is image:
+// length of data must be nx * ny * 3
+// the data is in RGBRGBRGB... format
+// if bitmap is audio:
+// length of data must be n_samples * sizeof(float)
+// the data is in float format (PCM F32)
+MTMD_API mtmd_bitmap * mtmd_bitmap_init (uint32_t nx, uint32_t ny, const unsigned char * data);
+MTMD_API mtmd_bitmap * mtmd_bitmap_init_from_audio(size_t n_samples, const float * data);
+MTMD_API uint32_t mtmd_bitmap_get_nx (const mtmd_bitmap * bitmap);
+MTMD_API uint32_t mtmd_bitmap_get_ny (const mtmd_bitmap * bitmap);
+MTMD_API const unsigned char * mtmd_bitmap_get_data (const mtmd_bitmap * bitmap);
+MTMD_API size_t mtmd_bitmap_get_n_bytes(const mtmd_bitmap * bitmap);
+MTMD_API bool mtmd_bitmap_is_audio (const mtmd_bitmap * bitmap);
+MTMD_API void mtmd_bitmap_free (mtmd_bitmap * bitmap);
+// bitmap ID is optional, but useful for KV cache tracking
+// these getters/setters are dedicated functions, so you can for example calculate the hash of the image based on mtmd_bitmap_get_data()
+MTMD_API const char * mtmd_bitmap_get_id(const mtmd_bitmap * bitmap);
+MTMD_API void mtmd_bitmap_set_id(mtmd_bitmap * bitmap, const char * id);
+
+
+// mtmd_input_chunks
+//
+// this is simply a list of mtmd_input_chunk
+// the elements can only be populated via mtmd_tokenize()
+MTMD_API mtmd_input_chunks * mtmd_input_chunks_init(void);
+MTMD_API size_t mtmd_input_chunks_size(const mtmd_input_chunks * chunks);
+MTMD_API const mtmd_input_chunk * mtmd_input_chunks_get (const mtmd_input_chunks * chunks, size_t idx);
+MTMD_API void mtmd_input_chunks_free(mtmd_input_chunks * chunks);
+
+// mtmd_input_chunk
+//
+// the instance will be constructed via mtmd_tokenize()
+// it will be freed along with mtmd_input_chunks
+MTMD_API enum mtmd_input_chunk_type mtmd_input_chunk_get_type (const mtmd_input_chunk * chunk);
+MTMD_API const llama_token * mtmd_input_chunk_get_tokens_text (const mtmd_input_chunk * chunk, size_t * n_tokens_output);
+MTMD_API const mtmd_image_tokens * mtmd_input_chunk_get_tokens_image(const mtmd_input_chunk * chunk);
+MTMD_API size_t mtmd_input_chunk_get_n_tokens (const mtmd_input_chunk * chunk);
+// returns nullptr for ID on text chunk
+MTMD_API const char * mtmd_input_chunk_get_id (const mtmd_input_chunk * chunk);
+// number of temporal positions (equals to max(t,h,w) for M-RoPE; equals to n_tokens otherwise)
+MTMD_API llama_pos mtmd_input_chunk_get_n_pos (const mtmd_input_chunk * chunk);
+
+// in case you want to use custom logic to handle the chunk (i.e. KV cache management)
+// you can move the chunk ownership to your own code by copying it
+// remember to free the chunk when you are done with it
+MTMD_API mtmd_input_chunk * mtmd_input_chunk_copy(const mtmd_input_chunk * chunk);
+MTMD_API void mtmd_input_chunk_free(mtmd_input_chunk * chunk);
+
+
+// mtmd_image_tokens
+//
+// the instance will be constructed via mtmd_tokenize()
+// it will be freed along with mtmd_input_chunk
+MTMD_API size_t mtmd_image_tokens_get_n_tokens(const mtmd_image_tokens * image_tokens); // TODO: deprecate
+MTMD_API size_t mtmd_image_tokens_get_nx (const mtmd_image_tokens * image_tokens);
+MTMD_API size_t mtmd_image_tokens_get_ny (const mtmd_image_tokens * image_tokens);
+MTMD_API const char * mtmd_image_tokens_get_id (const mtmd_image_tokens * image_tokens); // TODO: deprecate
+// number of temporal positions (equals to max(t,h,w) for M-RoPE; equals to n_tokens otherwise)
+MTMD_API llama_pos mtmd_image_tokens_get_n_pos (const mtmd_image_tokens * image_tokens); // TODO: deprecate
+
+// tokenize an input text prompt and a list of bitmaps (images/audio)
+// the prompt must have the input media marker (default: "<__media__>") in it
+// the default marker is defined by mtmd_default_marker()
+// the marker will be replaced with the image/audio chunk
+// for example:
+// "here is an image: <__media__>\ndescribe it in detail."
+// this will give 3 chunks:
+// 1. "here is an image: <start_of_image>"
+// 2. (image/audio tokens)
+// 3. "<end_of_image>\ndescribe it in detail."
+// number of bitmaps must be equal to the number of markers in the prompt
+// this function is thread-safe (shared ctx)
+// return values:
+// 0 on success
+// 1 on number of bitmaps not matching the number of markers
+// 2 on image preprocessing error
+MTMD_API int32_t mtmd_tokenize(mtmd_context * ctx,
+ mtmd_input_chunks * output,
+ const mtmd_input_text * text,
+ const mtmd_bitmap ** bitmaps,
+ size_t n_bitmaps);
+
+// returns 0 on success
+// TODO: deprecate
+MTMD_API int32_t mtmd_encode(mtmd_context * ctx,
+ const mtmd_image_tokens * image_tokens);
+
+// returns 0 on success
+MTMD_API int32_t mtmd_encode_chunk(mtmd_context * ctx,
+ const mtmd_input_chunk * chunk);
+
+// get output embeddings from the last encode pass
+// the reading size (in bytes) is equal to:
+// llama_model_n_embd_inp(model) * mtmd_input_chunk_get_n_tokens(chunk) * sizeof(float)
+MTMD_API float * mtmd_get_output_embd(mtmd_context * ctx);
+
+// Set callback for all future logging events.
+// If this is not called, or NULL is supplied, everything is output on stderr.
+MTMD_API void mtmd_log_set(ggml_log_callback log_callback, void * user_data);
+
+/////////////////////////////////////////
+
+// test function, to be used in test-mtmd-c-api.c
+MTMD_API mtmd_input_chunks * mtmd_test_create_input_chunks(void);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+//
+// C++ wrappers
+//
+
+#ifdef __cplusplus
+
+namespace mtmd {
+
+struct mtmd_context_deleter {
+ void operator()(mtmd_context * val) { mtmd_free(val); }
+};
+using context_ptr = std::unique_ptr<mtmd_context, mtmd_context_deleter>;
+
+struct mtmd_bitmap_deleter {
+ void operator()(mtmd_bitmap * val) { mtmd_bitmap_free(val); }
+};
+using bitmap_ptr = std::unique_ptr<mtmd_bitmap, mtmd_bitmap_deleter>;
+
+struct mtmd_input_chunks_deleter {
+ void operator()(mtmd_input_chunks * val) { mtmd_input_chunks_free(val); }
+};
+using input_chunks_ptr = std::unique_ptr<mtmd_input_chunks, mtmd_input_chunks_deleter>;
+
+struct mtmd_input_chunk_deleter {
+ void operator()(mtmd_input_chunk * val) { mtmd_input_chunk_free(val); }
+};
+using input_chunk_ptr = std::unique_ptr<mtmd_input_chunk, mtmd_input_chunk_deleter>;
+
+struct bitmap {
+ bitmap_ptr ptr;
+ bitmap() : ptr(nullptr) {}
+ bitmap(mtmd_bitmap * bitmap) : ptr(bitmap) {}
+ bitmap(bitmap && other) noexcept : ptr(std::move(other.ptr)) {}
+ bitmap(uint32_t nx, uint32_t ny, const unsigned char * data) {
+ ptr.reset(mtmd_bitmap_init(nx, ny, data));
+ }
+ ~bitmap() = default;
+ uint32_t nx() const { return mtmd_bitmap_get_nx(ptr.get()); }
+ uint32_t ny() const { return mtmd_bitmap_get_ny(ptr.get()); }
+ const unsigned char * data() const { return mtmd_bitmap_get_data(ptr.get()); }
+ size_t n_bytes() const { return mtmd_bitmap_get_n_bytes(ptr.get()); }
+ std::string id() const { return mtmd_bitmap_get_id(ptr.get()); }
+ void set_id(const char * id) const { mtmd_bitmap_set_id(ptr.get(), id); }
+};
+
+struct bitmaps {
+ std::vector<bitmap> entries;
+ ~bitmaps() = default;
+ // return list of pointers to mtmd_bitmap
+ // example:
+ // auto bitmaps_c_ptr = bitmaps.c_ptr();
+ // int32_t res = mtmd_tokenize(... bitmaps_c_ptr.data(), bitmaps_c_ptr.size());
+ std::vector<const mtmd_bitmap *> c_ptr() {
+ std::vector<const mtmd_bitmap *> res(entries.size());
+ for (size_t i = 0; i < entries.size(); i++) {
+ res[i] = entries[i].ptr.get();
+ }
+ return res;
+ }
+};
+
+struct input_chunks {
+ input_chunks_ptr ptr;
+ input_chunks() = default;
+ input_chunks(mtmd_input_chunks * chunks) : ptr(chunks) {}
+ ~input_chunks() = default;
+ size_t size() const { return mtmd_input_chunks_size(ptr.get()); }
+ const mtmd_input_chunk * operator[](size_t idx) const {
+ return mtmd_input_chunks_get(ptr.get(), idx);
+ }
+};
+
+} // namespace mtmd
+
+#endif
+
+#endif
diff --git a/llama.cpp/tools/mtmd/requirements.txt b/llama.cpp/tools/mtmd/requirements.txt
new file mode 100644
index 0000000..0a1f4e8
--- /dev/null
+++ b/llama.cpp/tools/mtmd/requirements.txt
@@ -0,0 +1,5 @@
+-r ../../requirements/requirements-convert_legacy_llama.txt
+--extra-index-url https://download.pytorch.org/whl/cpu
+pillow~=11.3.0
+torch~=2.6.0
+torchvision~=0.21.0
diff --git a/llama.cpp/tools/mtmd/test-1.jpeg b/llama.cpp/tools/mtmd/test-1.jpeg
new file mode 100644
index 0000000..7fdcaaf
--- /dev/null
+++ b/llama.cpp/tools/mtmd/test-1.jpeg
Binary files differ
diff --git a/llama.cpp/tools/mtmd/test-2.mp3 b/llama.cpp/tools/mtmd/test-2.mp3
new file mode 100644
index 0000000..aa9d7ec
--- /dev/null
+++ b/llama.cpp/tools/mtmd/test-2.mp3
Binary files differ
diff --git a/llama.cpp/tools/mtmd/tests.sh b/llama.cpp/tools/mtmd/tests.sh
new file mode 100755
index 0000000..012958e
--- /dev/null
+++ b/llama.cpp/tools/mtmd/tests.sh
@@ -0,0 +1,183 @@
+#!/usr/bin/env bash
+
+# make sure we are in the right directory
+SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
+cd $SCRIPT_DIR
+
+#export LLAMA_CACHE="$SCRIPT_DIR/tmp"
+
+set -eux
+
+mkdir -p $SCRIPT_DIR/output
+
+PROJ_ROOT="$SCRIPT_DIR/../.."
+cd $PROJ_ROOT
+
+# Check if the first argument is "big", then run test with big models
+# This is useful if we're running the script on a larger machine, so we can test the big models
+RUN_BIG_TESTS=false
+if [ "${1:-}" = "big" ]; then
+ RUN_BIG_TESTS=true
+ echo "Include BIG models..."
+fi
+
+RUN_HUGE_TESTS=false
+if [ "${1:-}" = "huge" ]; then
+ RUN_HUGE_TESTS=true
+ RUN_BIG_TESTS=true
+ echo "Include BIG and HUGE models..."
+fi
+
+###############
+
+arr_prefix=()
+arr_hf=()
+arr_extra_args=()
+arr_file=()
+
+add_test_vision() {
+ local hf=$1
+ shift
+ local extra_args=""
+ if [ $# -gt 0 ]; then
+ extra_args=$(printf " %q" "$@")
+ fi
+ arr_prefix+=("[vision]")
+ arr_hf+=("$hf")
+ arr_extra_args+=("$extra_args")
+ arr_file+=("test-1.jpeg")
+}
+
+add_test_audio() {
+ local hf=$1
+ shift
+ local extra_args=""
+ if [ $# -gt 0 ]; then
+ extra_args=$(printf " %q" "$@")
+ fi
+ arr_prefix+=("[audio] ")
+ arr_hf+=("$hf")
+ arr_extra_args+=("$extra_args")
+ arr_file+=("test-2.mp3")
+}
+
+add_test_vision "ggml-org/SmolVLM-500M-Instruct-GGUF:Q8_0"
+add_test_vision "ggml-org/SmolVLM2-2.2B-Instruct-GGUF:Q4_K_M"
+add_test_vision "ggml-org/SmolVLM2-500M-Video-Instruct-GGUF:Q8_0"
+add_test_vision "ggml-org/gemma-3-4b-it-GGUF:Q4_K_M"
+add_test_vision "THUDM/glm-edge-v-5b-gguf:Q4_K_M" -p "name of the newspaper?<__media__>"
+add_test_vision "second-state/Llava-v1.5-7B-GGUF:Q2_K" --chat-template vicuna
+add_test_vision "cjpais/llava-1.6-mistral-7b-gguf:Q3_K_M" --chat-template vicuna
+add_test_vision "ibm-research/granite-vision-3.2-2b-GGUF:Q4_K_M"
+add_test_vision "second-state/MiniCPM-Llama3-V-2_5-GGUF:Q2_K" # model from openbmb is corrupted
+add_test_vision "openbmb/MiniCPM-V-2_6-gguf:Q2_K"
+add_test_vision "openbmb/MiniCPM-o-2_6-gguf:Q4_0"
+add_test_vision "bartowski/Qwen2-VL-2B-Instruct-GGUF:Q4_K_M"
+add_test_vision "ggml-org/Qwen2.5-VL-3B-Instruct-GGUF:Q4_K_M"
+add_test_vision "ggml-org/InternVL2_5-1B-GGUF:Q8_0"
+add_test_vision "ggml-org/InternVL3-1B-Instruct-GGUF:Q8_0"
+add_test_vision "ggml-org/Qwen2.5-Omni-3B-GGUF:Q4_K_M"
+add_test_vision "ggml-org/LFM2-VL-450M-GGUF:Q8_0"
+add_test_vision "ggml-org/granite-docling-258M-GGUF:Q8_0"
+add_test_vision "ggml-org/LightOnOCR-1B-1025-GGUF:Q8_0"
+
+add_test_audio "ggml-org/ultravox-v0_5-llama-3_2-1b-GGUF:Q8_0"
+add_test_audio "ggml-org/Qwen2.5-Omni-3B-GGUF:Q4_K_M"
+add_test_audio "ggml-org/Voxtral-Mini-3B-2507-GGUF:Q4_K_M"
+add_test_audio "ggml-org/LFM2-Audio-1.5B-GGUF:Q8_0"
+
+# to test the big models, run: ./tests.sh big
+if [ "$RUN_BIG_TESTS" = true ]; then
+ add_test_vision "ggml-org/pixtral-12b-GGUF:Q4_K_M"
+ add_test_vision "ggml-org/Mistral-Small-3.1-24B-Instruct-2503-GGUF" --chat-template mistral-v7
+ add_test_vision "ggml-org/Qwen2-VL-2B-Instruct-GGUF:Q4_K_M"
+ add_test_vision "ggml-org/Qwen2-VL-7B-Instruct-GGUF:Q4_K_M"
+ add_test_vision "ggml-org/Qwen2.5-VL-3B-Instruct-GGUF:Q4_K_M"
+ add_test_vision "ggml-org/Qwen2.5-VL-7B-Instruct-GGUF:Q4_K_M"
+ add_test_vision "ggml-org/Qwen3-VL-2B-Instruct-GGUF:Q8_0"
+ add_test_vision "ggml-org/InternVL3-8B-Instruct-GGUF:Q4_K_M"
+ add_test_vision "ggml-org/InternVL3-14B-Instruct-GGUF:Q4_K_M"
+ add_test_vision "ggml-org/Qwen2.5-Omni-7B-GGUF:Q4_K_M"
+ # add_test_vision "ggml-org/Qwen2.5-VL-32B-Instruct-GGUF:Q4_K_M" # does not work on my mac M3 Ultra
+ # add_test_vision "ggml-org/Kimi-VL-A3B-Thinking-2506-GGUF:Q4_K_M" # not always working
+
+ add_test_audio "ggml-org/ultravox-v0_5-llama-3_1-8b-GGUF:Q4_K_M"
+ add_test_audio "ggml-org/Qwen2.5-Omni-7B-GGUF:Q4_K_M"
+fi
+
+# to test the huge models, run: ./tests.sh huge
+# this will run both the big and huge models
+# huge models are > 32B parameters
+if [ "$RUN_HUGE_TESTS" = true ]; then
+ add_test_vision "ggml-org/Qwen2.5-VL-72B-Instruct-GGUF:Q4_K_M"
+ add_test_vision "ggml-org/Llama-4-Scout-17B-16E-Instruct-GGUF:IQ1_S"
+fi
+
+# these models always give the wrong answer, not sure why
+# add_test_vision "ggml-org/SmolVLM-Instruct-GGUF:Q4_K_M"
+# add_test_vision "ggml-org/SmolVLM-256M-Instruct-GGUF:Q8_0"
+# add_test_vision "ggml-org/SmolVLM2-256M-Video-Instruct-GGUF:Q8_0"
+
+# this model has broken chat template, not usable
+# add_test_vision "cmp-nct/Yi-VL-6B-GGUF:Q5_K"
+# add_test_vision "guinmoon/MobileVLM-3B-GGUF:Q4_K_M" "deepseek"
+
+###############
+
+cmake --build build -j --target llama-mtmd-cli
+
+arr_res=()
+
+for i in "${!arr_hf[@]}"; do
+ bin="llama-mtmd-cli"
+ prefix="${arr_prefix[$i]}"
+ hf="${arr_hf[$i]}"
+ extra_args="${arr_extra_args[$i]}"
+ inp_file="${arr_file[$i]}"
+
+ echo "Running test with binary: $bin and HF model: $hf"
+ echo ""
+ echo ""
+
+ cmd="$(printf %q "$PROJ_ROOT/build/bin/$bin") \
+ -hf $(printf %q "$hf") \
+ --image $(printf %q "$SCRIPT_DIR/$inp_file") \
+ --temp 0 -n 128 \
+ ${extra_args}"
+
+ # if extra_args does not contain -p, we add a default prompt
+ if ! [[ "$extra_args" =~ "-p" ]]; then
+ cmd+=" -p \"what is the publisher name of the newspaper?\""
+ fi
+
+ output=$(eval "$cmd" 2>&1 | tee /dev/tty)
+
+ echo "$output" > $SCRIPT_DIR/output/$bin-$(echo "$hf" | tr '/' '-').log
+
+ # either contains "new york" or both "men" and "walk"
+ if echo "$output" | grep -iq "new york" \
+ || (echo "$output" | grep -iq "men" && echo "$output" | grep -iq "walk")
+ then
+ result="$prefix \033[32mOK\033[0m: $hf"
+ else
+ result="$prefix \033[31mFAIL\033[0m: $hf"
+ fi
+ echo -e "$result"
+ arr_res+=("$result")
+
+ echo ""
+ echo ""
+ echo ""
+ echo "#################################################"
+ echo "#################################################"
+ echo ""
+ echo ""
+done
+
+set +x
+
+for i in "${!arr_res[@]}"; do
+ echo -e "${arr_res[$i]}"
+done
+echo ""
+echo "Output logs are saved in $SCRIPT_DIR/output"