diff options
| author | Mitja Felicijan <mitja.felicijan@gmail.com> | 2026-02-12 20:57:17 +0100 |
|---|---|---|
| committer | Mitja Felicijan <mitja.felicijan@gmail.com> | 2026-02-12 20:57:17 +0100 |
| commit | b333b06772c89d96aacb5490d6a219fba7c09cc6 (patch) | |
| tree | 211df60083a5946baa2ed61d33d8121b7e251b06 /llama.cpp/tools/mtmd | |
| download | llmnpc-b333b06772c89d96aacb5490d6a219fba7c09cc6.tar.gz | |
Engage!
Diffstat (limited to 'llama.cpp/tools/mtmd')
43 files changed, 13569 insertions, 0 deletions
diff --git a/llama.cpp/tools/mtmd/CMakeLists.txt b/llama.cpp/tools/mtmd/CMakeLists.txt new file mode 100644 index 0000000..02d71f2 --- /dev/null +++ b/llama.cpp/tools/mtmd/CMakeLists.txt @@ -0,0 +1,96 @@ +# mtmd + +find_package(Threads REQUIRED) + +add_library(mtmd + mtmd.cpp + mtmd-audio.cpp + mtmd.h + mtmd-helper.cpp + mtmd-helper.h + clip.cpp + clip.h + clip-impl.h + clip-model.h + clip-graph.h + models/models.h + models/cogvlm.cpp + models/conformer.cpp + models/glm4v.cpp + models/internvl.cpp + models/kimivl.cpp + models/kimik25.cpp + models/llama4.cpp + models/llava.cpp + models/minicpmv.cpp + models/pixtral.cpp + models/qwen2vl.cpp + models/qwen3vl.cpp + models/siglip.cpp + models/whisper-enc.cpp + models/mobilenetv5.cpp + models/youtuvl.cpp + ) + +set_target_properties(mtmd PROPERTIES + VERSION ${LLAMA_INSTALL_VERSION} + SOVERSION 0 + MACHO_CURRENT_VERSION 0 # keep macOS linker from seeing oversized version number +) + +target_link_libraries (mtmd PUBLIC ggml llama) +target_link_libraries (mtmd PRIVATE Threads::Threads) +target_include_directories(mtmd PUBLIC .) +target_include_directories(mtmd PRIVATE ../..) 
+target_include_directories(mtmd PRIVATE ../../vendor) +target_compile_features (mtmd PRIVATE cxx_std_17) + +if (BUILD_SHARED_LIBS) + set_target_properties (mtmd PROPERTIES POSITION_INDEPENDENT_CODE ON) + target_compile_definitions(mtmd PRIVATE LLAMA_BUILD) + target_compile_definitions(mtmd PUBLIC LLAMA_SHARED) +endif() + +set(MTMD_PUBLIC_HEADERS + ${CMAKE_CURRENT_SOURCE_DIR}/mtmd.h + ${CMAKE_CURRENT_SOURCE_DIR}/mtmd-helper.h + ) + +set_target_properties(mtmd + PROPERTIES + PUBLIC_HEADER "${MTMD_PUBLIC_HEADERS}") + +install(TARGETS mtmd LIBRARY PUBLIC_HEADER) + +if (NOT MSVC) + # for stb_image.h and miniaudio.h + target_compile_options(mtmd PRIVATE -Wno-cast-qual) +endif() + +if (TARGET BUILD_INFO) + add_dependencies(mtmd BUILD_INFO) + add_dependencies(mtmd-helper BUILD_INFO) +endif() + +# if mtmd is linked against common, we throw an error +if (TARGET mtmd) + get_target_property(libs mtmd LINK_LIBRARIES) + if (libs AND "common" IN_LIST libs) + message(FATAL_ERROR "mtmd is designed to be a public library.\n" + "It must not link against common") + endif() +endif() + +add_executable(llama-llava-cli deprecation-warning.cpp) +add_executable(llama-gemma3-cli deprecation-warning.cpp) +add_executable(llama-minicpmv-cli deprecation-warning.cpp) +add_executable(llama-qwen2vl-cli deprecation-warning.cpp) + +set(TARGET llama-mtmd-cli) +add_executable (${TARGET} mtmd-cli.cpp) +set_target_properties (${TARGET} PROPERTIES OUTPUT_NAME llama-mtmd-cli) +if(LLAMA_TOOLS_INSTALL) + install(TARGETS ${TARGET} RUNTIME) +endif() +target_link_libraries (${TARGET} PRIVATE common mtmd Threads::Threads) +target_compile_features(${TARGET} PRIVATE cxx_std_17) diff --git a/llama.cpp/tools/mtmd/README.md b/llama.cpp/tools/mtmd/README.md new file mode 100644 index 0000000..ef31d19 --- /dev/null +++ b/llama.cpp/tools/mtmd/README.md @@ -0,0 +1,63 @@ +# Multimodal Support in llama.cpp + +This directory provides multimodal capabilities for `llama.cpp`. 
Initially intended as a showcase for running LLaVA models, its scope has expanded significantly over time to include various other vision-capable models. As a result, LLaVA is no longer the only multimodal architecture supported. + +> [!IMPORTANT] +> +> Multimodal support can be viewed as a sub-project within `llama.cpp`. It is under **very heavy development**, and **breaking changes are expected**. + +The naming and structure related to multimodal support have evolved, which might cause some confusion. Here's a brief timeline to clarify: + +- [#3436](https://github.com/ggml-org/llama.cpp/pull/3436): Initial support for LLaVA 1.5 was added, introducing `llava.cpp` and `clip.cpp`. The `llava-cli` binary was created for model interaction. +- [#4954](https://github.com/ggml-org/llama.cpp/pull/4954): Support for MobileVLM was added, becoming the second vision model supported. This built upon the existing `llava.cpp`, `clip.cpp`, and `llava-cli` infrastructure. +- **Expansion & Fragmentation:** Many new models were subsequently added (e.g., [#7599](https://github.com/ggml-org/llama.cpp/pull/7599), [#10361](https://github.com/ggml-org/llama.cpp/pull/10361), [#12344](https://github.com/ggml-org/llama.cpp/pull/12344), and others). However, `llava-cli` lacked support for the increasingly complex chat templates required by these models. This led to the creation of model-specific binaries like `qwen2vl-cli`, `minicpmv-cli`, and `gemma3-cli`. While functional, this proliferation of command-line tools became confusing for users. +- [#12849](https://github.com/ggml-org/llama.cpp/pull/12849): `libmtmd` was introduced as a replacement for `llava.cpp`. Its goals include providing a single, unified command-line interface, improving the user/developer experience (UX/DX), and supporting both audio and image inputs. +- [#13012](https://github.com/ggml-org/llama.cpp/pull/13012): `mtmd-cli` was added, consolidating the various model-specific CLIs into a single tool powered by `libmtmd`. 
+ +## Pre-quantized models + +See the list of pre-quantized models [here](../../docs/multimodal.md) + +## How it works and what is `mmproj`? + +Multimodal support in `llama.cpp` works by encoding images into embeddings using a separate model component, and then feeding these embeddings into the language model. + +This approach keeps the multimodal components distinct from the core `libllama` library. Separating these allows for faster, independent development cycles. While many modern vision models are based on Vision Transformers (ViTs), their specific pre-processing and projection steps can vary significantly. Integrating this diverse complexity directly into `libllama` is currently challenging. + +Consequently, running a multimodal model typically requires two GGUF files: +1. The standard language model file. +2. A corresponding **multimodal projector (`mmproj`)** file, which handles the image encoding and projection. + +## What is `libmtmd`? + +As outlined in the history, `libmtmd` is the modern library designed to replace the original `llava.cpp` implementation for handling multimodal inputs. + +Built upon `clip.cpp` (similar to `llava.cpp`), `libmtmd` offers several advantages: +- **Unified Interface:** Aims to consolidate interaction for various multimodal models. +- **Improved UX/DX:** Features a more intuitive API, inspired by the `Processor` class in the Hugging Face `transformers` library. +- **Flexibility:** Designed to support multiple input types (text, audio, images) while respecting the wide variety of chat templates used by different models. + +## How to obtain `mmproj` + +Multimodal projector (`mmproj`) files are specific to each model architecture. 
+ +For the following models, you can use `convert_hf_to_gguf.py` with the `--mmproj` flag to get the `mmproj` file: +- [Gemma 3](https://huggingface.co/collections/google/gemma-3-release-67c6c6f89c4f76621268bb6d) ; See the guide [here](../../docs/multimodal/gemma3.md) - Note: 1B variant does not have vision support +- SmolVLM (from [HuggingFaceTB](https://huggingface.co/HuggingFaceTB)) +- SmolVLM2 (from [HuggingFaceTB](https://huggingface.co/HuggingFaceTB)) +- [Pixtral 12B](https://huggingface.co/mistral-community/pixtral-12b) - only works with `transformers`-compatible checkpoint +- Qwen 2 VL and Qwen 2.5 VL (from [Qwen](https://huggingface.co/Qwen)) +- [Mistral Small 3.1 24B](https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503) +- InternVL 2.5 and InternVL 3 from [OpenGVLab](https://huggingface.co/OpenGVLab) (note: we don't support conversion of `InternVL3-*-hf` model, only non-HF version is supported ; `InternLM2Model` **text** model is not supported) + +For older models, please refer to the relevant guide for instructions on how to obtain or create them: + +NOTE: conversion scripts are located under `tools/mtmd/legacy-models` + +- [LLaVA](../../docs/multimodal/llava.md) +- [MobileVLM](../../docs/multimodal/MobileVLM.md) +- [GLM-Edge](../../docs/multimodal/glmedge.md) +- [MiniCPM-V 2.5](../../docs/multimodal/minicpmv2.5.md) +- [MiniCPM-V 2.6](../../docs/multimodal/minicpmv2.6.md) +- [MiniCPM-o 2.6](../../docs/multimodal/minicpmo2.6.md) +- [IBM Granite Vision](../../docs/multimodal/granitevision.md) diff --git a/llama.cpp/tools/mtmd/clip-graph.h b/llama.cpp/tools/mtmd/clip-graph.h new file mode 100644 index 0000000..4c7f750 --- /dev/null +++ b/llama.cpp/tools/mtmd/clip-graph.h @@ -0,0 +1,117 @@ +#pragma once + +#include "ggml.h" +#include "ggml-cpp.h" +#include "clip.h" +#include "clip-impl.h" +#include "clip-model.h" + +#include <vector> +#include <functional> + +#define DEFAULT_INTERPOLATION_MODE (GGML_SCALE_MODE_BILINEAR | 
GGML_SCALE_FLAG_ANTIALIAS) + +struct clip_graph { + const clip_model & model; + const clip_hparams & hparams; + projector_type proj_type; + + // we only support single image per batch + const clip_image_f32 & img; + + const int patch_size; + const int n_patches_x; + const int n_patches_y; + const int n_patches; + const int n_embd; + const int n_head; + const int d_head; + const int n_layer; + const int n_mmproj_embd; + const float eps; + const float kq_scale; + const clip_flash_attn_type flash_attn_type; + + ggml_context_ptr ctx0_ptr; + ggml_context * ctx0; + ggml_cgraph * gf; + + clip_graph(clip_ctx * ctx, const clip_image_f32 & img); + + virtual ~clip_graph() = default; + virtual ggml_cgraph * build() = 0; + + // + // utility functions + // + void cb(ggml_tensor * cur0, const char * name, int il) const; + + // siglip2 naflex + ggml_tensor * resize_position_embeddings(uint32_t interpolation_mode = DEFAULT_INTERPOLATION_MODE); + + // build vision transformer (ViT) cgraph + // this function should cover most of the models + // if your model has specific features, you should probably duplicate this function + ggml_tensor * build_vit( + ggml_tensor * inp, + int64_t n_pos, + norm_type norm_t, + ffn_op_type ffn_t, + ggml_tensor * learned_pos_embd, + std::function<ggml_tensor *(ggml_tensor *, const clip_layer &)> add_pos); + + // build the input after conv2d (inp_raw --> patches) + // returns tensor with shape [n_embd, n_patches] + ggml_tensor * build_inp(); + + ggml_tensor * build_inp_raw(int channels = 3); + + ggml_tensor * build_norm( + ggml_tensor * cur, + ggml_tensor * mw, + ggml_tensor * mb, + norm_type type, + float norm_eps, + int il) const; + + ggml_tensor * build_ffn( + ggml_tensor * cur, + ggml_tensor * up, + ggml_tensor * up_b, + ggml_tensor * gate, + ggml_tensor * gate_b, + ggml_tensor * down, + ggml_tensor * down_b, + ffn_op_type type_op, + int il) const; + + ggml_tensor * build_attn( + ggml_tensor * wo, + ggml_tensor * wo_b, + ggml_tensor * q_cur, + 
ggml_tensor * k_cur, + ggml_tensor * v_cur, + ggml_tensor * kq_mask, + float kq_scale, + int il) const; + + // implementation of the 2D RoPE without adding a new op in ggml + // this is not efficient (use double the memory), but works on all backends + // TODO: there was a more efficient which relies on ggml_view and ggml_rope_ext_inplace, but the rope inplace does not work well with non-contiguous tensors ; we should fix that and revert back to the original implementation in https://github.com/ggml-org/llama.cpp/pull/13065 + ggml_tensor * build_rope_2d( + ggml_context * ctx0, + ggml_tensor * cur, + ggml_tensor * pos_a, // first half + ggml_tensor * pos_b, // second half + const float freq_base, + const bool interleave_freq + ); + + // aka pixel_shuffle / pixel_unshuffle / patch_merger (Kimi-VL) + // support dynamic resolution + ggml_tensor * build_patch_merge_permute(ggml_tensor * cur, int scale_factor); + + // Generic function to stack frames for audio processing + // Abstracts out the StackAudioFrames logic used by ultravox + ggml_tensor * build_stack(ggml_tensor * cur, int32_t stack_factor, int32_t n_embed); +}; diff --git a/llama.cpp/tools/mtmd/clip-impl.h b/llama.cpp/tools/mtmd/clip-impl.h new file mode 100644 index 0000000..3bc93ea --- /dev/null +++ b/llama.cpp/tools/mtmd/clip-impl.h @@ -0,0 +1,582 @@ +#pragma once + +#include "ggml.h" +#include "gguf.h" +#include "clip.h" + +#include <climits> +#include <cstdarg> +#include <cinttypes> +#include <string> +#include <map> +#include <sstream> +#include <vector> +#include <memory> + +// Internal header for clip.cpp + +#define MTMD_INTERNAL_HEADER + +#define KEY_FTYPE "general.file_type" +#define KEY_NAME "general.name" +#define KEY_DESCRIPTION "general.description" +#define KEY_PROJ_TYPE "clip.projector_type" +#define KEY_HAS_AUDIO_ENC "clip.has_audio_encoder" +#define KEY_HAS_VISION_ENC "clip.has_vision_encoder" +#define KEY_USE_GELU "clip.use_gelu" +#define KEY_USE_SILU "clip.use_silu" + +#define KEY_N_EMBD 
"clip.%s.embedding_length" +#define KEY_N_FF "clip.%s.feed_forward_length" +#define KEY_N_BLOCK "clip.%s.block_count" +#define KEY_PROJ_DIM "clip.%s.projection_dim" +#define KEY_N_HEAD "clip.%s.attention.head_count" +#define KEY_LAYER_NORM_EPS "clip.%s.attention.layer_norm_epsilon" + +// vision-specific +#define KEY_VISION_PROJ_TYPE "clip.vision.projector_type" // for models with mixed modalities +#define KEY_IMAGE_SIZE "clip.vision.image_size" +#define KEY_IMAGE_MIN_PIXELS "clip.vision.image_min_pixels" +#define KEY_IMAGE_MAX_PIXELS "clip.vision.image_max_pixels" +#define KEY_PREPROC_IMAGE_SIZE "clip.vision.preproc_image_size" +#define KEY_PATCH_SIZE "clip.vision.patch_size" +#define KEY_IMAGE_MEAN "clip.vision.image_mean" +#define KEY_IMAGE_STD "clip.vision.image_std" +#define KEY_FEATURE_LAYER "clip.vision.feature_layer" +#define KEY_PROJ_SCALE_FACTOR "clip.vision.projector.scale_factor" +#define KEY_SPATIAL_MERGE_SIZE "clip.vision.spatial_merge_size" +#define KEY_IS_DEEPSTACK_LAYERS "clip.vision.is_deepstack_layers" + +#define KEY_MM_PATCH_MERGE_TYPE "clip.vision.mm_patch_merge_type" +#define KEY_IMAGE_GRID_PINPOINTS "clip.vision.image_grid_pinpoints" +#define KEY_IMAGE_CROP_RESOLUTION "clip.vision.image_crop_resolution" +#define KEY_WIN_ATTN_PATTERN "clip.vision.n_wa_pattern" +#define KEY_WIN_ATTN_LAYER_INDEXES "clip.vision.wa_layer_indexes" +#define KEY_ATTN_WINDOW_SIZE "clip.vision.window_size" +#define KEY_MINICPMV_VERSION "clip.minicpmv_version" +#define KEY_MINICPMV_QUERY_NUM "clip.minicpmv_query_num" + +// audio-specific +#define KEY_AUDIO_PROJ_TYPE "clip.audio.projector_type" // for models with mixed modalities +#define KEY_A_NUM_MEL_BINS "clip.audio.num_mel_bins" +#define KEY_A_PROJ_STACK_FACTOR "clip.audio.projector.stack_factor" + + +// +// tensor name constants +// + +#define TN_POS_EMBD "%s.position_embd.weight" +#define TN_CLASS_EMBD "v.class_embd" +#define TN_PATCH_EMBD "v.patch_embd.weight" // not rename tensor with ".0" postfix for backwrad 
compat +#define TN_PATCH_EMBD_1 "v.patch_embd.weight.1" +#define TN_PATCH_BIAS "v.patch_embd.bias" +#define TN_NORM_EMBD "v.norm_embd.%s" +#define TN_ATTN_QKV "%s.blk.%d.attn_qkv.%s" +#define TN_ATTN_K "%s.blk.%d.attn_k.%s" +#define TN_ATTN_Q "%s.blk.%d.attn_q.%s" +#define TN_ATTN_V "%s.blk.%d.attn_v.%s" +#define TN_ATTN_OUTPUT "%s.blk.%d.attn_out.%s" +#define TN_ATTN_K_NORM "%s.blk.%d.attn_k_norm.%s" +#define TN_ATTN_Q_NORM "%s.blk.%d.attn_q_norm.%s" +#define TN_FFN_DOWN "%s.blk.%d.ffn_down.%s" +#define TN_FFN_GATE "%s.blk.%d.ffn_gate.%s" +#define TN_FFN_UP "%s.blk.%d.ffn_up.%s" +#define TN_FFN_GATE "%s.blk.%d.ffn_gate.%s" +#define TN_LN_1 "%s.blk.%d.ln1.%s" // layer norm +#define TN_LN_2 "%s.blk.%d.ln2.%s" // layer norm +#define TN_LS_1 "%s.blk.%d.ls1.%s" // layer scale +#define TN_LS_2 "%s.blk.%d.ls2.%s" // layer scale +#define TN_LN_PRE "%s.pre_ln.%s" +#define TN_LN_POST "%s.post_ln.%s" +#define TN_LLAVA_PROJ "mm.%d.%s" +#define TN_MM_UP "mm.up.%s" +#define TN_MM_GATE "mm.gate.%s" +#define TN_MM_DOWN "mm.down.%s" +#define TN_MM_POST_NORM "mm.post_norm.%s" +#define TN_MVLM_PROJ_MLP "mm.model.mlp.%d.%s" +#define TN_MVLM_PROJ_BLOCK "mm.model.mb_block.%d.block.%d.%s" +#define TN_MVLM_PROJ_PEG "mm.model.peg.%d.%s" +#define TN_IMAGE_NEWLINE "model.image_newline" +#define TN_MM_INP_NORM "mm.input_norm.weight" +#define TN_MM_INP_NORM_B "mm.input_norm.bias" +#define TN_MM_INP_PROJ "mm.input_projection.weight" // gemma3 +#define TN_MM_SOFT_EMB_N "mm.soft_emb_norm.weight" // gemma3 +#define TN_MM_PROJECTOR "mm.model.fc.weight" // idefics3 +#define TN_MM_PATCH_MERGER "mm.patch_merger.%s" // mistral small 3.1, glm4v +#define TN_TOK_IMG_BREAK "v.token_embd.img_break" // pixtral +#define TN_TOK_GLM_BOI "adapter.boi" // glm-edge (these embeddings are not in text model) +#define TN_TOK_GLM_EOI "adapter.eoi" // glm-edge (these embeddings are not in text model) +#define TN_DEEPSTACK_NORM "v.deepstack.%d.norm.%s" // qwen3vl deepstack +#define TN_DEEPSTACK_FC1 
"v.deepstack.%d.fc1.%s" // qwen3vl deepstack +#define TN_DEEPSTACK_FC2 "v.deepstack.%d.fc2.%s" // qwen3vl deepstack + +// mimicpmv +#define TN_MINICPMV_POS_EMBD_K "resampler.pos_embed_k" +#define TN_MINICPMV_QUERY "resampler.query" +#define TN_MINICPMV_PROJ "resampler.proj.weight" +#define TN_MINICPMV_KV_PROJ "resampler.kv.weight" +#define TN_MINICPMV_ATTN "resampler.attn.%s.%s" +#define TN_MINICPMV_LN "resampler.ln_%s.%s" + +#define TN_GLM_ADAPER_CONV "adapter.conv.%s" +#define TN_GLM_ADAPTER_LINEAR "adapter.linear.linear.%s" +#define TN_GLM_ADAPTER_NORM_1 "adapter.linear.norm1.%s" +#define TN_GLM_ADAPTER_D_H_2_4H "adapter.linear.dense_h_to_4h.%s" +#define TN_GLM_ADAPTER_GATE "adapter.linear.gate.%s" +#define TN_GLM_ADAPTER_D_4H_2_H "adapter.linear.dense_4h_to_h.%s" + +// ultravox +#define TN_CONV1D "a.conv1d.%d.%s" +#define TN_MM_AUDIO_MLP "mm.a.mlp.%d.%s" +#define TN_MM_AUDIO_FC "mm.a.fc.%s" // fully connected layer +#define TN_MM_NORM_PRE "mm.a.norm_pre.%s" +#define TN_MM_NORM_MID "mm.a.norm_mid.%s" + +// cogvlm +#define TN_MM_POST_FC_NORM "mm.post_fc_norm.%s" +#define TN_MM_H_TO_4H "mm.up.%s" +#define TN_MM_GATE "mm.gate.%s" +#define TN_MM_4H_TO_H "mm.down.%s" +#define TN_TOK_BOI "v.boi" +#define TN_TOK_EOI "v.eoi" + +// (conformer) lfm2 +#define TN_PRE_ENCODE_OUT "a.pre_encode.out.%s" +#define TN_FFN_NORM "%s.blk.%d.ffn_norm.%s" +#define TN_FFN_NORM_1 "%s.blk.%d.ffn_norm_1.%s" +#define TN_FFN_UP_1 "%s.blk.%d.ffn_up_1.%s" +#define TN_FFN_DOWN_1 "%s.blk.%d.ffn_down_1.%s" +#define TN_POS_BIAS_U "%s.blk.%d.pos_bias_u" +#define TN_POS_BIAS_V "%s.blk.%d.pos_bias_v" +#define TN_NORM_CONV "%s.blk.%d.norm_conv.%s" +#define TN_LINEAR_POS "%s.blk.%d.linear_pos.%s" +#define TN_CONV_DW "%s.blk.%d.conv_dw.%s" +#define TN_CONV_NORM "%s.blk.%d.conv_norm.%s" +#define TN_CONV_PW1 "%s.blk.%d.conv_pw1.%s" +#define TN_CONV_PW2 "%s.blk.%d.conv_pw2.%s" + +// mobilenetv5 (gemma3n) definitions +#define TN_MNV5_STEM_CONV "v.conv_stem.conv.weight" +#define TN_MNV5_STEM_BIAS 
"v.conv_stem.conv.bias" +#define TN_MNV5_STEM_BN "v.conv_stem.bn.weight" + +// Stage 0 Block (Edge Residual) +#define TN_MNV5_BLK_S0_EXP_W "v.blk.%d.%d.conv_exp.weight" +#define TN_MNV5_BLK_S0_BN1_W "v.blk.%d.%d.bn1.weight" +#define TN_MNV5_BLK_S0_PWL_W "v.blk.%d.%d.conv_pwl.weight" +#define TN_MNV5_BLK_S0_BN2_W "v.blk.%d.%d.bn2.weight" + +// Stage 1+ Block (Universal Inverted Residual) +#define TN_MNV5_BLK_DW_START_W "v.blk.%d.%d.dw_start.conv.weight" +#define TN_MNV5_BLK_DW_START_BN "v.blk.%d.%d.dw_start.bn.weight" +#define TN_MNV5_BLK_DW_MID_W "v.blk.%d.%d.dw_mid.conv.weight" +#define TN_MNV5_BLK_DW_MID_BN "v.blk.%d.%d.dw_mid.bn.weight" +#define TN_MNV5_BLK_PW_EXP_W "v.blk.%d.%d.pw_exp.conv.weight" +#define TN_MNV5_BLK_PW_EXP_BN "v.blk.%d.%d.pw_exp.bn.weight" +#define TN_MNV5_BLK_PW_PROJ_W "v.blk.%d.%d.pw_proj.conv.weight" +#define TN_MNV5_BLK_PW_PROJ_BN "v.blk.%d.%d.pw_proj.bn.weight" +#define TN_MNV5_BLK_LAYER_SCALE "v.blk.%d.%d.layer_scale.gamma" + +// Attention Components +#define TN_MNV5_ATTN_Q_W "v.blk.%d.%d.attn.query.proj.weight" +#define TN_MNV5_ATTN_K_W "v.blk.%d.%d.attn.key.proj.weight" +#define TN_MNV5_ATTN_V_W "v.blk.%d.%d.attn.value.proj.weight" +#define TN_MNV5_ATTN_O_W "v.blk.%d.%d.attn.output.proj.weight" +#define TN_MNV5_ATTN_K_DW "v.blk.%d.%d.attn.key.down_conv.weight" +#define TN_MNV5_ATTN_K_NORM "v.blk.%d.%d.attn.key.norm.weight" +#define TN_MNV5_ATTN_V_DW "v.blk.%d.%d.attn.value.down_conv.weight" +#define TN_MNV5_ATTN_V_NORM "v.blk.%d.%d.attn.value.norm.weight" +#define TN_MNV5_ATTN_NORM "v.blk.%d.%d.norm.weight" // Block norm used in attn blocks + +// MSFA +#define TN_MNV5_MSFA_FFN_EXP_W "v.msfa.ffn.pw_exp.conv.weight" +#define TN_MNV5_MSFA_FFN_EXP_BN "v.msfa.ffn.pw_exp.bn.weight" +#define TN_MNV5_MSFA_FFN_PROJ_W "v.msfa.ffn.pw_proj.conv.weight" +#define TN_MNV5_MSFA_FFN_PROJ_BN "v.msfa.ffn.pw_proj.bn.weight" +#define TN_MNV5_MSFA_NORM "v.msfa.norm.weight" + + +// align x to upper multiple of n +#define CLIP_ALIGN(x, n) ((((x) + (n) - 1) / 
(n)) * (n)) + +// forward declaration +// TODO: improve this later +struct clip_ctx; + +enum projector_type { + PROJECTOR_TYPE_MLP, + PROJECTOR_TYPE_MLP_NORM, + PROJECTOR_TYPE_LDP, + PROJECTOR_TYPE_LDPV2, + PROJECTOR_TYPE_MINICPMV, + PROJECTOR_TYPE_GLM_EDGE, + PROJECTOR_TYPE_QWEN2VL, + PROJECTOR_TYPE_QWEN3VL, + PROJECTOR_TYPE_GEMMA3, + PROJECTOR_TYPE_GEMMA3NV, + PROJECTOR_TYPE_GEMMA3NA, + PROJECTOR_TYPE_IDEFICS3, + PROJECTOR_TYPE_PIXTRAL, + PROJECTOR_TYPE_QWEN25VL, + PROJECTOR_TYPE_ULTRAVOX, + PROJECTOR_TYPE_INTERNVL, + PROJECTOR_TYPE_LLAMA4, + PROJECTOR_TYPE_QWEN2A, + PROJECTOR_TYPE_GLMA, + PROJECTOR_TYPE_QWEN25O, // will be replaced by QWEN2A or QWEN25VL depending on clip_ctx + PROJECTOR_TYPE_VOXTRAL, + PROJECTOR_TYPE_MUSIC_FLAMINGO, + PROJECTOR_TYPE_LFM2, + PROJECTOR_TYPE_KIMIVL, + PROJECTOR_TYPE_LIGHTONOCR, + PROJECTOR_TYPE_COGVLM, + PROJECTOR_TYPE_JANUS_PRO, + PROJECTOR_TYPE_LFM2A, + PROJECTOR_TYPE_GLM4V, + PROJECTOR_TYPE_YOUTUVL, + PROJECTOR_TYPE_KIMIK25, + PROJECTOR_TYPE_UNKNOWN, +}; + +static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = { + { PROJECTOR_TYPE_MLP, "mlp" }, + { PROJECTOR_TYPE_LDP, "ldp" }, + { PROJECTOR_TYPE_LDPV2, "ldpv2"}, + { PROJECTOR_TYPE_MINICPMV, "resampler"}, + { PROJECTOR_TYPE_GLM_EDGE, "adapter"}, + { PROJECTOR_TYPE_QWEN2VL, "qwen2vl_merger"}, + { PROJECTOR_TYPE_QWEN25VL, "qwen2.5vl_merger"}, + { PROJECTOR_TYPE_QWEN3VL, "qwen3vl_merger"}, + { PROJECTOR_TYPE_GEMMA3, "gemma3"}, + { PROJECTOR_TYPE_GEMMA3NV, "gemma3nv"}, + { PROJECTOR_TYPE_GEMMA3NA, "gemma3na"}, + { PROJECTOR_TYPE_IDEFICS3, "idefics3"}, + { PROJECTOR_TYPE_PIXTRAL, "pixtral"}, + { PROJECTOR_TYPE_ULTRAVOX, "ultravox"}, + { PROJECTOR_TYPE_INTERNVL, "internvl"}, + { PROJECTOR_TYPE_LLAMA4, "llama4"}, + { PROJECTOR_TYPE_QWEN2A, "qwen2a"}, + { PROJECTOR_TYPE_GLMA, "glma"}, + { PROJECTOR_TYPE_QWEN25O, "qwen2.5o"}, + { PROJECTOR_TYPE_VOXTRAL, "voxtral"}, + { PROJECTOR_TYPE_MUSIC_FLAMINGO, "musicflamingo"}, + { PROJECTOR_TYPE_LFM2, "lfm2"}, + { 
PROJECTOR_TYPE_KIMIVL, "kimivl"}, + { PROJECTOR_TYPE_LIGHTONOCR,"lightonocr"}, + { PROJECTOR_TYPE_COGVLM, "cogvlm"}, + { PROJECTOR_TYPE_JANUS_PRO, "janus_pro"}, + { PROJECTOR_TYPE_LFM2A, "lfm2a"}, + { PROJECTOR_TYPE_GLM4V, "glm4v"}, + { PROJECTOR_TYPE_YOUTUVL, "youtuvl"}, + { PROJECTOR_TYPE_KIMIK25, "kimik25"}, +}; + +static projector_type clip_projector_type_from_string(const std::string & str) { + for (const auto & pair : PROJECTOR_TYPE_NAMES) { + if (pair.second == str) { + return pair.first; + } + } + return PROJECTOR_TYPE_UNKNOWN; +} + +// RGB uint8 image +struct clip_image_u8 { + int nx; + int ny; + + std::vector<uint8_t> buf; +}; + +// For images, buf.size() == nx*ny*3 +// Memory layout: RGBRGBRGB... +// For audio, only one channel is used, buf.size() == nx*ny +// nx will be n_frames and ny will be n_mel +struct clip_image_f32 { + int nx; + int ny; + + std::vector<float> buf; +}; + +// +// logging +// + +static void clip_log_callback_default(enum ggml_log_level level, const char * text, void * user_data) { + (void) level; + (void) user_data; + fputs(text, stderr); + fflush(stderr); +} + +struct clip_logger_state { + ggml_log_callback log_callback; + void * log_callback_user_data; +}; + +extern struct clip_logger_state g_logger_state; + +static void clip_log_internal_v(enum ggml_log_level level, const char * format, va_list args) { + if (format == NULL) { + return; + } + va_list args_copy; + va_copy(args_copy, args); + char buffer[128]; + int len = vsnprintf(buffer, 128, format, args); + if (len < 128) { + g_logger_state.log_callback(level, buffer, g_logger_state.log_callback_user_data); + } else { + char * buffer2 = (char *) calloc(len + 1, sizeof(char)); + vsnprintf(buffer2, len + 1, format, args_copy); + buffer2[len] = 0; + g_logger_state.log_callback(level, buffer2, g_logger_state.log_callback_user_data); + free(buffer2); + } + va_end(args_copy); +} + +static void clip_log_internal(enum ggml_log_level level, const char * format, ...) 
{ + va_list args; + va_start(args, format); + clip_log_internal_v(level, format, args); + va_end(args); +} + +#define LOG_INF(...) clip_log_internal(GGML_LOG_LEVEL_INFO, __VA_ARGS__) +#define LOG_WRN(...) clip_log_internal(GGML_LOG_LEVEL_WARN, __VA_ARGS__) +#define LOG_ERR(...) clip_log_internal(GGML_LOG_LEVEL_ERROR, __VA_ARGS__) +#define LOG_DBG(...) clip_log_internal(GGML_LOG_LEVEL_DEBUG, __VA_ARGS__) +#define LOG_CNT(...) clip_log_internal(GGML_LOG_LEVEL_CONT, __VA_ARGS__) + +// +// cpp wrappers +// + +// wrapper for clip_image_size +struct clip_image_size_deleter { + void operator()(clip_image_size * val) { clip_image_size_free(val); } +}; +typedef std::unique_ptr<clip_image_size, clip_image_size_deleter> clip_image_size_ptr; + +// wrapper for clip_image_u8 +struct clip_image_u8_deleter { + void operator()(clip_image_u8 * val) { clip_image_u8_free(val); } +}; +typedef std::unique_ptr<clip_image_u8, clip_image_u8_deleter> clip_image_u8_ptr; + +// wrapper for clip_image_f32 +struct clip_image_f32_deleter { + void operator()(clip_image_f32 * val) { clip_image_f32_free(val); } +}; +typedef std::unique_ptr<clip_image_f32, clip_image_f32_deleter> clip_image_f32_ptr; + +struct clip_image_u8_batch { + std::vector<clip_image_u8_ptr> entries; +}; + +struct clip_image_f32_batch { + std::vector<clip_image_f32_ptr> entries; + bool is_audio = false; + + // for llava-uhd style models, we need to know the grid size + // note: entries.size() == grid_x * grid_y + 1 (one overview image) + int grid_x = 0; + int grid_y = 0; + + clip_image_f32_batch clone() const { + clip_image_f32_batch new_batch{ + /* entries */ {}, + /* is_audio */ is_audio, + /* grid_x */ grid_x, + /* grid_y */ grid_y, + }; + new_batch.entries.reserve(entries.size()); + for (const auto & entry : entries) { + new_batch.entries.emplace_back(new clip_image_f32(*entry)); + } + return new_batch; + } +}; + +// +// common utils +// + +static std::string string_format(const char * fmt, ...) 
{ + va_list ap; + va_list ap2; + va_start(ap, fmt); + va_copy(ap2, ap); + int size = vsnprintf(NULL, 0, fmt, ap); + GGML_ASSERT(size >= 0 && size < INT_MAX); // NOLINT + std::vector<char> buf(size + 1); + int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2); + GGML_ASSERT(size2 == size); + va_end(ap2); + va_end(ap); + return std::string(buf.data(), buf.size()); +} + +static void string_replace_all(std::string & s, const std::string & search, const std::string & replace) { + if (search.empty()) { + return; + } + std::string builder; + builder.reserve(s.length()); + size_t pos = 0; + size_t last_pos = 0; + while ((pos = s.find(search, last_pos)) != std::string::npos) { + builder.append(s, last_pos, pos - last_pos); + builder.append(replace); + last_pos = pos + search.length(); + } + builder.append(s, last_pos, std::string::npos); + s = std::move(builder); +} + +// split string by a `std::string delim` instead of `char delim` +static std::vector<std::string> string_split_str(std::string s, const std::string & delimiter) { + std::vector<std::string> tokens; + size_t pos = 0; + std::string token; + while ((pos = s.find(delimiter)) != std::string::npos) { + token = s.substr(0, pos); + tokens.push_back(token); + s.erase(0, pos + delimiter.length()); + } + tokens.push_back(s); + return tokens; +} + +// +// gguf utils +// + +static std::string gguf_data_to_str(enum gguf_type type, const void * data, int i) { + switch (type) { + case GGUF_TYPE_UINT8: return std::to_string(((const uint8_t *)data)[i]); + case GGUF_TYPE_INT8: return std::to_string(((const int8_t *)data)[i]); + case GGUF_TYPE_UINT16: return std::to_string(((const uint16_t *)data)[i]); + case GGUF_TYPE_INT16: return std::to_string(((const int16_t *)data)[i]); + case GGUF_TYPE_UINT32: return std::to_string(((const uint32_t *)data)[i]); + case GGUF_TYPE_INT32: return std::to_string(((const int32_t *)data)[i]); + case GGUF_TYPE_UINT64: return std::to_string(((const uint64_t *)data)[i]); + case GGUF_TYPE_INT64: 
return std::to_string(((const int64_t *)data)[i]); + case GGUF_TYPE_FLOAT32: return std::to_string(((const float *)data)[i]); + case GGUF_TYPE_FLOAT64: return std::to_string(((const double *)data)[i]); + case GGUF_TYPE_BOOL: return ((const bool *)data)[i] ? "true" : "false"; + default: return string_format("unknown type %d", type); + } +} + +static std::string gguf_kv_to_str(const struct gguf_context * ctx_gguf, int i) { + const enum gguf_type type = gguf_get_kv_type(ctx_gguf, i); + + switch (type) { + case GGUF_TYPE_STRING: + return gguf_get_val_str(ctx_gguf, i); + case GGUF_TYPE_ARRAY: + { + const enum gguf_type arr_type = gguf_get_arr_type(ctx_gguf, i); + int arr_n = gguf_get_arr_n(ctx_gguf, i); + const void * data = arr_type == GGUF_TYPE_STRING ? nullptr : gguf_get_arr_data(ctx_gguf, i); + std::stringstream ss; + ss << "["; + for (int j = 0; j < arr_n; j++) { + if (arr_type == GGUF_TYPE_STRING) { + std::string val = gguf_get_arr_str(ctx_gguf, i, j); + // escape quotes + string_replace_all(val, "\\", "\\\\"); + string_replace_all(val, "\"", "\\\""); + ss << '"' << val << '"'; + } else if (arr_type == GGUF_TYPE_ARRAY) { + ss << "???"; + } else { + ss << gguf_data_to_str(arr_type, data, j); + } + if (j < arr_n - 1) { + ss << ", "; + } + } + ss << "]"; + return ss.str(); + } + default: + return gguf_data_to_str(type, gguf_get_val_data(ctx_gguf, i), 0); + } +} + +// +// debugging +// + +static void print_tensor_shape(ggml_tensor * t) { + printf("%s.shape = [", t->name); + for (int i = 0; i < ggml_n_dims(t); ++i) { + printf("%" PRId64, t->ne[i]); + if (i < ggml_n_dims(t) - 1) { + printf(", "); + } + } + printf("]\n"); +} + +static void print_tensor_data(ggml_tensor * t, uint8_t * data, int64_t n) { + ggml_type type = t->type; + int64_t * ne = t->ne; + size_t * nb = t->nb; + for (int64_t i3 = 0; i3 < ne[3]; i3++) { + printf("%s.data: [\n", t->name); + for (int64_t i2 = 0; i2 < ne[2]; i2++) { + if (i2 == n && ne[2] > 2*n) { + printf(" ..., \n"); + i2 = ne[2] - n; + } + 
printf(" [\n"); + for (int64_t i1 = 0; i1 < ne[1]; i1++) { + if (i1 == n && ne[1] > 2*n) { + printf(" ..., \n"); + i1 = ne[1] - n; + } + printf(" ["); + for (int64_t i0 = 0; i0 < ne[0]; i0++) { + if (i0 == n && ne[0] > 2*n) { + printf("..., "); + i0 = ne[0] - n; + } + size_t i = i3 * nb[3] + i2 * nb[2] + i1 * nb[1] + i0 * nb[0]; + float v; + if (type == GGML_TYPE_F16) { + v = ggml_fp16_to_fp32(*(ggml_fp16_t *) &data[i]); + } else if (type == GGML_TYPE_F32) { + v = *(float *) &data[i]; + } else if (type == GGML_TYPE_I32) { + v = (float) *(int32_t *) &data[i]; + } else if (type == GGML_TYPE_I16) { + v = (float) *(int16_t *) &data[i]; + } else if (type == GGML_TYPE_I8) { + v = (float) *(int8_t *) &data[i]; + } else { + GGML_ABORT("fatal error"); + } + printf("%8.4f", v); + if (i0 < ne[0] - 1) printf(", "); + } + printf("],\n"); + } + printf(" ],\n"); + } + printf(" ]\n"); + } +} + +void clip_debug_encode(clip_ctx * ctx, int h, int w, float fill_value); + +// +// API used internally with mtmd +// + +projector_type clip_get_projector_type(const struct clip_ctx * ctx); diff --git a/llama.cpp/tools/mtmd/clip-model.h b/llama.cpp/tools/mtmd/clip-model.h new file mode 100644 index 0000000..d4ff915 --- /dev/null +++ b/llama.cpp/tools/mtmd/clip-model.h @@ -0,0 +1,389 @@ +#pragma once + +#include "ggml.h" +#include "clip.h" +#include "clip-impl.h" + +#include <array> +#include <vector> +#include <unordered_set> +#include <cstdint> +#include <cmath> + +enum ffn_op_type { + FFN_GELU, + FFN_GELU_ERF, + FFN_SILU, + FFN_GELU_QUICK, +}; + +enum norm_type { + NORM_TYPE_NORMAL, + NORM_TYPE_RMS, +}; + +enum patch_merge_type { + PATCH_MERGE_FLAT, + PATCH_MERGE_SPATIAL_UNPAD, +}; + +struct clip_hparams { + int32_t image_size = 0; + int32_t patch_size = 0; + int32_t n_embd = 0; + int32_t n_ff = 0; + int32_t projection_dim = 0; + int32_t n_head = 0; + int32_t n_layer = 0; + // idefics3 + int32_t image_longest_edge = 0; + int32_t image_min_pixels = -1; + int32_t image_max_pixels = -1; + 
int32_t n_merge = 0; // number of patch merges **per-side** + + float image_mean[3]; + float image_std[3]; + + // for models using dynamic image size, we need to have a smaller image size to warmup + // otherwise, user will get OOM everytime they load the model + int32_t warmup_image_size = 0; + int32_t warmup_audio_size = 3000; + + ffn_op_type ffn_op = FFN_GELU; + + patch_merge_type mm_patch_merge_type = PATCH_MERGE_FLAT; + + float eps = 1e-6; + float rope_theta = 0.0; + + std::vector<clip_image_size> image_res_candidates; // for llava-uhd style models + int32_t image_crop_resolution; + std::unordered_set<int32_t> vision_feature_layer; + int32_t attn_window_size = 0; + int32_t n_wa_pattern = 0; + std::unordered_set<int32_t> wa_layer_indexes; // explicit layer indexes that use full attention (for irregular patterns like YoutuVL) + + // audio + int32_t n_mel_bins = 0; // whisper preprocessor + int32_t proj_stack_factor = 0; // ultravox + + // audio-to-mel preprocessor params + int32_t audio_chunk_len = -1; // in seconds + int32_t audio_sample_rate = -1; + int32_t audio_n_fft = -1; + int32_t audio_window_len = -1; + int32_t audio_hop_len = -1; + + // legacy + bool has_llava_projector = false; + int minicpmv_version = 0; + int32_t minicpmv_query_num = 0; // MiniCPM-V query number + + // custom value provided by user, can be undefined if not set + int32_t custom_image_min_tokens = -1; + int32_t custom_image_max_tokens = -1; + + void set_limit_image_tokens(int n_tokens_min, int n_tokens_max) { + const int cur_merge = n_merge == 0 ? 1 : n_merge; + const int patch_area = patch_size * patch_size * cur_merge * cur_merge; + image_min_pixels = (custom_image_min_tokens > 0 ? custom_image_min_tokens : n_tokens_min) * patch_area; + image_max_pixels = (custom_image_max_tokens > 0 ? 
custom_image_max_tokens : n_tokens_max) * patch_area; + warmup_image_size = static_cast<int>(std::sqrt(image_max_pixels)); + } + + void set_warmup_n_tokens(int n_tokens) { + int n_tok_per_side = static_cast<int>(std::sqrt(n_tokens)); + GGML_ASSERT(n_tok_per_side * n_tok_per_side == n_tokens && "n_tokens must be n*n"); + const int cur_merge = n_merge == 0 ? 1 : n_merge; + warmup_image_size = n_tok_per_side * patch_size * cur_merge; + // TODO: support warmup size for custom token numbers + } +}; + +struct clip_layer { + // attention + ggml_tensor * k_w = nullptr; + ggml_tensor * k_b = nullptr; + ggml_tensor * q_w = nullptr; + ggml_tensor * q_b = nullptr; + ggml_tensor * v_w = nullptr; + ggml_tensor * v_b = nullptr; + ggml_tensor * qkv_w = nullptr; + ggml_tensor * qkv_b = nullptr; + + ggml_tensor * o_w = nullptr; + ggml_tensor * o_b = nullptr; + + ggml_tensor * k_norm = nullptr; + ggml_tensor * q_norm = nullptr; + + // layernorm 1 + ggml_tensor * ln_1_w = nullptr; + ggml_tensor * ln_1_b = nullptr; + + ggml_tensor * ff_up_w = nullptr; + ggml_tensor * ff_up_b = nullptr; + ggml_tensor * ff_gate_w = nullptr; + ggml_tensor * ff_gate_b = nullptr; + ggml_tensor * ff_down_w = nullptr; + ggml_tensor * ff_down_b = nullptr; + + // layernorm 2 + ggml_tensor * ln_2_w = nullptr; + ggml_tensor * ln_2_b = nullptr; + + // layer scale (no bias) + ggml_tensor * ls_1_w = nullptr; + ggml_tensor * ls_2_w = nullptr; + + // qwen3vl deepstack merger + ggml_tensor * deepstack_norm_w = nullptr; + ggml_tensor * deepstack_norm_b = nullptr; + ggml_tensor * deepstack_fc1_w = nullptr; + ggml_tensor * deepstack_fc1_b = nullptr; + ggml_tensor * deepstack_fc2_w = nullptr; + ggml_tensor * deepstack_fc2_b = nullptr; + + // lfm2 + ggml_tensor * ff_norm_w = nullptr; + ggml_tensor * ff_norm_b = nullptr; + ggml_tensor * ff_norm_1_w = nullptr; + ggml_tensor * ff_norm_1_b = nullptr; + ggml_tensor * ff_up_1_w = nullptr; + ggml_tensor * ff_up_1_b = nullptr; + ggml_tensor * ff_down_1_w = nullptr; + ggml_tensor * 
ff_down_1_b = nullptr; + ggml_tensor * pos_bias_u = nullptr; + ggml_tensor * pos_bias_v = nullptr; + ggml_tensor * norm_conv_w = nullptr; + ggml_tensor * norm_conv_b = nullptr; + ggml_tensor * linear_pos_w = nullptr; + + ggml_tensor * conv_norm_w = nullptr; + ggml_tensor * conv_norm_b = nullptr; + ggml_tensor * conv_dw_w = nullptr; + ggml_tensor * conv_dw_b = nullptr; + ggml_tensor * conv_pw1_w = nullptr; + ggml_tensor * conv_pw1_b = nullptr; + ggml_tensor * conv_pw2_w = nullptr; + ggml_tensor * conv_pw2_b = nullptr; + + bool has_deepstack() const { + return deepstack_fc1_w != nullptr; + } +}; + +// Expanded MobileNetV5 block structure for Gemma3n vision encoder +struct mobilenetv5_block { + // Stage 0 (Edge Residual) + ggml_tensor * s0_conv_exp_w = nullptr; + ggml_tensor * s0_bn1_w = nullptr; + ggml_tensor * s0_conv_pwl_w = nullptr; + ggml_tensor * s0_bn2_w = nullptr; + + // Stage 1+ (Universal Inverted Residual) + ggml_tensor * dw_start_w = nullptr; + ggml_tensor * dw_start_bn_w = nullptr; + + ggml_tensor * pw_exp_w = nullptr; + ggml_tensor * pw_exp_bn_w = nullptr; + + ggml_tensor * dw_mid_w = nullptr; + ggml_tensor * dw_mid_bn_w = nullptr; + + ggml_tensor * pw_proj_w = nullptr; + ggml_tensor * pw_proj_bn_w = nullptr; + + ggml_tensor * layer_scale_w = nullptr; + + // Attention (MQA) components + ggml_tensor * attn_q_w = nullptr; + ggml_tensor * attn_k_w = nullptr; + ggml_tensor * attn_v_w = nullptr; + ggml_tensor * attn_o_w = nullptr; + + // Optional downsampling/norm in attention + ggml_tensor * attn_k_dw_w = nullptr; + ggml_tensor * attn_k_norm_w = nullptr; + ggml_tensor * attn_v_dw_w = nullptr; + ggml_tensor * attn_v_norm_w = nullptr; + + // Block norm (often present in attention blocks) + ggml_tensor * attn_norm_w = nullptr; +}; + +struct clip_model { + clip_modality modality = CLIP_MODALITY_VISION; + projector_type proj_type = PROJECTOR_TYPE_MLP; + clip_hparams hparams; + + // embeddings + ggml_tensor * class_embedding = nullptr; + ggml_tensor * 
patch_embeddings_0 = nullptr; + ggml_tensor * patch_embeddings_1 = nullptr; // second Conv2D kernel when we decouple Conv3D along temproal dimension (Qwen2VL) + ggml_tensor * patch_bias = nullptr; + ggml_tensor * position_embeddings = nullptr; + ggml_tensor * norm_embd_w = nullptr; + ggml_tensor * norm_embd_b = nullptr; + + ggml_tensor * pre_ln_w = nullptr; + ggml_tensor * pre_ln_b = nullptr; + + std::vector<clip_layer> layers; + + int32_t n_deepstack_layers = 0; // used by Qwen3-VL, calculated from clip_layer + + ggml_tensor * post_ln_w; + ggml_tensor * post_ln_b; + + ggml_tensor * projection; // TODO: rename it to fc (fully connected layer) + ggml_tensor * mm_fc_w; + ggml_tensor * mm_fc_b; + ggml_tensor * mm_ffn_up_w = nullptr; + ggml_tensor * mm_ffn_up_b = nullptr; + ggml_tensor * mm_ffn_gate_w = nullptr; + ggml_tensor * mm_ffn_gate_b = nullptr; + ggml_tensor * mm_ffn_down_w = nullptr; + ggml_tensor * mm_ffn_down_b = nullptr; + ggml_tensor * mm_post_norm_w = nullptr; + ggml_tensor * mm_post_norm_b = nullptr; + + // LLaVA projection + ggml_tensor * mm_input_norm_w = nullptr; + ggml_tensor * mm_input_norm_b = nullptr; + ggml_tensor * mm_0_w = nullptr; + ggml_tensor * mm_0_b = nullptr; + ggml_tensor * mm_2_w = nullptr; + ggml_tensor * mm_2_b = nullptr; + + ggml_tensor * image_newline = nullptr; + + // Yi type models with mlp+normalization projection + ggml_tensor * mm_1_w = nullptr; // Yi type models have 0, 1, 3, 4 + ggml_tensor * mm_1_b = nullptr; + ggml_tensor * mm_3_w = nullptr; + ggml_tensor * mm_3_b = nullptr; + ggml_tensor * mm_4_w = nullptr; + ggml_tensor * mm_4_b = nullptr; + + // GLMV-Edge projection + ggml_tensor * mm_model_adapter_conv_w = nullptr; + ggml_tensor * mm_model_adapter_conv_b = nullptr; + + // MobileVLM projection + ggml_tensor * mm_model_mlp_1_w = nullptr; + ggml_tensor * mm_model_mlp_1_b = nullptr; + ggml_tensor * mm_model_mlp_3_w = nullptr; + ggml_tensor * mm_model_mlp_3_b = nullptr; + ggml_tensor * mm_model_block_1_block_0_0_w = nullptr; 
+ ggml_tensor * mm_model_block_1_block_0_1_w = nullptr; + ggml_tensor * mm_model_block_1_block_0_1_b = nullptr; + ggml_tensor * mm_model_block_1_block_1_fc1_w = nullptr; + ggml_tensor * mm_model_block_1_block_1_fc1_b = nullptr; + ggml_tensor * mm_model_block_1_block_1_fc2_w = nullptr; + ggml_tensor * mm_model_block_1_block_1_fc2_b = nullptr; + ggml_tensor * mm_model_block_1_block_2_0_w = nullptr; + ggml_tensor * mm_model_block_1_block_2_1_w = nullptr; + ggml_tensor * mm_model_block_1_block_2_1_b = nullptr; + ggml_tensor * mm_model_block_2_block_0_0_w = nullptr; + ggml_tensor * mm_model_block_2_block_0_1_w = nullptr; + ggml_tensor * mm_model_block_2_block_0_1_b = nullptr; + ggml_tensor * mm_model_block_2_block_1_fc1_w = nullptr; + ggml_tensor * mm_model_block_2_block_1_fc1_b = nullptr; + ggml_tensor * mm_model_block_2_block_1_fc2_w = nullptr; + ggml_tensor * mm_model_block_2_block_1_fc2_b = nullptr; + ggml_tensor * mm_model_block_2_block_2_0_w = nullptr; + ggml_tensor * mm_model_block_2_block_2_1_w = nullptr; + ggml_tensor * mm_model_block_2_block_2_1_b = nullptr; + + // MobileVLM_V2 projection + ggml_tensor * mm_model_mlp_0_w = nullptr; + ggml_tensor * mm_model_mlp_0_b = nullptr; + ggml_tensor * mm_model_mlp_2_w = nullptr; + ggml_tensor * mm_model_mlp_2_b = nullptr; + ggml_tensor * mm_model_peg_0_w = nullptr; + ggml_tensor * mm_model_peg_0_b = nullptr; + + // MINICPMV projection + ggml_tensor * mm_model_pos_embed_k = nullptr; + ggml_tensor * mm_model_query = nullptr; + ggml_tensor * mm_model_proj = nullptr; + ggml_tensor * mm_model_kv_proj = nullptr; + ggml_tensor * mm_model_attn_q_w = nullptr; + ggml_tensor * mm_model_attn_q_b = nullptr; + ggml_tensor * mm_model_attn_k_w = nullptr; + ggml_tensor * mm_model_attn_k_b = nullptr; + ggml_tensor * mm_model_attn_v_w = nullptr; + ggml_tensor * mm_model_attn_v_b = nullptr; + ggml_tensor * mm_model_attn_o_w = nullptr; + ggml_tensor * mm_model_attn_o_b = nullptr; + ggml_tensor * mm_model_ln_q_w = nullptr; + ggml_tensor * 
mm_model_ln_q_b = nullptr; + ggml_tensor * mm_model_ln_kv_w = nullptr; + ggml_tensor * mm_model_ln_kv_b = nullptr; + ggml_tensor * mm_model_ln_post_w = nullptr; + ggml_tensor * mm_model_ln_post_b = nullptr; + + // gemma3 + ggml_tensor * mm_input_proj_w = nullptr; + ggml_tensor * mm_soft_emb_norm_w = nullptr; + + // mobilenetv5 for gemma3n + std::vector<mobilenetv5_block> mobilenet_blocks; + std::vector<int> mobilenet_stage_ends; + ggml_tensor * mobilenet_stem_conv_w = nullptr; + ggml_tensor * mobilenet_stem_conv_b = nullptr; + ggml_tensor * mobilenet_stem_norm_w = nullptr; + ggml_tensor * mm_post_proj_norm_w = nullptr; + + // Multi-Scale Fusion Adapter (MSFA) components + ggml_tensor * msfa_concat_conv_w = nullptr; + ggml_tensor * msfa_concat_norm_w = nullptr; + ggml_tensor * msfa_ffn_expand_w = nullptr; + ggml_tensor * msfa_ffn_project_w = nullptr; + ggml_tensor * msfa_ffn_expand_bn = nullptr; + ggml_tensor * msfa_ffn_project_bn = nullptr; + + + // pixtral, glm4v + ggml_tensor * token_embd_img_break = nullptr; + ggml_tensor * mm_patch_merger_w = nullptr; + ggml_tensor * mm_patch_merger_b = nullptr; + + // ultravox / whisper encoder + ggml_tensor * conv1d_1_w = nullptr; + ggml_tensor * conv1d_1_b = nullptr; + ggml_tensor * conv1d_2_w = nullptr; + ggml_tensor * conv1d_2_b = nullptr; + ggml_tensor * mm_norm_pre_w = nullptr; + ggml_tensor * mm_norm_pre_b = nullptr; + ggml_tensor * mm_norm_mid_w = nullptr; + + // cogvlm + ggml_tensor * mm_post_fc_norm_w = nullptr; + ggml_tensor * mm_post_fc_norm_b = nullptr; + ggml_tensor * mm_h_to_4h_w = nullptr; + ggml_tensor * mm_gate_w = nullptr; + ggml_tensor * mm_4h_to_h_w = nullptr; + ggml_tensor * mm_boi = nullptr; + ggml_tensor * mm_eoi = nullptr; + + // lfm2 audio + std::array<ggml_tensor *, 7> pre_encode_conv_X_w = {nullptr}; + std::array<ggml_tensor *, 7> pre_encode_conv_X_b = {nullptr}; + ggml_tensor * pre_encode_out_w = nullptr; + ggml_tensor * pre_encode_out_b = nullptr; + + bool audio_has_avgpool() const { + return 
proj_type == PROJECTOR_TYPE_QWEN2A + || proj_type == PROJECTOR_TYPE_VOXTRAL + || proj_type == PROJECTOR_TYPE_MUSIC_FLAMINGO; + } + + bool audio_has_stack_frames() const { + return proj_type == PROJECTOR_TYPE_ULTRAVOX + || proj_type == PROJECTOR_TYPE_VOXTRAL; + } +}; + +const clip_hparams * clip_get_hparams(const struct clip_ctx * ctx); diff --git a/llama.cpp/tools/mtmd/clip.cpp b/llama.cpp/tools/mtmd/clip.cpp new file mode 100644 index 0000000..eeccb4c --- /dev/null +++ b/llama.cpp/tools/mtmd/clip.cpp @@ -0,0 +1,4080 @@ +#include "clip.h" +#include "clip-impl.h" +#include "clip-model.h" +#include "clip-graph.h" +#include "models/models.h" + +#include "ggml.h" +#include "ggml-cpp.h" +#include "ggml-alloc.h" +#include "ggml-backend.h" +#include "gguf.h" + +#include <algorithm> +#include <cassert> +#include <cmath> +#include <cstdlib> +#include <cstring> +#include <fstream> +#include <map> +#include <stdexcept> +#include <unordered_set> +#include <vector> +#include <cinttypes> +#include <limits> +#include <array> +#include <functional> + +struct clip_logger_state g_logger_state = {clip_log_callback_default, NULL}; + +//#define CLIP_DEBUG_FUNCTIONS + +#ifdef CLIP_DEBUG_FUNCTIONS +static void clip_image_write_image_to_ppm(const clip_image_u8& img, const std::string& filename) { + std::ofstream file(filename, std::ios::binary); + if (!file.is_open()) { + LOG_ERR("Failed to open file for writing: %s\n", filename.c_str()); + return; + } + + // PPM header: P6 format, width, height, and max color value + file << "P6\n" << img.nx << " " << img.ny << "\n255\n"; + + // Write pixel data + for (size_t i = 0; i < img.buf.size(); i += 3) { + // PPM expects binary data in RGB format, which matches our image buffer + file.write(reinterpret_cast<const char*>(&img.buf[i]), 3); + } + + file.close(); +} + +static void clip_image_save_to_bmp(const clip_image_u8& img, const std::string& filename) { + std::ofstream file(filename, std::ios::binary); + if (!file.is_open()) { + LOG_ERR("Failed 
to open file for writing: %s\n", filename.c_str()); + return; + } + + int fileSize = 54 + 3 * img.nx * img.ny; // File header + info header + pixel data + int bytesPerPixel = 3; + int widthInBytes = img.nx * bytesPerPixel; + int paddingAmount = (4 - (widthInBytes % 4)) % 4; + int stride = widthInBytes + paddingAmount; + + // Bitmap file header + unsigned char fileHeader[14] = { + 'B','M', // Signature + 0,0,0,0, // Image file size in bytes + 0,0,0,0, // Reserved + 54,0,0,0 // Start of pixel array + }; + + // Total file size + fileSize = 54 + (stride * img.ny); + fileHeader[2] = (unsigned char)(fileSize); + fileHeader[3] = (unsigned char)(fileSize >> 8); + fileHeader[4] = (unsigned char)(fileSize >> 16); + fileHeader[5] = (unsigned char)(fileSize >> 24); + + // Bitmap information header (BITMAPINFOHEADER) + unsigned char infoHeader[40] = { + 40,0,0,0, // Size of this header (40 bytes) + 0,0,0,0, // Image width + 0,0,0,0, // Image height + 1,0, // Number of color planes + 24,0, // Bits per pixel + 0,0,0,0, // No compression + 0,0,0,0, // Image size (can be 0 for no compression) + 0,0,0,0, // X pixels per meter (not specified) + 0,0,0,0, // Y pixels per meter (not specified) + 0,0,0,0, // Total colors (color table not used) + 0,0,0,0 // Important colors (all are important) + }; + + // Width and height in the information header + infoHeader[4] = (unsigned char)(img.nx); + infoHeader[5] = (unsigned char)(img.nx >> 8); + infoHeader[6] = (unsigned char)(img.nx >> 16); + infoHeader[7] = (unsigned char)(img.nx >> 24); + infoHeader[8] = (unsigned char)(img.ny); + infoHeader[9] = (unsigned char)(img.ny >> 8); + infoHeader[10] = (unsigned char)(img.ny >> 16); + infoHeader[11] = (unsigned char)(img.ny >> 24); + + // Write file headers + file.write(reinterpret_cast<char*>(fileHeader), sizeof(fileHeader)); + file.write(reinterpret_cast<char*>(infoHeader), sizeof(infoHeader)); + + // Pixel data + std::vector<unsigned char> padding(3, 0); // Max padding size to be added to each row 
+ for (int y = img.ny - 1; y >= 0; --y) { // BMP files are stored bottom-to-top + for (int x = 0; x < img.nx; ++x) { + // Each pixel + size_t pixelIndex = (y * img.nx + x) * 3; + unsigned char pixel[3] = { + img.buf[pixelIndex + 2], // BMP stores pixels in BGR format + img.buf[pixelIndex + 1], + img.buf[pixelIndex] + }; + file.write(reinterpret_cast<char*>(pixel), 3); + } + // Write padding for the row + file.write(reinterpret_cast<char*>(padding.data()), paddingAmount); + } + + file.close(); +} + +// debug function to convert f32 to u8 +static void clip_image_convert_f32_to_u8(const clip_image_f32& src, clip_image_u8& dst) { + dst.nx = src.nx; + dst.ny = src.ny; + dst.buf.resize(3 * src.nx * src.ny); + for (size_t i = 0; i < src.buf.size(); ++i) { + dst.buf[i] = static_cast<uint8_t>(std::min(std::max(int(src.buf[i] * 255.0f), 0), 255)); + } +} +#endif + + +struct clip_ctx { + clip_model model; + + gguf_context_ptr ctx_gguf; + ggml_context_ptr ctx_data; + + std::vector<uint8_t> buf_compute_meta; + + std::vector<ggml_backend_t> backend_ptrs; + std::vector<ggml_backend_buffer_type_t> backend_buft; + + ggml_backend_t backend = nullptr; + ggml_backend_t backend_cpu = nullptr; + ggml_backend_buffer_ptr buf; + + + int max_nodes = 8192; + ggml_backend_sched_ptr sched; + clip_flash_attn_type flash_attn_type = CLIP_FLASH_ATTN_TYPE_AUTO; + bool is_allocated = false; + + clip_ctx(clip_context_params & ctx_params) { + flash_attn_type = ctx_params.flash_attn_type; + backend_cpu = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr); + if (!backend_cpu) { + throw std::runtime_error("failed to initialize CPU backend"); + } + if (ctx_params.use_gpu) { + auto backend_name = std::getenv("MTMD_BACKEND_DEVICE"); + if (backend_name != nullptr) { + backend = ggml_backend_init_by_name(backend_name, nullptr); + if (!backend) { + LOG_WRN("%s: Warning: Failed to initialize \"%s\" backend, falling back to default GPU backend\n", __func__, backend_name); + } + } + if (!backend) { 
+ backend = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_GPU, nullptr); + backend = backend ? backend : ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_IGPU, nullptr); + } + } + + if (backend) { + LOG_INF("%s: CLIP using %s backend\n", __func__, ggml_backend_name(backend)); + backend_ptrs.push_back(backend); + backend_buft.push_back(ggml_backend_get_default_buffer_type(backend)); + } else { + backend = backend_cpu; + LOG_INF("%s: CLIP using CPU backend\n", __func__); + } + + if (ctx_params.image_min_tokens > 0) { + model.hparams.custom_image_min_tokens = ctx_params.image_min_tokens; + } + if (ctx_params.image_max_tokens > 0) { + model.hparams.custom_image_max_tokens = ctx_params.image_max_tokens; + } + + backend_ptrs.push_back(backend_cpu); + backend_buft.push_back(ggml_backend_get_default_buffer_type(backend_cpu)); + + sched.reset( + ggml_backend_sched_new(backend_ptrs.data(), backend_buft.data(), backend_ptrs.size(), 8192, false, true) + ); + + if (ctx_params.cb_eval != nullptr) { + ggml_backend_sched_set_eval_callback(sched.get(), ctx_params.cb_eval, ctx_params.cb_eval_user_data); + } + } + + ~clip_ctx() { + ggml_backend_free(backend); + if (backend != backend_cpu) { + ggml_backend_free(backend_cpu); + } + } + + // this function is added so that we don't change too much of the existing code + projector_type proj_type() const { + return model.proj_type; + } +}; + +// +// clip_graph +// + +clip_graph::clip_graph(clip_ctx * ctx, const clip_image_f32 & img) : + model(ctx->model), + hparams(model.hparams), + proj_type(ctx->proj_type()), + img(img), + patch_size(hparams.patch_size), + n_patches_x(img.nx / patch_size), + n_patches_y(img.ny / patch_size), + n_patches(n_patches_x * n_patches_y), + n_embd(hparams.n_embd), + n_head(hparams.n_head), + d_head(n_embd / n_head), + n_layer(hparams.n_layer), + n_mmproj_embd(clip_n_mmproj_embd(ctx)), + eps(hparams.eps), + kq_scale(1.0f / sqrtf((float)d_head)), + flash_attn_type(ctx->flash_attn_type) { + struct 
ggml_init_params params = { + /*.mem_size =*/ ctx->buf_compute_meta.size(), + /*.mem_buffer =*/ ctx->buf_compute_meta.data(), + /*.no_alloc =*/ true, + }; + ctx0_ptr.reset(ggml_init(params)); + ctx0 = ctx0_ptr.get(); + gf = ggml_new_graph_custom(ctx0, ctx->max_nodes, false); +} + +void clip_graph::cb(ggml_tensor * cur, const char * name, int il) const { + if (il >= 0) { + ggml_format_name(cur, "%s-%d", name, il); + } else { + ggml_set_name(cur, name); + } +} + +// siglip2 naflex +ggml_tensor * clip_graph::resize_position_embeddings(uint32_t interpolation_mode) { + ggml_tensor * pos_embd = model.position_embeddings; + const int height = img.ny / patch_size; + const int width = img.nx / patch_size; + const uint32_t mode = interpolation_mode; + const int n_per_side = (int)std::sqrt(pos_embd->ne[1]); + + GGML_ASSERT(pos_embd); + + if (height == n_per_side && width == n_per_side) { + return pos_embd; + } + + pos_embd = ggml_reshape_3d(ctx0, pos_embd, n_embd, n_per_side, n_per_side); // -> (n_embd, n_per_side, n_per_side) + pos_embd = ggml_permute(ctx0, pos_embd, 2, 0, 1, 3); // -> (n_per_side, n_per_side, n_embd) + pos_embd = ggml_interpolate(ctx0, pos_embd, width, height, n_embd, 1, mode); // -> (width, height, n_embd) + pos_embd = ggml_permute(ctx0, pos_embd, 1, 2, 0, 3); // -> (n_embd, width, height) + pos_embd = ggml_cont_2d(ctx0, pos_embd, n_embd, width * height); // -> (n_embd, width * height) + + return pos_embd; +} + +// build vision transformer (ViT) cgraph +// this function should cover most of the models +// if your model has specific features, you should probably duplicate this function +ggml_tensor * clip_graph::build_vit( + ggml_tensor * inp, + int64_t n_pos, + norm_type norm_t, + ffn_op_type ffn_t, + ggml_tensor * learned_pos_embd, + std::function<ggml_tensor *(ggml_tensor *, const clip_layer &)> add_pos + ) { + if (learned_pos_embd) { + inp = ggml_add(ctx0, inp, learned_pos_embd); + cb(inp, "pos_embed", -1); + } + + ggml_tensor * inpL = inp; + + // 
pre-layernorm + if (model.pre_ln_w) { + inpL = build_norm(inpL, model.pre_ln_w, model.pre_ln_b, norm_t, eps, -1); + cb(inpL, "pre_ln", -1); + } + + // loop over layers + for (int il = 0; il < n_layer; il++) { + auto & layer = model.layers[il]; + ggml_tensor * cur = inpL; // inpL = residual, cur = hidden_states + + // layernorm1 + cur = build_norm(cur, layer.ln_1_w, layer.ln_1_b, norm_t, eps, il); + cb(cur, "layer_inp_normed", il); + + // self-attention + { + ggml_tensor * Qcur = nullptr; + ggml_tensor * Kcur = nullptr; + ggml_tensor * Vcur = nullptr; + if (layer.qkv_w != nullptr) { + // fused qkv + cur = ggml_mul_mat(ctx0, layer.qkv_w, cur); + if (layer.qkv_b != nullptr) { + cur = ggml_add(ctx0, cur, layer.qkv_b); + } + + Qcur = ggml_view_3d(ctx0, cur, d_head, n_head, n_pos, + /* nb1 */ ggml_row_size(cur->type, d_head), + /* nb2 */ cur->nb[1], + /* offset */ 0); + + Kcur = ggml_view_3d(ctx0, cur, d_head, n_head, n_pos, + /* nb1 */ ggml_row_size(cur->type, d_head), + /* nb2 */ cur->nb[1], + /* offset */ ggml_row_size(cur->type, n_embd)); + + Vcur = ggml_view_3d(ctx0, cur, d_head, n_head, n_pos, + /* nb1 */ ggml_row_size(cur->type, d_head), + /* nb2 */ cur->nb[1], + /* offset */ ggml_row_size(cur->type, 2 * n_embd)); + + // TODO: q/k norm requires row size == n_embd, while here it's d_head + // we can add support in the future if needed + GGML_ASSERT(layer.q_norm == nullptr && layer.k_norm == nullptr); + + } else { + // separate q, k, v + Qcur = ggml_mul_mat(ctx0, layer.q_w, cur); + if (layer.q_b) { + Qcur = ggml_add(ctx0, Qcur, layer.q_b); + } + + Kcur = ggml_mul_mat(ctx0, layer.k_w, cur); + if (layer.k_b) { + Kcur = ggml_add(ctx0, Kcur, layer.k_b); + } + + Vcur = ggml_mul_mat(ctx0, layer.v_w, cur); + if (layer.v_b) { + Vcur = ggml_add(ctx0, Vcur, layer.v_b); + } + + if (layer.q_norm) { + Qcur = build_norm(Qcur, layer.q_norm, NULL, norm_t, eps, il); + cb(Qcur, "Qcur_norm", il); + } + + if (layer.k_norm) { + Kcur = build_norm(Kcur, layer.k_norm, NULL, norm_t, eps, 
il); + cb(Kcur, "Kcur_norm", il); + } + + Qcur = ggml_reshape_3d(ctx0, Qcur, d_head, n_head, n_pos); + Kcur = ggml_reshape_3d(ctx0, Kcur, d_head, n_head, n_pos); + Vcur = ggml_reshape_3d(ctx0, Vcur, d_head, n_head, n_pos); + } + + cb(Qcur, "Qcur", il); + cb(Kcur, "Kcur", il); + cb(Vcur, "Vcur", il); + + if (add_pos) { + Qcur = add_pos(Qcur, layer); + Kcur = add_pos(Kcur, layer); + cb(Qcur, "Qcur_pos", il); + cb(Kcur, "Kcur_pos", il); + } + + cur = build_attn(layer.o_w, layer.o_b, + Qcur, Kcur, Vcur, nullptr, kq_scale, il); + cb(cur, "attn_out", il); + } + + if (layer.ls_1_w) { + cur = ggml_mul(ctx0, cur, layer.ls_1_w); + cb(cur, "attn_out_scaled", il); + } + + // re-add the layer input, e.g., residual + cur = ggml_add(ctx0, cur, inpL); + + inpL = cur; // inpL = residual, cur = hidden_states + + cb(cur, "ffn_inp", il); + + // layernorm2 + cur = build_norm(cur, layer.ln_2_w, layer.ln_2_b, norm_t, eps, il); + cb(cur, "ffn_inp_normed", il); + + // ffn + cur = build_ffn(cur, + layer.ff_up_w, layer.ff_up_b, + layer.ff_gate_w, layer.ff_gate_b, + layer.ff_down_w, layer.ff_down_b, + ffn_t, il); + + cb(cur, "ffn_out", il); + + if (layer.ls_2_w) { + cur = ggml_mul(ctx0, cur, layer.ls_2_w); + cb(cur, "ffn_out_scaled", il); + } + + // residual 2 + cur = ggml_add(ctx0, inpL, cur); + cb(cur, "layer_out", il); + + inpL = cur; + } + + if (model.audio_has_avgpool()) { + ggml_tensor * cur = inpL; + cur = ggml_transpose(ctx0, cur); + cur = ggml_cont(ctx0, cur); + cur = ggml_pool_1d(ctx0, cur, GGML_OP_POOL_AVG, 2, 2, 0); + cur = ggml_transpose(ctx0, cur); + cur = ggml_cont(ctx0, cur); + inpL = cur; + } + + // post-layernorm + if (model.post_ln_w) { + inpL = build_norm(inpL, model.post_ln_w, model.post_ln_b, norm_t, eps, -1); + } + return inpL; +} + +// build the input after conv2d (inp_raw --> patches) +// returns tensor with shape [n_embd, n_patches] +ggml_tensor * clip_graph::build_inp() { + ggml_tensor * inp_raw = build_inp_raw(); + ggml_tensor * inp = ggml_conv_2d(ctx0, 
model.patch_embeddings_0, inp_raw, patch_size, patch_size, 0, 0, 1, 1); + inp = ggml_reshape_2d(ctx0, inp, n_patches, n_embd); + inp = ggml_cont(ctx0, ggml_transpose(ctx0, inp)); + if (model.patch_bias) { + inp = ggml_add(ctx0, inp, model.patch_bias); + cb(inp, "patch_bias", -1); + } + return inp; +} + +ggml_tensor * clip_graph::build_inp_raw(int channels) { + ggml_tensor * inp_raw = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, img.nx, img.ny, channels); + ggml_set_name(inp_raw, "inp_raw"); + ggml_set_input(inp_raw); + return inp_raw; +} + +ggml_tensor * clip_graph::build_norm( + ggml_tensor * cur, + ggml_tensor * mw, + ggml_tensor * mb, + norm_type type, + float norm_eps, + int il) const { + + cur = type == NORM_TYPE_RMS + ? ggml_rms_norm(ctx0, cur, norm_eps) + : ggml_norm(ctx0, cur, norm_eps); + + if (mw) { + cur = ggml_mul(ctx0, cur, mw); + cb(cur, "norm_w", il); + } + + if (mb) { + cur = ggml_add(ctx0, cur, mb); + cb(cur, "norm_b", il); + } + + return cur; +} + +ggml_tensor * clip_graph::build_ffn( + ggml_tensor * cur, + ggml_tensor * up, + ggml_tensor * up_b, + ggml_tensor * gate, + ggml_tensor * gate_b, + ggml_tensor * down, + ggml_tensor * down_b, + ffn_op_type type_op, + int il) const { + + ggml_tensor * tmp = up ? 
ggml_mul_mat(ctx0, up, cur) : cur; + cb(tmp, "ffn_up", il); + + if (up_b) { + tmp = ggml_add(ctx0, tmp, up_b); + cb(tmp, "ffn_up_b", il); + } + + if (gate) { + cur = ggml_mul_mat(ctx0, gate, cur); + cb(cur, "ffn_gate", il); + + if (gate_b) { + cur = ggml_add(ctx0, cur, gate_b); + cb(cur, "ffn_gate_b", il); + } + } else { + cur = tmp; + } + + // we only support parallel ffn for now + switch (type_op) { + case FFN_SILU: + if (gate) { + cur = ggml_swiglu_split(ctx0, cur, tmp); + cb(cur, "ffn_swiglu", il); + } else { + cur = ggml_silu(ctx0, cur); + cb(cur, "ffn_silu", il); + } break; + case FFN_GELU: + if (gate) { + cur = ggml_geglu_split(ctx0, cur, tmp); + cb(cur, "ffn_geglu", il); + } else { + cur = ggml_gelu(ctx0, cur); + cb(cur, "ffn_gelu", il); + } break; + case FFN_GELU_ERF: + if (gate) { + cur = ggml_geglu_erf_split(ctx0, cur, tmp); + cb(cur, "ffn_geglu_erf", il); + } else { + cur = ggml_gelu_erf(ctx0, cur); + cb(cur, "ffn_gelu_erf", il); + } break; + case FFN_GELU_QUICK: + if (gate) { + cur = ggml_geglu_quick_split(ctx0, cur, tmp); + cb(cur, "ffn_geglu_quick", il); + } else { + cur = ggml_gelu_quick(ctx0, cur); + cb(cur, "ffn_gelu_quick", il); + } break; + } + + if (down) { + cur = ggml_mul_mat(ctx0, down, cur); + } + + if (down_b) { + cb(cur, "ffn_down", il); + } + + if (down_b) { + cur = ggml_add(ctx0, cur, down_b); + } + + return cur; +} + +ggml_tensor * clip_graph::build_attn( + ggml_tensor * wo, + ggml_tensor * wo_b, + ggml_tensor * q_cur, + ggml_tensor * k_cur, + ggml_tensor * v_cur, + ggml_tensor * kq_mask, + float kq_scale, + int il) const { + // these nodes are added to the graph together so that they are not reordered + // by doing so, the number of splits in the graph is reduced + ggml_build_forward_expand(gf, q_cur); + ggml_build_forward_expand(gf, k_cur); + ggml_build_forward_expand(gf, v_cur); + + ggml_tensor * q = ggml_permute(ctx0, q_cur, 0, 2, 1, 3); + //cb(q, "q", il); + + ggml_tensor * k = ggml_permute(ctx0, k_cur, 0, 2, 1, 3); + //cb(k, "k", 
il); + + ggml_tensor * cur; + + if (flash_attn_type == CLIP_FLASH_ATTN_TYPE_ENABLED) { + ggml_tensor * v = ggml_permute(ctx0, v_cur, 0, 2, 1, 3); + + k = ggml_cast(ctx0, k, GGML_TYPE_F16); + v = ggml_cast(ctx0, v, GGML_TYPE_F16); + + cur = ggml_flash_attn_ext(ctx0, q, k, v, kq_mask, kq_scale, 0.0f, 0.0f); + ggml_flash_attn_ext_set_prec(cur, GGML_PREC_F32); + + cur = ggml_reshape_2d(ctx0, cur, cur->ne[0]*cur->ne[1], cur->ne[2]*cur->ne[3]); + + } else { + ggml_tensor * v = ggml_permute(ctx0, v_cur, 1, 2, 0, 3); + v = ggml_cont(ctx0, v); + + const auto n_tokens = q->ne[1]; + const auto n_head = q->ne[2]; + + ggml_tensor * kq = ggml_mul_mat(ctx0, k, q); + // F32 may not needed for vision encoders? + // ggml_mul_mat_set_prec(kq, GGML_PREC_F32); + + kq = ggml_soft_max_ext(ctx0, kq, kq_mask, kq_scale, 0.0f); + + ggml_tensor * kqv = ggml_mul_mat(ctx0, v, kq); + cur = ggml_permute(ctx0, kqv, 0, 2, 1, 3); + cur = ggml_cont_2d(ctx0, cur, cur->ne[0]*n_head, n_tokens); + } + + cb(cur, "kqv_out", il); + + if (wo) { + cur = ggml_mul_mat(ctx0, wo, cur); + } + + if (wo_b) { + cur = ggml_add(ctx0, cur, wo_b); + } + + return cur; +} + +// implementation of the 2D RoPE without adding a new op in ggml +// this is not efficient (use double the memory), but works on all backends +// TODO: there was a more efficient which relies on ggml_view and ggml_rope_ext_inplace, but the rope inplace does not work well with non-contiguous tensors ; we should fix that and revert back to the original implementation in https://github.com/ggml-org/llama.cpp/pull/13065 +ggml_tensor * clip_graph::build_rope_2d( + ggml_context * ctx0, + ggml_tensor * cur, + ggml_tensor * pos_a, // first half + ggml_tensor * pos_b, // second half + const float freq_base, + const bool interleave_freq +) { + const int64_t n_dim = cur->ne[0]; + const int64_t n_head = cur->ne[1]; + const int64_t n_pos = cur->ne[2]; + + // for example, if we have cur tensor of shape (n_dim=8, n_head, n_pos) + // we will have a list of 4 inv_freq: 
1e-0, 1e-1, 1e-2, 1e-3 + // first half of cur will use 1e-0, 1e-2 (even) + // second half of cur will use 1e-1, 1e-3 (odd) + // the trick here is to rotate just half of n_dim, so inv_freq will automatically be even + // ^ don't ask me why, it's math! -2(2i) / n_dim == -2i / (n_dim/2) + // then for the second half, we use freq_scale to shift the inv_freq + // ^ why? replace (2i) with (2i+1) in the above equation + const float freq_scale_odd = interleave_freq + ? std::pow(freq_base, (float)-2/n_dim) + : 1.0; + + // first half + ggml_tensor * first; + { + first = ggml_view_3d(ctx0, cur, + n_dim/2, n_head, n_pos, + cur->nb[1], + cur->nb[2], + 0); + first = ggml_rope_ext( + ctx0, + first, + pos_a, // positions + nullptr, // freq factors + n_dim/2, // n_dims + 0, 0, freq_base, + 1.0f, 0.0f, 1.0f, 0.0f, 0.0f + ); + } + + // second half + ggml_tensor * second; + { + second = ggml_view_3d(ctx0, cur, + n_dim/2, n_head, n_pos, + cur->nb[1], + cur->nb[2], + n_dim/2 * ggml_element_size(cur)); + second = ggml_rope_ext( + ctx0, + second, + pos_b, // positions + nullptr, // freq factors + n_dim/2, // n_dims + 0, 0, freq_base, + freq_scale_odd, + 0.0f, 1.0f, 0.0f, 0.0f + ); + } + + cur = ggml_concat(ctx0, first, second, 0); + return cur; +} + +// Generic function to stack frames for audio processing +// Abstracts out the StackAudioFrames logic used by ultravox +ggml_tensor * clip_graph::build_stack(ggml_tensor * cur, int32_t stack_factor, int32_t n_embed) { + if (stack_factor <= 1) { + return cur; + } + + int64_t total_elements = ggml_nelements(cur); + int64_t stride = n_embed * stack_factor; + + // Calculate padded length + int64_t padded_len = GGML_PAD(total_elements, stride); + int64_t pad = padded_len - total_elements; + + if (pad > 0) { + // Pad the tensor to make it divisible by stride + cur = ggml_view_1d(ctx0, cur, total_elements, 0); + cur = ggml_pad(ctx0, cur, pad, 0, 0, 0); + } + + // Reshape to [stride, padded_len / stride] + cur = ggml_view_2d(ctx0, cur, stride, 
padded_len / stride, + ggml_row_size(cur->type, stride), 0); + return cur; +} + +// aka pixel_shuffle / pixel_unshuffle / patch_merger (Kimi-VL) +// support dynamic resolution +ggml_tensor * clip_graph::build_patch_merge_permute(ggml_tensor * cur, int scale_factor) { + GGML_ASSERT(scale_factor > 1); + + const int n_embd = cur->ne[0]; + int width = img.nx / patch_size; + int height = img.ny / patch_size; + + // pad width and height to factor + const int64_t pad_width = CLIP_ALIGN(width, scale_factor) - width; + const int64_t pad_height = CLIP_ALIGN(height, scale_factor) - height; + cur = ggml_reshape_3d(ctx0, cur, n_embd, width, height); + if (pad_width || pad_height) { + cur = ggml_pad(ctx0, cur, 0, pad_width, pad_height, 0); + width += pad_width; + height += pad_height; + } + + // unshuffle h + cur = ggml_reshape_3d(ctx0, cur, n_embd * scale_factor, width / scale_factor, height); + cur = ggml_permute(ctx0, cur, 0, 2, 1, 3); + + // unshuffle w + cur = ggml_cont_3d(ctx0, cur, n_embd * scale_factor * scale_factor, height / scale_factor, width / scale_factor); + cur = ggml_permute(ctx0, cur, 0, 2, 1, 3); + + cur = ggml_cont_2d(ctx0, cur, cur->ne[0], cur->ne[1] * cur->ne[2]); + cb(cur, "pixel_shuffle", -1); + + return cur; +} + +static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32_batch & imgs) { + GGML_ASSERT(imgs.entries.size() == 1 && "n_batch > 1 is not supported"); + + const clip_image_f32 & img = *imgs.entries[0]; + std::unique_ptr<clip_graph> builder; + + switch (ctx->proj_type()) { + case PROJECTOR_TYPE_GEMMA3: + case PROJECTOR_TYPE_IDEFICS3: + case PROJECTOR_TYPE_LFM2: + case PROJECTOR_TYPE_JANUS_PRO: + { + builder = std::make_unique<clip_graph_siglip>(ctx, img); + } break; + case PROJECTOR_TYPE_GEMMA3NV: + { + builder = std::make_unique<clip_graph_mobilenetv5>(ctx, img); + } break; + case PROJECTOR_TYPE_PIXTRAL: + case PROJECTOR_TYPE_LIGHTONOCR: + { + builder = std::make_unique<clip_graph_pixtral>(ctx, img); + } break; + case 
PROJECTOR_TYPE_QWEN2VL:
        case PROJECTOR_TYPE_QWEN25VL:
            {
                builder = std::make_unique<clip_graph_qwen2vl>(ctx, img);
            } break;
        case PROJECTOR_TYPE_QWEN3VL:
            {
                builder = std::make_unique<clip_graph_qwen3vl>(ctx, img);
            } break;
        case PROJECTOR_TYPE_MINICPMV:
            {
                builder = std::make_unique<clip_graph_minicpmv>(ctx, img);
            } break;
        case PROJECTOR_TYPE_INTERNVL:
            {
                builder = std::make_unique<clip_graph_internvl>(ctx, img);
            } break;
        case PROJECTOR_TYPE_LLAMA4:
            {
                builder = std::make_unique<clip_graph_llama4>(ctx, img);
            } break;
        // audio encoders sharing the whisper-style encoder graph
        case PROJECTOR_TYPE_ULTRAVOX:
        case PROJECTOR_TYPE_VOXTRAL:
        case PROJECTOR_TYPE_QWEN2A:
        case PROJECTOR_TYPE_GLMA:
        case PROJECTOR_TYPE_MUSIC_FLAMINGO:
            {
                builder = std::make_unique<clip_graph_whisper_enc>(ctx, img);
            } break;
        case PROJECTOR_TYPE_KIMIVL:
            {
                builder = std::make_unique<clip_graph_kimivl>(ctx, img);
            } break;
        case PROJECTOR_TYPE_KIMIK25:
            {
                builder = std::make_unique<clip_graph_kimik25>(ctx, img);
            } break;
        case PROJECTOR_TYPE_COGVLM:
            {
                builder = std::make_unique<clip_graph_cogvlm>(ctx, img);
            } break;
        // legacy llava-style projectors
        case PROJECTOR_TYPE_MLP:
        case PROJECTOR_TYPE_MLP_NORM:
        case PROJECTOR_TYPE_LDP:
        case PROJECTOR_TYPE_LDPV2:
        case PROJECTOR_TYPE_GLM_EDGE:
            {
                builder = std::make_unique<clip_graph_llava>(ctx, img);
            } break;
        case PROJECTOR_TYPE_LFM2A:
            {
                builder = std::make_unique<clip_graph_conformer>(ctx, img);
            } break;
        case PROJECTOR_TYPE_GLM4V:
            {
                builder = std::make_unique<clip_graph_glm4v>(ctx, img);
            } break;
        case PROJECTOR_TYPE_YOUTUVL:
            {
                builder = std::make_unique<clip_graph_youtuvl>(ctx, img);
            } break;
        default:
            GGML_ABORT("missing cgraph builder");
    }

    return builder->build();
}

//
// clip_model_loader
//

// Loads a CLIP/mtmd GGUF file: reads metadata (gguf), hyperparameters and
// tensor data, and populates a clip_model.
struct clip_model_loader {
    ggml_context_ptr ctx_meta; // tensor metadata (no_alloc) context
    gguf_context_ptr ctx_gguf; // gguf key/value + tensor directory

    std::string fname;

    size_t model_size = 0; // in bytes

    bool has_vision = false;
    bool has_audio = false;

    // TODO @ngxson : we should not pass clip_ctx here, it should be clip_model
    // Opens the GGUF file, logs basic info, detects modalities and sums up
    // the total tensor size. Throws std::runtime_error on failure to open.
    clip_model_loader(const char * fname) : fname(fname) {
        struct ggml_context * meta = nullptr;

        struct gguf_init_params params = {
            /*.no_alloc = */ true,
            /*.ctx      = */ &meta,
        };

        ctx_gguf = gguf_context_ptr(gguf_init_from_file(fname, params));
        if (!ctx_gguf.get()) {
            throw std::runtime_error(string_format("%s: failed to load CLIP model from %s. Does this file exist?\n", __func__, fname));
        }

        ctx_meta.reset(meta);

        const int n_tensors = gguf_get_n_tensors(ctx_gguf.get());

        // print gguf info
        {
            std::string name;
            get_string(KEY_NAME, name, false);
            std::string description;
            get_string(KEY_DESCRIPTION, description, false);
            LOG_INF("%s: model name: %s\n", __func__, name.c_str());
            LOG_INF("%s: description: %s\n", __func__, description.c_str());
            LOG_INF("%s: GGUF version: %d\n", __func__, gguf_get_version(ctx_gguf.get()));
            LOG_INF("%s: alignment: %zu\n", __func__, gguf_get_alignment(ctx_gguf.get()));
            LOG_INF("%s: n_tensors: %d\n", __func__, n_tensors);
            LOG_INF("%s: n_kv: %d\n", __func__, (int)gguf_get_n_kv(ctx_gguf.get()));
            LOG_INF("\n");
        }

        // modalities
        {
            get_bool(KEY_HAS_VISION_ENC, has_vision, false);
            get_bool(KEY_HAS_AUDIO_ENC, has_audio, false);

            if (has_vision) {
                LOG_INF("%s: has vision encoder\n", __func__);
            }
            if (has_audio) {
                LOG_INF("%s: has audio encoder\n", __func__);
            }
        }

        // tensors
        {
            // accumulate total model size and log each tensor at debug level
            for (int i = 0; i < n_tensors; ++i) {
                const char * name = gguf_get_tensor_name(ctx_gguf.get(), i);
                const size_t offset = gguf_get_tensor_offset(ctx_gguf.get(), i);
                enum ggml_type type = gguf_get_tensor_type(ctx_gguf.get(), i);
                ggml_tensor * cur = ggml_get_tensor(meta, name);
                size_t tensor_size = ggml_nbytes(cur);
                model_size += tensor_size;
                LOG_DBG("%s: tensor[%d]: n_dims = %d, name = %s, tensor_size=%zu, offset=%zu, shape:[%" PRIu64 ", %" PRIu64 ", %" PRIu64 ", %" PRIu64 "], type = %s\n",
                    __func__, i, ggml_n_dims(cur), cur->name, tensor_size, offset, cur->ne[0], cur->ne[1], cur->ne[2], cur->ne[3], ggml_type_name(type));
            }
        }
    }

    // Reads all hyperparameters for the requested modality into model.hparams,
    // including the projector type and per-architecture defaults/overrides.
    void load_hparams(clip_model & model, clip_modality modality) {
        auto & hparams = model.hparams;
        std::string log_ffn_op; // for logging

        // sanity check
        if (modality == CLIP_MODALITY_VISION) {
            GGML_ASSERT(has_vision);
        } else if (modality == CLIP_MODALITY_AUDIO) {
            GGML_ASSERT(has_audio);
        }
        model.modality = modality;


        // projector type
        std::string proj_type;
        {
            // default key
            get_string(KEY_PROJ_TYPE, proj_type, false);

            // for models with mixed modalities
            if (proj_type.empty()) {
                if (modality == CLIP_MODALITY_VISION) {
                    get_string(KEY_VISION_PROJ_TYPE, proj_type, false);
                } else if (modality == CLIP_MODALITY_AUDIO) {
                    get_string(KEY_AUDIO_PROJ_TYPE, proj_type, false);
                } else {
                    GGML_ABORT("unknown modality");
                }
            }

            model.proj_type = clip_projector_type_from_string(proj_type);

            if (model.proj_type == PROJECTOR_TYPE_UNKNOWN) {
                throw std::runtime_error(string_format("%s: unknown projector type: %s\n", __func__, proj_type.c_str()));
            }

            // correct arch for multimodal models (legacy method)
            if (model.proj_type == PROJECTOR_TYPE_QWEN25O) {
                model.proj_type = modality == CLIP_MODALITY_VISION
                    ? PROJECTOR_TYPE_QWEN25VL
                    : PROJECTOR_TYPE_QWEN2A;
            }
        }

        const bool is_vision = model.modality == CLIP_MODALITY_VISION;
        const bool is_audio  = model.modality == CLIP_MODALITY_AUDIO;

        // other hparams
        {
            // gguf keys are prefixed by modality, e.g. "vision.embedding_length"
            const char * prefix = is_vision ?
"vision" : "audio"; + get_u32(string_format(KEY_N_EMBD, prefix), hparams.n_embd); + get_u32(string_format(KEY_N_HEAD, prefix), hparams.n_head); + get_u32(string_format(KEY_N_FF, prefix), hparams.n_ff); + get_u32(string_format(KEY_N_BLOCK, prefix), hparams.n_layer); + get_u32(string_format(KEY_PROJ_DIM, prefix), hparams.projection_dim); + get_f32(string_format(KEY_LAYER_NORM_EPS, prefix), hparams.eps); + + if (is_vision) { + get_u32(KEY_IMAGE_SIZE, hparams.image_size); + get_u32(KEY_PATCH_SIZE, hparams.patch_size); + get_u32(KEY_IMAGE_CROP_RESOLUTION, hparams.image_crop_resolution, false); + get_i32(KEY_MINICPMV_VERSION, hparams.minicpmv_version, false); // legacy + get_u32(KEY_MINICPMV_QUERY_NUM, hparams.minicpmv_query_num, false); + if (hparams.minicpmv_query_num == 0) { + // Fallback to hardcoded values for legacy models + if (hparams.minicpmv_version == 3) { + hparams.minicpmv_query_num = 64; + } else if (hparams.minicpmv_version == 4) { + hparams.minicpmv_query_num = 64; + } else if (hparams.minicpmv_version == 5) { + hparams.minicpmv_query_num = 64; + } else if (hparams.minicpmv_version == 6) { + hparams.minicpmv_query_num = 64; + } else if (hparams.minicpmv_version == 100045) { + hparams.minicpmv_query_num = 64; + } else { + hparams.minicpmv_query_num = 96; + } + } + } else if (is_audio) { + get_u32(KEY_A_NUM_MEL_BINS, hparams.n_mel_bins); + // some hparams are unused, but still need to set to avoid issues + hparams.image_size = 0; + hparams.patch_size = 1; + + } else { + GGML_ASSERT(false && "unknown modality"); + } + + // for pinpoints, we need to convert it into a list of resolution candidates + { + std::vector<int> pinpoints; + get_arr_int(KEY_IMAGE_GRID_PINPOINTS, pinpoints, false); + if (!pinpoints.empty()) { + for (size_t i = 0; i < pinpoints.size(); i += 2) { + hparams.image_res_candidates.push_back({ + pinpoints[i], + pinpoints[i+1], + }); + } + } + } + + // default warmup value + hparams.warmup_image_size = hparams.image_size; + + 
hparams.has_llava_projector = model.proj_type == PROJECTOR_TYPE_MLP + || model.proj_type == PROJECTOR_TYPE_MLP_NORM + || model.proj_type == PROJECTOR_TYPE_LDP + || model.proj_type == PROJECTOR_TYPE_LDPV2; + + { + bool use_gelu = false; + bool use_silu = false; + get_bool(KEY_USE_GELU, use_gelu, false); + get_bool(KEY_USE_SILU, use_silu, false); + if (use_gelu && use_silu) { + throw std::runtime_error(string_format("%s: both use_gelu and use_silu are set to true\n", __func__)); + } + if (use_gelu) { + hparams.ffn_op = FFN_GELU; + log_ffn_op = "gelu"; + } else if (use_silu) { + hparams.ffn_op = FFN_SILU; + log_ffn_op = "silu"; + } else { + hparams.ffn_op = FFN_GELU_QUICK; + log_ffn_op = "gelu_quick"; + } + } + + { + std::string mm_patch_merge_type; + get_string(KEY_MM_PATCH_MERGE_TYPE, mm_patch_merge_type, false); + if (mm_patch_merge_type == "spatial_unpad") { + hparams.mm_patch_merge_type = PATCH_MERGE_SPATIAL_UNPAD; + } + } + + if (is_vision) { + int idx_mean = gguf_find_key(ctx_gguf.get(), KEY_IMAGE_MEAN); + int idx_std = gguf_find_key(ctx_gguf.get(), KEY_IMAGE_STD); + GGML_ASSERT(idx_mean >= 0 && "image_mean not found"); + GGML_ASSERT(idx_std >= 0 && "image_std not found"); + const float * mean_data = (const float *) gguf_get_arr_data(ctx_gguf.get(), idx_mean); + const float * std_data = (const float *) gguf_get_arr_data(ctx_gguf.get(), idx_std); + for (int i = 0; i < 3; ++i) { + hparams.image_mean[i] = mean_data[i]; + hparams.image_std[i] = std_data[i]; + } + } + + // Load the vision feature layer indices if they are explicitly provided; + // if multiple vision feature layers are present, the values will be concatenated + // to form the final visual features. + // NOTE: gguf conversions should standardize the values of the vision feature layer to + // be non-negative, since we use -1 to mark values as unset here. 
            std::vector<int> vision_feature_layer;
            get_arr_int(KEY_FEATURE_LAYER, vision_feature_layer, false);
            // convert std::vector to std::unordered_set
            for (auto & layer : vision_feature_layer) {
                hparams.vision_feature_layer.insert(layer);
            }

            // model-specific params
            switch (model.proj_type) {
                case PROJECTOR_TYPE_MINICPMV:
                    {
                        if (hparams.minicpmv_version == 0) {
                            hparams.minicpmv_version = 2; // default to 2 if not set
                        }
                    } break;
                case PROJECTOR_TYPE_INTERNVL:
                    {
                        get_u32(KEY_PROJ_SCALE_FACTOR, hparams.n_merge, false);
                    } break;
                case PROJECTOR_TYPE_IDEFICS3:
                    {
                        get_u32(KEY_PROJ_SCALE_FACTOR, hparams.n_merge, false);
                        get_u32(KEY_PREPROC_IMAGE_SIZE, hparams.image_longest_edge, false);
                    } break;
                case PROJECTOR_TYPE_LFM2:
                    {
                        get_u32(KEY_PROJ_SCALE_FACTOR, hparams.n_merge, false);
                        // ref: https://huggingface.co/LiquidAI/LFM2.5-VL-1.6B/blob/main/processor_config.json
                        hparams.set_limit_image_tokens(64, 256);
                    } break;
                case PROJECTOR_TYPE_PIXTRAL:
                case PROJECTOR_TYPE_LIGHTONOCR:
                    {
                        // ref: https://huggingface.co/mistral-community/pixtral-12b/blob/main/preprocessor_config.json
                        // TODO: verify the image_min_tokens
                        hparams.n_merge = 1; // the original pixtral does not use patch merging
                        hparams.rope_theta = 10000.0f;
                        get_u32(KEY_SPATIAL_MERGE_SIZE, hparams.n_merge, false);
                        hparams.set_limit_image_tokens(8, 1024);
                        hparams.set_warmup_n_tokens(256); // avoid OOM on warmup
                    } break;
                case PROJECTOR_TYPE_KIMIVL:
                    {
                        hparams.rope_theta = 10000.0f;
                        get_u32(KEY_PROJ_SCALE_FACTOR, hparams.n_merge, false);
                        // TODO: check kimivl preprocessor for exact values
                        hparams.set_limit_image_tokens(8, 1024);
                        hparams.set_warmup_n_tokens(256); // avoid OOM on warmup
                    } break;
                case PROJECTOR_TYPE_KIMIK25:
                    {
                        hparams.rope_theta = 10000.0f;
                        get_u32(KEY_PROJ_SCALE_FACTOR, hparams.n_merge, false);

                        // pixel limits may come from the gguf; fall back to a
                        // token-count limit when they are absent
                        int min_pixels = 0, max_pixels = 0;
                        get_u32(KEY_IMAGE_MIN_PIXELS, min_pixels, false);
                        get_u32(KEY_IMAGE_MAX_PIXELS, max_pixels, false);
                        if (min_pixels > 0 && max_pixels > 0) {
                            hparams.image_min_pixels = min_pixels;
                            hparams.image_max_pixels = max_pixels;
                            hparams.warmup_image_size = static_cast<int>(std::sqrt(max_pixels));
                        } else {
                            hparams.set_limit_image_tokens(2, 4096);
                        }
                    } break;
                case PROJECTOR_TYPE_GEMMA3:
                    {
                        // default value (used by all model sizes in gemma 3 family)
                        // number of patches for each **side** is reduced by a factor of 4
                        hparams.n_merge = 4;
                        // test model (tinygemma3) has a different value, we optionally read it
                        get_u32(KEY_PROJ_SCALE_FACTOR, hparams.n_merge, false);
                    } break;

                case PROJECTOR_TYPE_GEMMA3NV:
                    {
                        // Gemma3n uses MobileNetV5 which produces 256 tokens (16x16)
                        // Similar configuration to Gemma3
                        hparams.n_merge = 1; // MobileNetV5 handles resizing internally
                        get_u32(KEY_PROJ_SCALE_FACTOR, hparams.n_merge, false);
                    } break;
                case PROJECTOR_TYPE_QWEN2VL:
                case PROJECTOR_TYPE_QWEN25VL:
                case PROJECTOR_TYPE_QWEN3VL:
                    {
                        hparams.n_merge = 2; // default value for Qwen 2 and 2.5
                        get_u32(KEY_SPATIAL_MERGE_SIZE, hparams.n_merge, false);
                        get_u32(KEY_WIN_ATTN_PATTERN, hparams.n_wa_pattern, model.proj_type == PROJECTOR_TYPE_QWEN25VL); // only 2.5 requires it
                        // ref: https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct/blob/main/preprocessor_config.json
                        hparams.set_limit_image_tokens(8, 4096);
                        hparams.set_warmup_n_tokens(46*46); // avoid OOM on warmup
                        const int warn_min_pixels = 1024 * hparams.n_merge * hparams.n_merge * hparams.patch_size * hparams.patch_size;
                        if (hparams.image_min_pixels < warn_min_pixels) {
                            LOG_WRN("%s: Qwen-VL models require at minimum 1024 image tokens to function correctly on grounding tasks\n", __func__);
                            LOG_WRN("%s: if you encounter problems with accuracy, try adding --image-min-tokens 1024\n", __func__);
                            LOG_WRN("%s: more info: https://github.com/ggml-org/llama.cpp/issues/16842\n\n", __func__);
                        }
                    } break;
                case PROJECTOR_TYPE_YOUTUVL:
                    {
                        hparams.n_merge = 2;
                        get_u32(KEY_SPATIAL_MERGE_SIZE, hparams.n_merge, false);
                        get_u32(KEY_ATTN_WINDOW_SIZE, hparams.attn_window_size, true);
                        std::vector<int> wa_layer_indexes_vec;
                        get_arr_int(KEY_WIN_ATTN_LAYER_INDEXES, wa_layer_indexes_vec, true);
                        for (auto & layer : wa_layer_indexes_vec) {
                            hparams.wa_layer_indexes.insert(layer);
                        }
                        // support max_height * max_width = 8000 * 8000. 8000/16/2 = 250 image tokens
                        hparams.set_limit_image_tokens(1, 62500);
                        hparams.set_warmup_n_tokens(16*16); // avoid OOM on warmup
                    } break;
                case PROJECTOR_TYPE_GLM4V:
                    {
                        hparams.rope_theta = 10000.0f;
                        hparams.n_merge = 2; // default value for GLM4-V
                        get_u32(KEY_SPATIAL_MERGE_SIZE, hparams.n_merge, false);
                        hparams.set_limit_image_tokens(8, 4096);
                        hparams.set_warmup_n_tokens(46*46); // avoid OOM on warmup
                    } break;
                case PROJECTOR_TYPE_LLAMA4:
                    {
                        hparams.rope_theta = 10000.0f;
                        get_u32(KEY_PROJ_SCALE_FACTOR, hparams.n_merge, false);
                        set_llava_uhd_res_candidates(model, 3);
                    } break;
                case PROJECTOR_TYPE_ULTRAVOX:
                case PROJECTOR_TYPE_QWEN2A:
                case PROJECTOR_TYPE_GLMA:
                case PROJECTOR_TYPE_VOXTRAL:
                case PROJECTOR_TYPE_MUSIC_FLAMINGO:
                    {
                        // only these three architectures require a stack factor key
                        bool require_stack = model.proj_type == PROJECTOR_TYPE_ULTRAVOX ||
                                             model.proj_type == PROJECTOR_TYPE_VOXTRAL ||
                                             model.proj_type == PROJECTOR_TYPE_GLMA;
                        get_u32(KEY_A_PROJ_STACK_FACTOR, hparams.proj_stack_factor, require_stack);
                        hparams.ffn_op = FFN_GELU_ERF;
                        log_ffn_op = "gelu_erf"; // temporary solution for logging

                        // audio preprocessing params
                        hparams.audio_chunk_len   = 30; // in seconds
                        hparams.audio_sample_rate = 16000;
                        hparams.audio_n_fft       = 400;
                        hparams.audio_window_len  = 400;
                        hparams.audio_hop_len     = 160;
                    } break;
                case PROJECTOR_TYPE_LFM2A:
                    {
                        // audio preprocessing params
                        hparams.audio_chunk_len   = 1; // in seconds
                        hparams.audio_sample_rate = 16000;
                        hparams.audio_n_fft       = 512;
                        hparams.audio_window_len  = 400;
                        hparams.audio_hop_len     = 160;
                    } break;
                default:
                    break;
            }

            // sanity check
            {
                if (hparams.image_max_pixels < hparams.image_min_pixels) {
                    throw std::runtime_error(string_format("%s: image_max_pixels (%d) is less than image_min_pixels (%d)\n", __func__, hparams.image_max_pixels, hparams.image_min_pixels));
                }
            }

            // summary of the loaded hyperparameters
            LOG_INF("%s: projector: %s\n", __func__, proj_type.c_str());
            LOG_INF("%s: n_embd: %d\n", __func__, hparams.n_embd);
            LOG_INF("%s: n_head: %d\n", __func__, hparams.n_head);
            LOG_INF("%s: n_ff: %d\n", __func__, hparams.n_ff);
            LOG_INF("%s: n_layer: %d\n", __func__, hparams.n_layer);
            LOG_INF("%s: ffn_op: %s\n", __func__, log_ffn_op.c_str());
            LOG_INF("%s: projection_dim: %d\n", __func__, hparams.projection_dim);
            if (is_vision) {
                LOG_INF("\n--- vision hparams ---\n");
                LOG_INF("%s: image_size: %d\n", __func__, hparams.image_size);
                LOG_INF("%s: patch_size: %d\n", __func__, hparams.patch_size);
                LOG_INF("%s: has_llava_proj: %d\n", __func__, hparams.has_llava_projector);
                LOG_INF("%s: minicpmv_version: %d\n", __func__, hparams.minicpmv_version);
                LOG_INF("%s: n_merge: %d\n", __func__, hparams.n_merge);
                LOG_INF("%s: n_wa_pattern: %d\n", __func__, hparams.n_wa_pattern);
                if (!hparams.wa_layer_indexes.empty()) {
                    LOG_INF("%s: wa_layer_indexes: ", __func__);
                    for (auto & layer : hparams.wa_layer_indexes) {
                        LOG_INF("%d ", layer);
                    }
                    LOG_INF("\n");
                }
                if (hparams.image_min_pixels > 0) {
                    LOG_INF("%s: image_min_pixels: %d%s\n", __func__, hparams.image_min_pixels, hparams.custom_image_min_tokens > 0 ? " (custom value)" : "");
                }
                if (hparams.image_max_pixels > 0) {
                    LOG_INF("%s: image_max_pixels: %d%s\n", __func__, hparams.image_max_pixels, hparams.custom_image_max_tokens > 0 ?
" (custom value)" : ""); + } + } else if (is_audio) { + LOG_INF("\n--- audio hparams ---\n"); + LOG_INF("%s: n_mel_bins: %d\n", __func__, hparams.n_mel_bins); + LOG_INF("%s: proj_stack_factor: %d\n", __func__, hparams.proj_stack_factor); + LOG_INF("%s: audio_chunk_len: %d\n", __func__, hparams.audio_chunk_len); + LOG_INF("%s: audio_sample_rate: %d\n", __func__, hparams.audio_sample_rate); + LOG_INF("%s: audio_n_fft: %d\n", __func__, hparams.audio_n_fft); + LOG_INF("%s: audio_window_len: %d\n", __func__, hparams.audio_window_len); + LOG_INF("%s: audio_hop_len: %d\n", __func__, hparams.audio_hop_len); + } + LOG_INF("\n"); + LOG_INF("%s: model size: %.2f MiB\n", __func__, model_size / 1024.0 / 1024.0); + LOG_INF("%s: metadata size: %.2f MiB\n", __func__, ggml_get_mem_size(ctx_meta.get()) / 1024.0 / 1024.0); + } + } + + void load_tensors(clip_ctx & ctx_clip) { + auto & model = ctx_clip.model; + auto & hparams = model.hparams; + std::map<std::string, size_t> tensor_offset; + std::vector<ggml_tensor *> tensors_to_load; + + // TODO @ngxson : support both audio and video in the future + const char * prefix = model.modality == CLIP_MODALITY_AUDIO ? 
"a" : "v"; + + // get offsets + for (int64_t i = 0; i < gguf_get_n_tensors(ctx_gguf.get()); ++i) { + const char * name = gguf_get_tensor_name(ctx_gguf.get(), i); + tensor_offset[name] = gguf_get_data_offset(ctx_gguf.get()) + gguf_get_tensor_offset(ctx_gguf.get(), i); + } + + // create data context + struct ggml_init_params params = { + /*.mem_size =*/ static_cast<size_t>(gguf_get_n_tensors(ctx_gguf.get()) + 1) * ggml_tensor_overhead(), + /*.mem_buffer =*/ NULL, + /*.no_alloc =*/ true, + }; + ctx_clip.ctx_data.reset(ggml_init(params)); + if (!ctx_clip.ctx_data) { + throw std::runtime_error(string_format("%s: failed to init ggml context\n", __func__)); + } + + // helper function + auto get_tensor = [&](const std::string & name, bool required = true) { + ggml_tensor * cur = ggml_get_tensor(ctx_meta.get(), name.c_str()); + if (!cur && required) { + throw std::runtime_error(string_format("%s: unable to find tensor %s\n", __func__, name.c_str())); + } + if (cur) { + tensors_to_load.push_back(cur); + // add tensors to context + ggml_tensor * data_tensor = ggml_dup_tensor(ctx_clip.ctx_data.get(), cur); + ggml_set_name(data_tensor, cur->name); + cur = data_tensor; + } + return cur; + }; + + model.class_embedding = get_tensor(TN_CLASS_EMBD, false); + + model.pre_ln_w = get_tensor(string_format(TN_LN_PRE, prefix, "weight"), false); + model.pre_ln_b = get_tensor(string_format(TN_LN_PRE, prefix, "bias"), false); + + model.post_ln_w = get_tensor(string_format(TN_LN_POST, prefix, "weight"), false); + model.post_ln_b = get_tensor(string_format(TN_LN_POST, prefix, "bias"), false); + + model.patch_bias = get_tensor(TN_PATCH_BIAS, false); + model.patch_embeddings_0 = get_tensor(TN_PATCH_EMBD, false); + model.patch_embeddings_1 = get_tensor(TN_PATCH_EMBD_1, false); + + model.norm_embd_w = get_tensor(string_format(TN_NORM_EMBD, "weight"), false); + model.norm_embd_b = get_tensor(string_format(TN_NORM_EMBD, "bias"), false); + + model.position_embeddings = 
get_tensor(string_format(TN_POS_EMBD, prefix), false); + + if (model.proj_type == PROJECTOR_TYPE_GEMMA3NV) { + hparams.n_layer = 0; // gemma3n does not use normal layer structure + } + + // layers + model.layers.resize(hparams.n_layer); + for (int il = 0; il < hparams.n_layer; ++il) { + auto & layer = model.layers[il]; + layer.k_w = get_tensor(string_format(TN_ATTN_K, prefix, il, "weight"), false); + layer.q_w = get_tensor(string_format(TN_ATTN_Q, prefix, il, "weight"), false); + layer.v_w = get_tensor(string_format(TN_ATTN_V, prefix, il, "weight"), false); + layer.o_w = get_tensor(string_format(TN_ATTN_OUTPUT, prefix, il, "weight")); + layer.qkv_w = get_tensor(string_format(TN_ATTN_QKV, prefix, il, "weight"), false); + layer.k_norm = get_tensor(string_format(TN_ATTN_K_NORM, prefix, il, "weight"), false); + layer.q_norm = get_tensor(string_format(TN_ATTN_Q_NORM, prefix, il, "weight"), false); + layer.ln_1_w = get_tensor(string_format(TN_LN_1, prefix, il, "weight"), false); + layer.ln_2_w = get_tensor(string_format(TN_LN_2, prefix, il, "weight"), false); + layer.ls_1_w = get_tensor(string_format(TN_LS_1, prefix, il, "weight"), false); // no bias + layer.ls_2_w = get_tensor(string_format(TN_LS_2, prefix, il, "weight"), false); // no bias + + layer.k_b = get_tensor(string_format(TN_ATTN_K, prefix, il, "bias"), false); + layer.q_b = get_tensor(string_format(TN_ATTN_Q, prefix, il, "bias"), false); + layer.v_b = get_tensor(string_format(TN_ATTN_V, prefix, il, "bias"), false); + layer.o_b = get_tensor(string_format(TN_ATTN_OUTPUT, prefix, il, "bias"), false); + layer.qkv_b = get_tensor(string_format(TN_ATTN_QKV, prefix, il, "bias"), false); + layer.ln_1_b = get_tensor(string_format(TN_LN_1, prefix, il, "bias"), false); + layer.ln_2_b = get_tensor(string_format(TN_LN_2, prefix, il, "bias"), false); + + // ffn + layer.ff_up_w = get_tensor(string_format(TN_FFN_UP, prefix, il, "weight")); + layer.ff_up_b = get_tensor(string_format(TN_FFN_UP, prefix, il, "bias"), false); + 
layer.ff_gate_w = get_tensor(string_format(TN_FFN_GATE, prefix, il, "weight"), false); + layer.ff_gate_b = get_tensor(string_format(TN_FFN_GATE, prefix, il, "bias"), false); + layer.ff_down_w = get_tensor(string_format(TN_FFN_DOWN, prefix, il, "weight")); + layer.ff_down_b = get_tensor(string_format(TN_FFN_DOWN, prefix, il, "bias"), false); + + + // qwen3vl deepstack layer + layer.deepstack_norm_w = get_tensor(string_format(TN_DEEPSTACK_NORM, il, "weight"), false); + layer.deepstack_norm_b = get_tensor(string_format(TN_DEEPSTACK_NORM, il, "bias"), false); + layer.deepstack_fc1_w = get_tensor(string_format(TN_DEEPSTACK_FC1, il, "weight"), false); + layer.deepstack_fc1_b = get_tensor(string_format(TN_DEEPSTACK_FC1, il, "bias"), false); + layer.deepstack_fc2_w = get_tensor(string_format(TN_DEEPSTACK_FC2, il, "weight"), false); + layer.deepstack_fc2_b = get_tensor(string_format(TN_DEEPSTACK_FC2, il, "bias"), false); + if (layer.has_deepstack()) { + model.n_deepstack_layers++; + } + + // some models already exported with legacy (incorrect) naming which is quite messy, let's fix it here + // note: Qwen model converted from the old surgery script has n_ff = 0, so we cannot use n_ff to check! 
+ bool is_ffn_swapped = ( + // only old models need this fix + model.proj_type == PROJECTOR_TYPE_MLP + || model.proj_type == PROJECTOR_TYPE_MLP_NORM + || model.proj_type == PROJECTOR_TYPE_LDP + || model.proj_type == PROJECTOR_TYPE_LDPV2 + || model.proj_type == PROJECTOR_TYPE_QWEN2VL + || model.proj_type == PROJECTOR_TYPE_QWEN25VL + || model.proj_type == PROJECTOR_TYPE_GLM_EDGE + || model.proj_type == PROJECTOR_TYPE_GEMMA3 + || model.proj_type == PROJECTOR_TYPE_IDEFICS3 + || model.proj_type == PROJECTOR_TYPE_MINICPMV + ) && layer.ff_up_w && layer.ff_down_w && layer.ff_down_w->ne[0] == hparams.n_embd; + if (is_ffn_swapped) { + // swap up and down weights + ggml_tensor * tmp = layer.ff_up_w; + layer.ff_up_w = layer.ff_down_w; + layer.ff_down_w = tmp; + // swap up and down biases + tmp = layer.ff_up_b; + layer.ff_up_b = layer.ff_down_b; + layer.ff_down_b = tmp; + if (il == 0) { + LOG_WRN("%s: ffn up/down are swapped\n", __func__); + } + } + } + + + switch (model.proj_type) { + case PROJECTOR_TYPE_MLP: + case PROJECTOR_TYPE_MLP_NORM: + { + // LLaVA projection + model.mm_0_w = get_tensor(string_format(TN_LLAVA_PROJ, 0, "weight"), false); + model.mm_0_b = get_tensor(string_format(TN_LLAVA_PROJ, 0, "bias"), false); + // Yi-type llava + model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 1, "weight"), false); + model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 1, "bias"), false); + // missing in Yi-type llava + model.mm_2_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight"), false); + model.mm_2_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias"), false); + // Yi-type llava + model.mm_3_w = get_tensor(string_format(TN_LLAVA_PROJ, 3, "weight"), false); + model.mm_3_b = get_tensor(string_format(TN_LLAVA_PROJ, 3, "bias"), false); + model.mm_4_w = get_tensor(string_format(TN_LLAVA_PROJ, 4, "weight"), false); + model.mm_4_b = get_tensor(string_format(TN_LLAVA_PROJ, 4, "bias"), false); + if (model.mm_3_w) { + // TODO: this is a hack to support Yi-type llava + 
model.proj_type = PROJECTOR_TYPE_MLP_NORM; + } + model.image_newline = get_tensor(TN_IMAGE_NEWLINE, false); + } break; + case PROJECTOR_TYPE_LDP: + { + // MobileVLM projection + model.mm_model_mlp_1_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 1, "weight")); + model.mm_model_mlp_1_b = get_tensor(string_format(TN_MVLM_PROJ_MLP, 1, "bias")); + model.mm_model_mlp_3_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 3, "weight")); + model.mm_model_mlp_3_b = get_tensor(string_format(TN_MVLM_PROJ_MLP, 3, "bias")); + model.mm_model_block_1_block_0_0_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 1, 0, "0.weight")); + model.mm_model_block_1_block_0_1_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 1, 0, "1.weight")); + model.mm_model_block_1_block_0_1_b = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 1, 0, "1.bias")); + model.mm_model_block_1_block_1_fc1_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 1, 1, "fc1.weight")); + model.mm_model_block_1_block_1_fc1_b = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 1, 1, "fc1.bias")); + model.mm_model_block_1_block_1_fc2_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 1, 1, "fc2.weight")); + model.mm_model_block_1_block_1_fc2_b = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 1, 1, "fc2.bias")); + model.mm_model_block_1_block_2_0_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 1, 2, "0.weight")); + model.mm_model_block_1_block_2_1_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 1, 2, "1.weight")); + model.mm_model_block_1_block_2_1_b = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 1, 2, "1.bias")); + model.mm_model_block_2_block_0_0_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 2, 0, "0.weight")); + model.mm_model_block_2_block_0_1_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 2, 0, "1.weight")); + model.mm_model_block_2_block_0_1_b = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 2, 0, "1.bias")); + model.mm_model_block_2_block_1_fc1_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 2, 1, "fc1.weight")); + 
model.mm_model_block_2_block_1_fc1_b = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 2, 1, "fc1.bias")); + model.mm_model_block_2_block_1_fc2_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 2, 1, "fc2.weight")); + model.mm_model_block_2_block_1_fc2_b = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 2, 1, "fc2.bias")); + model.mm_model_block_2_block_2_0_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 2, 2, "0.weight")); + model.mm_model_block_2_block_2_1_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 2, 2, "1.weight")); + model.mm_model_block_2_block_2_1_b = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 2, 2, "1.bias")); + } break; + case PROJECTOR_TYPE_LDPV2: + { + // MobilVLM_V2 projection + model.mm_model_mlp_0_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 0, "weight")); + model.mm_model_mlp_0_b = get_tensor(string_format(TN_MVLM_PROJ_MLP, 0, "bias")); + model.mm_model_mlp_2_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 2, "weight")); + model.mm_model_mlp_2_b = get_tensor(string_format(TN_MVLM_PROJ_MLP, 2, "bias")); + model.mm_model_peg_0_w = get_tensor(string_format(TN_MVLM_PROJ_PEG, 0, "weight")); + model.mm_model_peg_0_b = get_tensor(string_format(TN_MVLM_PROJ_PEG, 0, "bias")); + } break; + case PROJECTOR_TYPE_MINICPMV: + { + // model.mm_model_pos_embed = get_tensor(new_clip->ctx_data, TN_MINICPMV_POS_EMBD); + model.mm_model_pos_embed_k = get_tensor(TN_MINICPMV_POS_EMBD_K); + model.mm_model_query = get_tensor(TN_MINICPMV_QUERY); + model.mm_model_proj = get_tensor(TN_MINICPMV_PROJ); + model.mm_model_kv_proj = get_tensor(TN_MINICPMV_KV_PROJ); + model.mm_model_attn_q_w = get_tensor(string_format(TN_MINICPMV_ATTN, "q", "weight")); + model.mm_model_attn_k_w = get_tensor(string_format(TN_MINICPMV_ATTN, "k", "weight")); + model.mm_model_attn_v_w = get_tensor(string_format(TN_MINICPMV_ATTN, "v", "weight")); + model.mm_model_attn_q_b = get_tensor(string_format(TN_MINICPMV_ATTN, "q", "bias")); + model.mm_model_attn_k_b = get_tensor(string_format(TN_MINICPMV_ATTN, "k", 
"bias")); + model.mm_model_attn_v_b = get_tensor(string_format(TN_MINICPMV_ATTN, "v", "bias")); + model.mm_model_attn_o_w = get_tensor(string_format(TN_MINICPMV_ATTN, "out", "weight")); + model.mm_model_attn_o_b = get_tensor(string_format(TN_MINICPMV_ATTN, "out", "bias")); + model.mm_model_ln_q_w = get_tensor(string_format(TN_MINICPMV_LN, "q", "weight")); + model.mm_model_ln_q_b = get_tensor(string_format(TN_MINICPMV_LN, "q", "bias")); + model.mm_model_ln_kv_w = get_tensor(string_format(TN_MINICPMV_LN, "kv", "weight")); + model.mm_model_ln_kv_b = get_tensor(string_format(TN_MINICPMV_LN, "kv", "bias")); + model.mm_model_ln_post_w = get_tensor(string_format(TN_MINICPMV_LN, "post", "weight")); + model.mm_model_ln_post_b = get_tensor(string_format(TN_MINICPMV_LN, "post", "bias")); + } break; + case PROJECTOR_TYPE_GLM_EDGE: + { + model.mm_model_adapter_conv_w = get_tensor(string_format(TN_GLM_ADAPER_CONV, "weight")); + model.mm_model_adapter_conv_b = get_tensor(string_format(TN_GLM_ADAPER_CONV, "bias")); + model.mm_model_mlp_0_w = get_tensor(string_format(TN_GLM_ADAPTER_LINEAR, "weight")); + model.mm_model_ln_q_w = get_tensor(string_format(TN_GLM_ADAPTER_NORM_1, "weight")); + model.mm_model_ln_q_b = get_tensor(string_format(TN_GLM_ADAPTER_NORM_1, "bias")); + model.mm_model_mlp_1_w = get_tensor(string_format(TN_GLM_ADAPTER_D_H_2_4H, "weight")); + model.mm_model_mlp_2_w = get_tensor(string_format(TN_GLM_ADAPTER_GATE, "weight")); + model.mm_model_mlp_3_w = get_tensor(string_format(TN_GLM_ADAPTER_D_4H_2_H, "weight")); + model.mm_boi = get_tensor(string_format(TN_TOK_GLM_BOI)); + model.mm_eoi = get_tensor(string_format(TN_TOK_GLM_EOI)); + } break; + case PROJECTOR_TYPE_QWEN2VL: + case PROJECTOR_TYPE_QWEN25VL: + { + model.mm_0_w = get_tensor(string_format(TN_LLAVA_PROJ, 0, "weight")); + model.mm_0_b = get_tensor(string_format(TN_LLAVA_PROJ, 0, "bias")); + model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight")); + model.mm_1_b = 
get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias")); + } break; + case PROJECTOR_TYPE_QWEN3VL: + { + model.mm_0_w = get_tensor(string_format(TN_LLAVA_PROJ, 0, "weight")); + model.mm_0_b = get_tensor(string_format(TN_LLAVA_PROJ, 0, "bias")); + model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight")); + model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias")); + } break; + case PROJECTOR_TYPE_YOUTUVL: + { + model.mm_input_norm_w = get_tensor(TN_MM_INP_NORM); // merger.ln_q (RMS norm) + model.mm_0_w = get_tensor(string_format(TN_LLAVA_PROJ, 0, "weight")); // merger.mlp.0 + model.mm_0_b = get_tensor(string_format(TN_LLAVA_PROJ, 0, "bias")); + model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight")); // merger.mlp.2 + model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias")); + } break; + case PROJECTOR_TYPE_GLM4V: + { + model.projection = get_tensor(TN_MM_PROJECTOR); + model.mm_ffn_up_w = get_tensor(string_format(TN_MM_UP, "weight")); + model.mm_ffn_up_b = get_tensor(string_format(TN_MM_UP, "bias"), false); + model.mm_ffn_gate_w = get_tensor(string_format(TN_MM_GATE, "weight")); + model.mm_ffn_gate_b = get_tensor(string_format(TN_MM_GATE, "bias"), false); + model.mm_ffn_down_w = get_tensor(string_format(TN_MM_DOWN, "weight")); + model.mm_ffn_down_b = get_tensor(string_format(TN_MM_DOWN, "bias"), false); + model.mm_post_norm_w = get_tensor(string_format(TN_MM_POST_NORM, "weight")); + model.mm_post_norm_b = get_tensor(string_format(TN_MM_POST_NORM, "bias"), false); + model.mm_patch_merger_w = get_tensor(string_format(TN_MM_PATCH_MERGER, "weight")); + model.mm_patch_merger_b = get_tensor(string_format(TN_MM_PATCH_MERGER, "bias")); + } break; + case PROJECTOR_TYPE_GEMMA3: + { + model.mm_input_proj_w = get_tensor(TN_MM_INP_PROJ); + model.mm_soft_emb_norm_w = get_tensor(TN_MM_SOFT_EMB_N); + } break; + case PROJECTOR_TYPE_GEMMA3NV: + { + model.mobilenet_stem_conv_w = get_tensor(TN_MNV5_STEM_CONV, false); + model.mobilenet_stem_conv_b 
= get_tensor(TN_MNV5_STEM_BIAS, false); + model.mobilenet_stem_norm_w = get_tensor(TN_MNV5_STEM_BN, false); + + model.msfa_ffn_expand_w = get_tensor(TN_MNV5_MSFA_FFN_EXP_W, false); + model.msfa_ffn_expand_bn = get_tensor(TN_MNV5_MSFA_FFN_EXP_BN, false); // Consume BN if present but likely folded + model.msfa_ffn_project_w = get_tensor(TN_MNV5_MSFA_FFN_PROJ_W, false); + model.msfa_ffn_project_bn = get_tensor(TN_MNV5_MSFA_FFN_PROJ_BN, false); + + model.msfa_concat_norm_w = get_tensor(TN_MNV5_MSFA_NORM, false); + + // Dynamically load blocks stage by stage + for (int stage = 0; stage < 4; ++stage) { + int blocks_found_in_stage = 0; + + for (int blk_idx = 0; ; ++blk_idx) { + bool found_block = false; + mobilenetv5_block block; + + // 1. Check for Edge Residual (S0) + block.s0_conv_exp_w = get_tensor(string_format(TN_MNV5_BLK_S0_EXP_W, stage, blk_idx), false); + if (block.s0_conv_exp_w) { + found_block = true; + block.s0_bn1_w = get_tensor(string_format(TN_MNV5_BLK_S0_BN1_W, stage, blk_idx), false); + block.s0_conv_pwl_w = get_tensor(string_format(TN_MNV5_BLK_S0_PWL_W, stage, blk_idx), false); + block.s0_bn2_w = get_tensor(string_format(TN_MNV5_BLK_S0_BN2_W, stage, blk_idx), false); + } + // 2. 
Check for UIR (Universal Inverted Residual) + else { + // Check for dw_start OR pw_exp (some UIR blocks skip dw_start) + block.dw_start_w = get_tensor(string_format(TN_MNV5_BLK_DW_START_W, stage, blk_idx), false); + block.pw_exp_w = get_tensor(string_format(TN_MNV5_BLK_PW_EXP_W, stage, blk_idx), false); + + if (block.dw_start_w || block.pw_exp_w) { + found_block = true; + if (block.dw_start_w) { + block.dw_start_bn_w = get_tensor(string_format(TN_MNV5_BLK_DW_START_BN, stage, blk_idx), false); + } + if (block.pw_exp_w) { + block.pw_exp_bn_w = get_tensor(string_format(TN_MNV5_BLK_PW_EXP_BN, stage, blk_idx), false); + } + block.dw_mid_w = get_tensor(string_format(TN_MNV5_BLK_DW_MID_W, stage, blk_idx), false); + if (block.dw_mid_w) { + block.dw_mid_bn_w = get_tensor(string_format(TN_MNV5_BLK_DW_MID_BN, stage, blk_idx), false); + } + block.pw_proj_w = get_tensor(string_format(TN_MNV5_BLK_PW_PROJ_W, stage, blk_idx), false); + if (block.pw_proj_w) { + block.pw_proj_bn_w = get_tensor(string_format(TN_MNV5_BLK_PW_PROJ_BN, stage, blk_idx), false); + } + block.layer_scale_w = get_tensor(string_format(TN_MNV5_BLK_LAYER_SCALE, stage, blk_idx), false); + } + } + + // 3. 
Check for Attention (MQA) + // Even if UIR/Edge check failed, this might be a pure attention block + ggml_tensor* attn_q_check = get_tensor(string_format(TN_MNV5_ATTN_Q_W, stage, blk_idx), false); + if (attn_q_check) { + found_block = true; + block.attn_q_w = attn_q_check; + block.attn_k_w = get_tensor(string_format(TN_MNV5_ATTN_K_W, stage, blk_idx), false); + block.attn_v_w = get_tensor(string_format(TN_MNV5_ATTN_V_W, stage, blk_idx), false); + block.attn_o_w = get_tensor(string_format(TN_MNV5_ATTN_O_W, stage, blk_idx), false); + block.attn_k_dw_w = get_tensor(string_format(TN_MNV5_ATTN_K_DW, stage, blk_idx), false); + block.attn_k_norm_w = get_tensor(string_format(TN_MNV5_ATTN_K_NORM, stage, blk_idx), false); + block.attn_v_dw_w = get_tensor(string_format(TN_MNV5_ATTN_V_DW, stage, blk_idx), false); + block.attn_v_norm_w = get_tensor(string_format(TN_MNV5_ATTN_V_NORM, stage, blk_idx), false); + block.attn_norm_w = get_tensor(string_format(TN_MNV5_ATTN_NORM, stage, blk_idx), false); + // Note: Attention blocks also have layer_scale, load it if not already loaded by UIR check + if (!block.layer_scale_w) { + block.layer_scale_w = get_tensor(string_format(TN_MNV5_BLK_LAYER_SCALE, stage, blk_idx), false); + } + } + + if (found_block) { + model.mobilenet_blocks.push_back(block); + blocks_found_in_stage++; + } else { + // End of blocks for this stage + break; + } + } + + // Track where this stage ends in the flat vector + if (blocks_found_in_stage > 0) { + model.mobilenet_stage_ends.push_back(model.mobilenet_blocks.size() - 1); + LOG_INF("%s: Stage %d ended at global block index %zu\n", __func__, stage, model.mobilenet_blocks.size() - 1); + } + } + model.mm_input_proj_w = get_tensor(TN_MM_INP_PROJ); + model.mm_soft_emb_norm_w = get_tensor(TN_MM_SOFT_EMB_N); + } break; + case PROJECTOR_TYPE_IDEFICS3: + { + model.projection = get_tensor(TN_MM_PROJECTOR); + } break; + case PROJECTOR_TYPE_LFM2: + { + model.mm_input_norm_w = get_tensor(TN_MM_INP_NORM, false); + 
model.mm_input_norm_b = get_tensor(TN_MM_INP_NORM_B, false); + model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 1, "weight")); + model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 1, "bias")); + model.mm_2_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight")); + model.mm_2_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias")); + } break; + case PROJECTOR_TYPE_KIMIVL: + case PROJECTOR_TYPE_KIMIK25: + { + model.mm_input_norm_w = get_tensor(TN_MM_INP_NORM); + model.mm_input_norm_b = get_tensor(TN_MM_INP_NORM_B); + model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 1, "weight")); + model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 1, "bias")); + model.mm_2_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight")); + model.mm_2_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias")); + } break; + case PROJECTOR_TYPE_PIXTRAL: + { + model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 1, "weight")); + model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 1, "bias"), false); + model.mm_2_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight")); + model.mm_2_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias"), false); + // [IMG_BREAK] token embedding + model.token_embd_img_break = get_tensor(TN_TOK_IMG_BREAK); + // for mistral small 3.1 + model.mm_input_norm_w = get_tensor(TN_MM_INP_NORM, false); + model.mm_patch_merger_w = get_tensor(string_format(TN_MM_PATCH_MERGER, "weight"), false); + } break; + case PROJECTOR_TYPE_LIGHTONOCR: + { + model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 1, "weight")); + model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 1, "bias"), false); + model.mm_2_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight")); + model.mm_2_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias"), false); + model.mm_input_norm_w = get_tensor(TN_MM_INP_NORM, false); + model.mm_patch_merger_w = get_tensor(string_format(TN_MM_PATCH_MERGER, "weight"), false); + } break; + case PROJECTOR_TYPE_ULTRAVOX: + { + model.conv1d_1_w 
= get_tensor(string_format(TN_CONV1D, 1, "weight")); + model.conv1d_1_b = get_tensor(string_format(TN_CONV1D, 1, "bias")); + model.conv1d_2_w = get_tensor(string_format(TN_CONV1D, 2, "weight")); + model.conv1d_2_b = get_tensor(string_format(TN_CONV1D, 2, "bias")); + model.mm_1_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 1, "weight")); + model.mm_2_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 2, "weight")); + model.mm_norm_pre_w = get_tensor(string_format(TN_MM_NORM_PRE, "weight")); + model.mm_norm_mid_w = get_tensor(string_format(TN_MM_NORM_MID, "weight")); + } break; + case PROJECTOR_TYPE_QWEN2A: + { + model.conv1d_1_w = get_tensor(string_format(TN_CONV1D, 1, "weight")); + model.conv1d_1_b = get_tensor(string_format(TN_CONV1D, 1, "bias")); + model.conv1d_2_w = get_tensor(string_format(TN_CONV1D, 2, "weight")); + model.conv1d_2_b = get_tensor(string_format(TN_CONV1D, 2, "bias")); + model.mm_fc_w = get_tensor(string_format(TN_MM_AUDIO_FC, "weight")); + model.mm_fc_b = get_tensor(string_format(TN_MM_AUDIO_FC, "bias")); + } break; + case PROJECTOR_TYPE_VOXTRAL: + { + model.conv1d_1_w = get_tensor(string_format(TN_CONV1D, 1, "weight")); + model.conv1d_1_b = get_tensor(string_format(TN_CONV1D, 1, "bias")); + model.conv1d_2_w = get_tensor(string_format(TN_CONV1D, 2, "weight")); + model.conv1d_2_b = get_tensor(string_format(TN_CONV1D, 2, "bias")); + model.mm_1_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 1, "weight")); + model.mm_2_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 2, "weight")); + } break; + case PROJECTOR_TYPE_MUSIC_FLAMINGO: + { + model.conv1d_1_w = get_tensor(string_format(TN_CONV1D, 1, "weight")); + model.conv1d_1_b = get_tensor(string_format(TN_CONV1D, 1, "bias")); + model.conv1d_2_w = get_tensor(string_format(TN_CONV1D, 2, "weight")); + model.conv1d_2_b = get_tensor(string_format(TN_CONV1D, 2, "bias")); + model.mm_1_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 1, "weight")); + model.mm_1_b = get_tensor(string_format(TN_MM_AUDIO_MLP, 1, "bias")); + 
model.mm_2_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 2, "weight")); + model.mm_2_b = get_tensor(string_format(TN_MM_AUDIO_MLP, 2, "bias")); + } break; + case PROJECTOR_TYPE_INTERNVL: + { + model.mm_0_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 0, "weight")); + model.mm_0_b = get_tensor(string_format(TN_MVLM_PROJ_MLP, 0, "bias")); + model.mm_1_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 1, "weight")); + model.mm_1_b = get_tensor(string_format(TN_MVLM_PROJ_MLP, 1, "bias")); + model.mm_3_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 3, "weight")); + model.mm_3_b = get_tensor(string_format(TN_MVLM_PROJ_MLP, 3, "bias")); + } break; + case PROJECTOR_TYPE_GLMA: + { + model.conv1d_1_w = get_tensor(string_format(TN_CONV1D, 1, "weight")); + model.conv1d_1_b = get_tensor(string_format(TN_CONV1D, 1, "bias")); + model.conv1d_2_w = get_tensor(string_format(TN_CONV1D, 2, "weight")); + model.conv1d_2_b = get_tensor(string_format(TN_CONV1D, 2, "bias")); + model.mm_1_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 1, "weight")); + model.mm_1_b = get_tensor(string_format(TN_MM_AUDIO_MLP, 1, "bias")); + model.mm_2_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 2, "weight")); + model.mm_2_b = get_tensor(string_format(TN_MM_AUDIO_MLP, 2, "bias")); + model.mm_norm_pre_w = get_tensor(string_format(TN_MM_NORM_PRE, "weight")); + model.mm_norm_pre_b = get_tensor(string_format(TN_MM_NORM_PRE, "bias")); + model.mm_boi = get_tensor(string_format(TN_TOK_BOI)); + model.mm_eoi = get_tensor(string_format(TN_TOK_EOI)); + } break; + case PROJECTOR_TYPE_LLAMA4: + { + model.mm_model_proj = get_tensor(TN_MM_PROJECTOR); + model.mm_model_mlp_1_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 1, "weight")); + model.mm_model_mlp_2_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 2, "weight")); + } break; + case PROJECTOR_TYPE_COGVLM: + { + model.mm_model_proj = get_tensor(TN_MM_PROJECTOR); + model.mm_post_fc_norm_w = get_tensor(string_format(TN_MM_POST_FC_NORM, "weight")); + model.mm_post_fc_norm_b = 
get_tensor(string_format(TN_MM_POST_FC_NORM, "bias")); + model.mm_h_to_4h_w = get_tensor(string_format(TN_MM_H_TO_4H, "weight")); + model.mm_gate_w = get_tensor(string_format(TN_MM_GATE, "weight")); + model.mm_4h_to_h_w = get_tensor(string_format(TN_MM_4H_TO_H, "weight")); + model.mm_boi = get_tensor(TN_TOK_BOI); + model.mm_eoi = get_tensor(TN_TOK_EOI); + } break; + case PROJECTOR_TYPE_JANUS_PRO: + { + model.mm_0_w = get_tensor(string_format(TN_LLAVA_PROJ, 0, "weight")); + model.mm_0_b = get_tensor(string_format(TN_LLAVA_PROJ, 0, "bias")); + model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 1, "weight")); + model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 1, "bias")); + } break; + case PROJECTOR_TYPE_LFM2A: + { + for (int i : {0, 2, 3, 5, 6}) { + model.pre_encode_conv_X_w[i] = get_tensor(string_format(TN_CONV1D, i, "weight")); + model.pre_encode_conv_X_b[i] = get_tensor(string_format(TN_CONV1D, i, "bias")); + } + model.pre_encode_out_w = get_tensor(string_format(TN_PRE_ENCODE_OUT, "weight")); + model.pre_encode_out_b = get_tensor(string_format(TN_PRE_ENCODE_OUT, "bias")); + + model.mm_0_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 0, "weight")); + model.mm_0_b = get_tensor(string_format(TN_MM_AUDIO_MLP, 0, "bias")); + model.mm_1_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 1, "weight")); + model.mm_1_b = get_tensor(string_format(TN_MM_AUDIO_MLP, 1, "bias")); + model.mm_3_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 3, "weight")); + model.mm_3_b = get_tensor(string_format(TN_MM_AUDIO_MLP, 3, "bias")); + + for (int il = 0; il < hparams.n_layer; ++il) { + auto & layer = model.layers[il]; + + layer.ff_norm_w = get_tensor(string_format(TN_FFN_NORM, prefix, il, "weight")); + layer.ff_norm_b = get_tensor(string_format(TN_FFN_NORM, prefix, il, "bias")); + layer.ff_norm_1_w = get_tensor(string_format(TN_FFN_NORM_1, prefix, il, "weight")); + layer.ff_norm_1_b = get_tensor(string_format(TN_FFN_NORM_1, prefix, il, "bias")); + layer.ff_up_1_w = 
get_tensor(string_format(TN_FFN_UP_1, prefix, il, "weight")); + layer.ff_up_1_b = get_tensor(string_format(TN_FFN_UP_1, prefix, il, "bias")); + layer.ff_down_1_w = get_tensor(string_format(TN_FFN_DOWN_1, prefix, il, "weight")); + layer.ff_down_1_b = get_tensor(string_format(TN_FFN_DOWN_1, prefix, il, "bias")); + + layer.pos_bias_u = get_tensor(string_format(TN_POS_BIAS_U, prefix, il)); + layer.pos_bias_v = get_tensor(string_format(TN_POS_BIAS_V, prefix, il)); + + layer.norm_conv_w = get_tensor(string_format(TN_NORM_CONV, prefix, il, "weight")); + layer.norm_conv_b = get_tensor(string_format(TN_NORM_CONV, prefix, il, "bias")); + + layer.linear_pos_w = get_tensor(string_format(TN_LINEAR_POS, prefix, il, "weight")); + + layer.conv_norm_w = get_tensor(string_format(TN_CONV_NORM, prefix, il, "weight")); + layer.conv_norm_b = get_tensor(string_format(TN_CONV_NORM, prefix, il, "bias")); + layer.conv_dw_w = get_tensor(string_format(TN_CONV_DW, prefix, il, "weight")); + layer.conv_dw_b = get_tensor(string_format(TN_CONV_DW, prefix, il, "bias")); + layer.conv_pw1_w = get_tensor(string_format(TN_CONV_PW1, prefix, il, "weight")); + layer.conv_pw1_b = get_tensor(string_format(TN_CONV_PW1, prefix, il, "bias")); + layer.conv_pw2_w = get_tensor(string_format(TN_CONV_PW2, prefix, il, "weight")); + layer.conv_pw2_b = get_tensor(string_format(TN_CONV_PW2, prefix, il, "bias")); + } + } break; + default: + GGML_ASSERT(false && "unknown projector type"); + } + + // load data + { + std::vector<uint8_t> read_buf; + + auto fin = std::ifstream(fname, std::ios::binary); + if (!fin) { + throw std::runtime_error(string_format("%s: failed to open %s\n", __func__, fname.c_str())); + } + + // alloc memory and offload data + ggml_backend_buffer_type_t buft = ggml_backend_get_default_buffer_type(ctx_clip.backend); + ctx_clip.buf.reset(ggml_backend_alloc_ctx_tensors_from_buft(ctx_clip.ctx_data.get(), buft)); + ggml_backend_buffer_set_usage(ctx_clip.buf.get(), GGML_BACKEND_BUFFER_USAGE_WEIGHTS); + 
for (auto & t : tensors_to_load) {
    // look up the tensor created in ctx_data, then stream its payload from the
    // file at the offset recorded during header parsing
    ggml_tensor * cur = ggml_get_tensor(ctx_clip.ctx_data.get(), t->name);
    const size_t offset = tensor_offset[t->name];
    fin.seekg(offset, std::ios::beg);
    if (!fin) {
        throw std::runtime_error(string_format("%s: failed to seek for tensor %s\n", __func__, t->name));
    }
    size_t num_bytes = ggml_nbytes(cur);
    if (ggml_backend_buft_is_host(buft)) {
        // for the CPU and Metal backend, we can read directly into the tensor
        fin.read(reinterpret_cast<char *>(cur->data), num_bytes);
    } else {
        // read into a temporary buffer first, then copy to device memory
        read_buf.resize(num_bytes);
        fin.read(reinterpret_cast<char *>(read_buf.data()), num_bytes);
        ggml_backend_tensor_set(cur, read_buf.data(), 0, num_bytes);
    }
}
fin.close();

LOG_DBG("%s: loaded %zu tensors from %s\n", __func__, tensors_to_load.size(), fname.c_str());
}
}

// support-check result for a single graph op
struct support_info_op {
    ggml_tensor * op;

    // true if the op runs on the accelerated ctx_clip.backend
    bool is_accel = true;
};

// aggregated op-support information for a whole compute graph
struct support_info_graph {
    // whether the clip_ctx.backend supports flash attention
    bool fattn = true;
    ggml_tensor * fattn_op = nullptr; // for debugging

    std::vector<support_info_op> ops;
};

// run a dummy pass to allocate compute buffers and probe backend op support;
// builds a single fake entry (image for vision, mel spectrogram for audio)
// and delegates to the two-argument overload below
static void warmup(clip_ctx & ctx_clip) {
    // create a fake batch
    const auto & hparams = ctx_clip.model.hparams;
    clip_image_f32_batch batch;
    clip_image_f32_ptr img(clip_image_f32_init());
    if (ctx_clip.model.modality == CLIP_MODALITY_VISION) {
        img->nx = hparams.warmup_image_size;
        img->ny = hparams.warmup_image_size;
        LOG_INF("%s: warmup with image size = %d x %d\n", __func__, img->nx, img->ny);
    } else {
        img->nx = hparams.warmup_audio_size;
        img->ny = hparams.n_mel_bins;
        LOG_INF("%s: warmup with audio size = %d\n", __func__, img->nx);
    }
    batch.entries.push_back(std::move(img));
    warmup(ctx_clip, batch);
}

// allocate compute buffers for `batch` and resolve the flash-attention mode:
// in AUTO mode, first try with flash attention enabled; if the backend rejects
// the op, warn, disable it, and re-allocate. Also reports any graph ops the
// accelerated backend cannot run. Marks ctx_clip as allocated on completion.
static void warmup(clip_ctx & ctx_clip, const clip_image_f32_batch & batch) {
    support_info_graph info;

    if (ctx_clip.flash_attn_type == CLIP_FLASH_ATTN_TYPE_AUTO) {
        // try to enable flash attention to see if it's supported
        ctx_clip.flash_attn_type = CLIP_FLASH_ATTN_TYPE_ENABLED;
        info = alloc_compute_meta(ctx_clip, batch);
        if (!info.fattn && info.fattn_op) {
            auto op = info.fattn_op;
            LOG_WRN("%s: *****************************************************************\n", __func__);
            LOG_WRN("%s: WARNING: flash attention not supported by %s, memory usage will increase\n", __func__, ggml_backend_name(ctx_clip.backend));
            LOG_WRN("%s: op params: \n", __func__);
            // NOTE(review): ne/nb of ggml tensors are wider than int on most builds;
            // "%d" here may truncate -- verify against ggml's printf format helpers
            static auto print_shape = [](const char * fn, const char * name, ggml_tensor * t) {
                LOG_WRN("%s: %s: type = %s, ne = [%d %d %d %d], nb = [%d %d %d %d]\n", fn,
                    name, ggml_type_name(t->type),
                    t->ne[0], t->ne[1], t->ne[2], t->ne[3],
                    t->nb[0], t->nb[1], t->nb[2], t->nb[3]);
            };
            print_shape(__func__, " dst", op);
            print_shape(__func__, "src0", op->src[0]);
            print_shape(__func__, "src1", op->src[1]);
            print_shape(__func__, "src2", op->src[2]);
            LOG_WRN("%s: please report this on github as an issue\n", __func__);
            LOG_WRN("%s: *****************************************************************\n", __func__);
            ctx_clip.flash_attn_type = CLIP_FLASH_ATTN_TYPE_DISABLED;
            // re-allocate with flash attention disabled
            alloc_compute_meta(ctx_clip, batch);
        }
    } else {
        info = alloc_compute_meta(ctx_clip, batch);
        if (!info.fattn && ctx_clip.flash_attn_type == CLIP_FLASH_ATTN_TYPE_ENABLED) {
            LOG_WRN("%s: flash attention is not supported by the current backend; falling back to CPU (performance will be degraded)\n", __func__);
        }
    }

    ctx_clip.is_allocated = true; // mark buffers as allocated

    LOG_INF("%s: flash attention is %s\n", __func__,
            (ctx_clip.flash_attn_type == CLIP_FLASH_ATTN_TYPE_ENABLED) ? "enabled" : "disabled");

    // print ops that are not supported by the GPU backend (if there is one)
    if (ctx_clip.backend && ctx_clip.backend != ctx_clip.backend_cpu) {
        std::vector<support_info_op> unsupported_ops;
        for (const auto & op : info.ops) {
            if (!op.is_accel) {
                unsupported_ops.push_back(op);
            }
        }
        if (!unsupported_ops.empty()) {
            LOG_WRN("%s: *****************************************************************\n", __func__);
            LOG_WRN("%s: WARNING: the CLIP graph uses unsupported operators by the backend\n", __func__);
            LOG_WRN("%s: the performance will be suboptimal \n", __func__);
            LOG_WRN("%s: list of unsupported ops (backend=%s):\n", __func__, ggml_backend_name(ctx_clip.backend));
            for (const auto & op : unsupported_ops) {
                LOG_WRN("%s: %16s: type = %s, ne = [%d %d %d %d]\n", __func__,
                    ggml_op_name(op.op->op),
                    ggml_type_name(op.op->type),
                    op.op->ne[0], op.op->ne[1], op.op->ne[2], op.op->ne[3]);
            }
            LOG_WRN("%s: flash attention is %s\n", __func__,
                    (ctx_clip.flash_attn_type == CLIP_FLASH_ATTN_TYPE_ENABLED) ? "enabled" : "disabled");
            LOG_WRN("%s: please report this on github as an issue\n", __func__);
            LOG_WRN("%s: ref: https://github.com/ggml-org/llama.cpp/pull/16837#issuecomment-3461676118\n", __func__);
            LOG_WRN("%s: *****************************************************************\n", __func__);
        }
    }
}

// build the compute graph for `batch`, reserve scheduler buffers, log per-backend
// buffer sizes and graph stats, and return per-op backend-support information
static support_info_graph alloc_compute_meta(clip_ctx & ctx_clip, const clip_image_f32_batch & batch) {
    ctx_clip.buf_compute_meta.resize(ctx_clip.max_nodes * ggml_tensor_overhead() + ggml_graph_overhead());

    ggml_cgraph * gf = clip_image_build_graph(&ctx_clip, batch);
    ggml_backend_sched_reserve(ctx_clip.sched.get(), gf);

    for (size_t i = 0; i < ctx_clip.backend_ptrs.size(); ++i) {
        ggml_backend_t backend = ctx_clip.backend_ptrs[i];
        ggml_backend_buffer_type_t buft = ctx_clip.backend_buft[i];
        size_t size = ggml_backend_sched_get_buffer_size(ctx_clip.sched.get(), backend);
        if (size > 1) {
            LOG_INF("%s: %10s compute buffer size = %8.2f MiB\n", __func__,
                    ggml_backend_buft_name(buft),
                    size / 1024.0 / 1024.0);
        }
    }

    const int n_splits = ggml_backend_sched_get_n_splits(ctx_clip.sched.get());
    const int n_nodes = ggml_graph_n_nodes(gf);

    LOG_INF("%s: graph splits = %d, nodes = %d\n", __func__, n_splits, n_nodes);

    support_info_graph res {
        /*.fattn = */ true,
        /*.fattn_op = */ nullptr,
        /*.ops = */ {},
    };

    // check op support
    for (int i = 0; i < ggml_graph_n_nodes(gf); i++) {
        ggml_tensor * node = ggml_graph_node(gf, i);
        res.ops.push_back({node, true});
        if (!ggml_backend_supports_op(ctx_clip.backend, node)) {
            res.ops.back().is_accel = false;
            if (node->op == GGML_OP_FLASH_ATTN_EXT) {
                res.fattn = false;
                res.fattn_op = node;
            }
        }
    }

    return res;
}

// read a boolean KV from the GGUF metadata into `output`;
// throws when `required` and the key is missing, otherwise leaves `output` untouched
void get_bool(const std::string & key, bool & output, bool required = true) const {
    const int i = gguf_find_key(ctx_gguf.get(), key.c_str());
    if (i < 0) {
        if (required) {
            throw std::runtime_error("Key not found: " + key);
        }
        return;
    }
    output = gguf_get_val_bool(ctx_gguf.get(), i);
}

// read a signed 32-bit KV; same missing-key semantics as get_bool
void get_i32(const std::string & key, int & output, bool required = true) const {
    const int i = gguf_find_key(ctx_gguf.get(), key.c_str());
    if (i < 0) {
        if (required) {
            throw std::runtime_error("Key not found: " + key);
        }
        return;
    }
    output = gguf_get_val_i32(ctx_gguf.get(), i);
}

// read an unsigned 32-bit KV; same missing-key semantics as get_bool
// NOTE(review): value is stored into a signed int -- values above INT_MAX would
// wrap; presumably all u32 keys used here stay small, verify against callers
void get_u32(const std::string & key, int & output, bool required = true) const {
    const int i = gguf_find_key(ctx_gguf.get(), key.c_str());
    if (i < 0) {
        if (required) {
            throw std::runtime_error("Key not found: " + key);
        }
        return;
    }
    output = gguf_get_val_u32(ctx_gguf.get(), i);
}

// read a float32 KV; same missing-key semantics as get_bool
void get_f32(const std::string & key, float & output, bool required = true) const {
    const int i = gguf_find_key(ctx_gguf.get(), key.c_str());
    if (i < 0) {
        if (required) {
            throw std::runtime_error("Key not found: " + key);
        }
        return;
    }
    output = gguf_get_val_f32(ctx_gguf.get(), i);
}

// read a string KV; same missing-key semantics as get_bool
void get_string(const std::string & key, std::string & output, bool required = true) const {
    const int i = gguf_find_key(ctx_gguf.get(), key.c_str());
    if (i < 0) {
        if (required) {
            throw std::runtime_error("Key not found: " + key);
        }
        return;
    }
    output = std::string(gguf_get_val_str(ctx_gguf.get(), i));
}

// read an int32 array KV into `output` (resized to the array length);
// same missing-key semantics as get_bool
void get_arr_int(const std::string & key, std::vector<int> & output, bool required = true) const {
    const int i = gguf_find_key(ctx_gguf.get(), key.c_str());
    if (i < 0) {
        if (required) {
            throw std::runtime_error("Key not found: " + key);
        }
        return;
    }
    int n = gguf_get_arr_n(ctx_gguf.get(), i);
    output.resize(n);
    const int32_t * values = (const int32_t *)gguf_get_arr_data(ctx_gguf.get(), i);
    // (this loop's `i` shadows the key index above)
    for (int i = 0; i < n; ++i) {
        output[i] = values[i];
    }
}

// enumerate all (x, y) slicing-grid candidates for LLaVA-UHD style models,
// each candidate being (x*image_size, y*image_size); the 1x1 grid is skipped
static void set_llava_uhd_res_candidates(clip_model & model, const int max_patches_per_side) {
    auto & hparams = model.hparams;
    for (int x = 1; x <= max_patches_per_side; x++) {
        for (int y = 1; y <= max_patches_per_side; y++)
{
            if (x == 1 && y == 1) {
                continue; // skip the first point
            }
            hparams.image_res_candidates.push_back(clip_image_size{
                x*hparams.image_size,
                y*hparams.image_size,
            });
        }
    }
}
};

// load a CLIP/mtmd GGUF file and create up to two contexts (vision, audio);
// on any failure both contexts are freed and {nullptr, nullptr} is returned
struct clip_init_result clip_init(const char * fname, struct clip_context_params ctx_params) {
    clip_ctx * ctx_vision = nullptr;
    clip_ctx * ctx_audio = nullptr;

    try {
        clip_model_loader loader(fname);
        bool skip_audio = false;

        if (loader.has_vision) {
            ctx_vision = new clip_ctx(ctx_params);
            loader.load_hparams(ctx_vision->model, CLIP_MODALITY_VISION);
            loader.load_tensors(*ctx_vision);
            if (ctx_params.warmup) {
                loader.warmup(*ctx_vision);
            }

            // TODO: we don't support audio for Gemma 3N, but GGUF contains audio tensors
            // we can remove this check when we implement audio support for Gemma 3N
            skip_audio = ctx_vision->model.proj_type == PROJECTOR_TYPE_GEMMA3NV;

            // clip_debug_encode(ctx_vision, 24*14, 24*14, 0.5f);
        }

        if (loader.has_audio && !skip_audio) {
            ctx_audio = new clip_ctx(ctx_params);
            loader.load_hparams(ctx_audio->model, CLIP_MODALITY_AUDIO);
            loader.load_tensors(*ctx_audio);
            if (ctx_params.warmup) {
                loader.warmup(*ctx_audio);
            }
        }

    } catch (const std::exception & e) {
        LOG_ERR("%s: failed to load model '%s': %s\n", __func__, fname, e.what());

        // release whichever contexts were already created before the failure
        delete ctx_vision;
        delete ctx_audio;

        return {nullptr, nullptr};
    }

    return {ctx_vision, ctx_audio};
}

// allocate a clip_image_size with a 448x448 default; caller frees with clip_image_size_free
struct clip_image_size * clip_image_size_init() {
    struct clip_image_size * load_image_size = new struct clip_image_size();
    load_image_size->width = 448;
    load_image_size->height = 448;
    return load_image_size;
}

struct clip_image_u8 * clip_image_u8_init() {
    return new clip_image_u8();
}

struct clip_image_f32 * clip_image_f32_init() {
    return new clip_image_f32();
}

struct clip_image_f32_batch * clip_image_f32_batch_init() {
    return new clip_image_f32_batch();
}

// return a pointer to the raw pixel buffer of `img`; when non-null, *nx/*ny
// receive the image dimensions. The pointer is owned by `img`.
unsigned char * clip_image_u8_get_data(struct clip_image_u8 * img, uint32_t * nx, uint32_t * ny) {
    if (nx) *nx = img->nx;
    if (ny) *ny = img->ny;
    return img->buf.data();
}

void clip_image_size_free(struct clip_image_size * load_image_size) {
    if (load_image_size == nullptr) {
        return;
    }
    delete load_image_size;
}
void clip_image_u8_free(struct clip_image_u8 * img) { delete img; }
void clip_image_f32_free(struct clip_image_f32 * img) { delete img; }
void clip_image_u8_batch_free(struct clip_image_u8_batch * batch) { delete batch; }
void clip_image_f32_batch_free(struct clip_image_f32_batch * batch) { delete batch; }

size_t clip_image_f32_batch_n_images(const struct clip_image_f32_batch * batch) {
    return batch->entries.size();
}

// width of batch entry `idx`; returns 0 (with an error log) when idx is out of range
size_t clip_image_f32_batch_nx(const struct clip_image_f32_batch * batch, int idx) {
    if (idx < 0 || idx >= (int)batch->entries.size()) {
        LOG_ERR("%s: invalid index %d\n", __func__, idx);
        return 0;
    }
    return batch->entries[idx]->nx;
}

// height of batch entry `idx`; returns 0 (with an error log) when idx is out of range
size_t clip_image_f32_batch_ny(const struct clip_image_f32_batch * batch, int idx) {
    if (idx < 0 || idx >= (int)batch->entries.size()) {
        LOG_ERR("%s: invalid index %d\n", __func__, idx);
        return 0;
    }
    return batch->entries[idx]->ny;
}

// borrow entry `idx` from the batch (ownership stays with the batch);
// returns nullptr (with an error log) when idx is out of range
clip_image_f32 * clip_image_f32_get_img(const struct clip_image_f32_batch * batch, int idx) {
    if (idx < 0 || idx >= (int)batch->entries.size()) {
        LOG_ERR("%s: invalid index %d\n", __func__, idx);
        return nullptr;
    }
    return batch->entries[idx].get();
}

// copy an interleaved RGB (3 bytes/pixel) buffer of size nx*ny into `img`
void clip_build_img_from_pixels(const unsigned char * rgb_pixels, int nx, int ny, clip_image_u8 * img) {
    img->nx = nx;
    img->ny = ny;
    img->buf.resize(3 * nx * ny);
    memcpy(img->buf.data(), rgb_pixels, img->buf.size());
}

// Normalize image to float32 - careful with pytorch .to(model.device, dtype=torch.float16) - this sometimes reduces precision (32>16>32), sometimes not
static void normalize_image_u8_to_f32(const clip_image_u8 & src, clip_image_f32 & dst, const float mean[3], const float std[3]) {
    dst.nx = src.nx;
    dst.ny
= src.ny;
    dst.buf.resize(src.buf.size());

    // per-channel normalization: (pixel/255 - mean[c]) / std[c], channel = index mod 3
    // TODO @ngxson : seems like this could be done more efficiently on cgraph
    for (size_t i = 0; i < src.buf.size(); ++i) {
        int c = i % 3; // rgb
        dst.buf[i] = (static_cast<float>(src.buf[i]) / 255.0f - mean[c]) / std[c];
    }
}

// set of tools to manipulate images
// in the future, we can have HW acceleration by allowing this struct to access 3rd party lib like imagick or opencv
struct img_tool {
    enum resize_algo {
        RESIZE_ALGO_BILINEAR,
        RESIZE_ALGO_BICUBIC,
        // RESIZE_ALGO_LANCZOS, // TODO
    };

    // resize `src` into `dst` at target_resolution.
    // with add_padding=true the aspect ratio is preserved: the scaled image is
    // centered on a pad_color canvas; with add_padding=false the image is
    // stretched to the target directly. A same-size input is copied verbatim.
    static void resize(
            const clip_image_u8 & src,
            clip_image_u8 & dst,
            const clip_image_size & target_resolution,
            resize_algo algo,
            bool add_padding = true, // TODO: define the behavior for add_padding = false
            std::array<uint8_t, 3> pad_color = {0, 0, 0}) {
        dst.nx = target_resolution.width;
        dst.ny = target_resolution.height;
        dst.buf.resize(3 * dst.nx * dst.ny);

        if (dst.nx == src.nx && dst.ny == src.ny) {
            // no resize needed, simple copy
            dst.buf = src.buf;
            return;
        }

        if (!add_padding) {
            // direct resize
            switch (algo) {
                case RESIZE_ALGO_BILINEAR:
                    resize_bilinear(src, dst, target_resolution.width, target_resolution.height);
                    break;
                case RESIZE_ALGO_BICUBIC:
                    resize_bicubic(src, dst, target_resolution.width, target_resolution.height);
                    break;
                default:
                    throw std::runtime_error("Unsupported resize algorithm");
            }
        } else {
            // resize with padding
            clip_image_u8 resized_image;
            float scale_w = static_cast<float>(target_resolution.width) / src.nx;
            float scale_h = static_cast<float>(target_resolution.height) / src.ny;
            float scale = std::min(scale_w, scale_h);
            int new_width = std::min(static_cast<int>(std::ceil(src.nx * scale)), target_resolution.width);
            int new_height = std::min(static_cast<int>(std::ceil(src.ny * scale)), target_resolution.height);

            switch (algo) {
                case RESIZE_ALGO_BILINEAR:
                    resize_bilinear(src, resized_image, new_width, new_height);
                    break;
                case RESIZE_ALGO_BICUBIC:
                    resize_bicubic(src, resized_image, new_width, new_height);
                    break;
                default:
                    throw std::runtime_error("Unsupported resize algorithm");
            }

            // fill dst with pad_color
            fill(dst, pad_color);

            int offset_x = (target_resolution.width - new_width) / 2;
            int offset_y = (target_resolution.height - new_height) / 2;

            composite(dst, resized_image, offset_x, offset_y);
        }
    }

    // copy the w x h region of `image` at top-left (x, y) into `dst`
    // NOTE(review): no bounds check -- callers must ensure x+w <= image.nx and y+h <= image.ny
    static void crop(const clip_image_u8 & image, clip_image_u8 & dst, int x, int y, int w, int h) {
        dst.nx = w;
        dst.ny = h;
        dst.buf.resize(3 * w * h);

        for (int i = 0; i < h; ++i) {
            for (int j = 0; j < w; ++j) {
                int src_idx = 3 * ((y + i)*image.nx + (x + j));
                int dst_idx = 3 * (i*w + j);
                dst.buf[dst_idx] = image.buf[src_idx];
                dst.buf[dst_idx + 1] = image.buf[src_idx + 1];
                dst.buf[dst_idx + 2] = image.buf[src_idx + 2];
            }
        }
    }

    // calculate the size of the **resized** image, while preserving the aspect ratio
    // the calculated size will be aligned to the nearest multiple of align_size
    // if H or W size is larger than longest_edge, it will be resized to longest_edge
    static clip_image_size calc_size_preserved_ratio(const clip_image_size & inp_size, const int align_size, const int longest_edge) {
        GGML_ASSERT(align_size > 0);
        // degenerate input -> zero size (callers must handle {0, 0})
        if (inp_size.width <= 0 || inp_size.height <= 0 || longest_edge <= 0) {
            return {0, 0};
        }

        float scale = std::min(static_cast<float>(longest_edge) / inp_size.width,
                               static_cast<float>(longest_edge) / inp_size.height);

        float target_width_f = static_cast<float>(inp_size.width) * scale;
        float target_height_f = static_cast<float>(inp_size.height) * scale;

        auto ceil_by_factor = [f = align_size](float x) { return static_cast<int>(std::ceil(x / static_cast<float>(f))) * f; };
        int aligned_width = ceil_by_factor(target_width_f);
        int aligned_height = ceil_by_factor(target_height_f);

        return {aligned_width, aligned_height};
    }

    // calculate the size of the **resized** image, while preserving the aspect ratio
    // the calculated size will have min_pixels <= W*H <= max_pixels
    // this is referred as "smart_resize" in transformers code
    static clip_image_size calc_size_preserved_ratio(const clip_image_size & inp_size, const int align_size, const int min_pixels, const int max_pixels) {
        GGML_ASSERT(align_size > 0);
        const int width = inp_size.width;
        const int height = inp_size.height;

        auto round_by_factor = [f = align_size](float x) { return static_cast<int>(std::round(x / static_cast<float>(f))) * f; };
        auto ceil_by_factor = [f = align_size](float x) { return static_cast<int>(std::ceil(x / static_cast<float>(f))) * f; };
        auto floor_by_factor = [f = align_size](float x) { return static_cast<int>(std::floor(x / static_cast<float>(f))) * f; };

        // always align up first
        int h_bar = std::max(align_size, round_by_factor(height));
        int w_bar = std::max(align_size, round_by_factor(width));

        if (h_bar * w_bar > max_pixels) {
            // too many pixels: shrink both sides by sqrt(area ratio), rounding down to the alignment
            const auto beta = std::sqrt(static_cast<float>(height * width) / max_pixels);
            h_bar = std::max(align_size, floor_by_factor(height / beta));
            w_bar = std::max(align_size, floor_by_factor(width / beta));
        } else if (h_bar * w_bar < min_pixels) {
            // too few pixels: grow both sides by sqrt(area ratio), rounding up to the alignment
            const auto beta = std::sqrt(static_cast<float>(min_pixels) / (height * width));
            h_bar = ceil_by_factor(height * beta);
            w_bar = ceil_by_factor(width * beta);
        }

        return {w_bar, h_bar};
    }

    // draw src image into dst image at offset (offset_x, offset_y)
    static void composite(clip_image_u8 & dst, const clip_image_u8 & src, int offset_x, int offset_y) {
        for (int y = 0; y < src.ny; ++y) {
            for (int x = 0; x < src.nx; ++x) {
                int dx = x + offset_x;
                int dy = y + offset_y;
                // skip pixels that would be out of bounds in the destination
                if (dx < 0 || dy < 0 || dx >= dst.nx || dy >= dst.ny) {
                    continue;
                }
                size_t dst_idx = 3 * (static_cast<size_t>(dy) * dst.nx + static_cast<size_t>(dx));
                size_t src_idx = 3 * (static_cast<size_t>(y) * src.nx + static_cast<size_t>(x));
                dst.buf[dst_idx + 0] = src.buf[src_idx + 0];
                dst.buf[dst_idx + 1] = src.buf[src_idx + 1];
                dst.buf[dst_idx + 2] = src.buf[src_idx + 2];
            }
        }
    }

    // fill the image with a solid color
    static void fill(clip_image_u8 & img, const std::array<uint8_t, 3> & color) {
        for (size_t i = 0; i < img.buf.size(); i += 3) {
            img.buf[i] = color[0];
            img.buf[i + 1] = color[1];
            img.buf[i + 2] = color[2];
        }
    }

private:
    // Bilinear resize function
    // NOTE(review): for 1-pixel-wide or 1-pixel-tall sources the x_floor+1 /
    // y_floor+1 samples can read past the intended pixel even though the lerp
    // weight is 0 -- verify callers never pass such images
    static void resize_bilinear(const clip_image_u8 & src, clip_image_u8 & dst, int target_width, int target_height) {
        dst.nx = target_width;
        dst.ny = target_height;
        dst.buf.resize(3 * target_width * target_height);

        // ratios use (n - 1) so the sampled neighbor x_floor+1 stays in range
        // for sources wider/taller than one pixel
        float x_ratio = static_cast<float>(src.nx - 1) / target_width;
        float y_ratio = static_cast<float>(src.ny - 1) / target_height;

        for (int y = 0; y < target_height; y++) {
            for (int x = 0; x < target_width; x++) {
                float px = x_ratio * x;
                float py = y_ratio * y;
                int x_floor = static_cast<int>(px);
                int y_floor = static_cast<int>(py);
                float x_lerp = px - x_floor;
                float y_lerp = py - y_floor;

                for (int c = 0; c < 3; c++) {
                    float top = lerp(
                        static_cast<float>(src.buf[3 * (y_floor * src.nx + x_floor) + c]),
                        static_cast<float>(src.buf[3 * (y_floor * src.nx + (x_floor + 1)) + c]),
                        x_lerp
                    );
                    float bottom = lerp(
                        static_cast<float>(src.buf[3 * ((y_floor + 1) * src.nx + x_floor) + c]),
                        static_cast<float>(src.buf[3 * ((y_floor + 1) * src.nx + (x_floor + 1)) + c]),
                        x_lerp
                    );
                    dst.buf[3 * (y * target_width + x) + c] = static_cast<uint8_t>(lerp(top, bottom, y_lerp));
                }
            }
        }
    }

    // Bicubic resize function
    // part of image will be cropped if the aspect ratio is different
    static bool resize_bicubic(const clip_image_u8 & img, clip_image_u8 & dst, int target_width, int target_height) {
        const int nx = img.nx;
        const int ny = img.ny;

        dst.nx = target_width;
        dst.ny = target_height;
        dst.buf.resize(3 * target_width *
target_height); + + float Cc; + float C[5] = {}; + float d0, d2, d3, a0, a1, a2, a3; + int i, j, k, jj; + int x, y; + float dx, dy; + float tx, ty; + + tx = (float)nx / (float)target_width; + ty = (float)ny / (float)target_height; + + // Bicubic interpolation; adapted from ViT.cpp, inspired from : + // -> https://github.com/yglukhov/bicubic-interpolation-image-processing/blob/master/libimage.c#L36 + // -> https://en.wikipedia.org/wiki/Bicubic_interpolation + + for (i = 0; i < target_height; i++) { + for (j = 0; j < target_width; j++) { + x = (int)(tx * j); + y = (int)(ty * i); + + dx = tx * j - x; + dy = ty * i - y; + + for (k = 0; k < 3; k++) { + for (jj = 0; jj <= 3; jj++) { + d0 = img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x - 1, 0, nx - 1)) * 3 + k] - img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x, 0, nx - 1)) * 3 + k]; + d2 = img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x + 1, 0, nx - 1)) * 3 + k] - img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x, 0, nx - 1)) * 3 + k]; + d3 = img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x + 2, 0, nx - 1)) * 3 + k] - img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x, 0, nx - 1)) * 3 + k]; + a0 = img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x, 0, nx - 1)) * 3 + k]; + + a1 = -1.0 / 3 * d0 + d2 - 1.0 / 6 * d3; + a2 = 1.0 / 2 * d0 + 1.0 / 2 * d2; + a3 = -1.0 / 6 * d0 - 1.0 / 2 * d2 + 1.0 / 6 * d3; + + C[jj] = a0 + a1 * dx + a2 * dx * dx + a3 * dx * dx * dx; + + d0 = C[0] - C[1]; + d2 = C[2] - C[1]; + d3 = C[3] - C[1]; + a0 = C[1]; + a1 = -1.0 / 3 * d0 + d2 - 1.0 / 6 * d3; + a2 = 1.0 / 2 * d0 + 1.0 / 2 * d2; + a3 = -1.0 / 6 * d0 - 1.0 / 2 * d2 + 1.0 / 6 * d3; + Cc = a0 + a1 * dy + a2 * dy * dy + a3 * dy * dy * dy; + + const uint8_t Cc2 = std::min(std::max(std::round(Cc), 0.0f), 255.0f); + dst.buf[(i * target_width + j) * 3 + k] = float(Cc2); + } + } + } + } + + return true; + } + + static inline int clip(int x, int lower, int upper) { + return std::max(lower, std::min(x, upper)); + } + + // Linear 
// interpolation between two points
    static inline float lerp(float s, float e, float t) {
        return s + (e - s) * t;
    }
};

/**
 * implementation of LLaVA-UHD:
 *  - https://arxiv.org/pdf/2403.11703
 *  - https://github.com/thunlp/LLaVA-UHD
 *  - https://github.com/thunlp/LLaVA-UHD/blob/302301bc2175f7e717fb8548516188e89f649753/llava_uhd/train/llava-uhd/slice_logic.py#L118
 *
 * overview:
 *   - an image always have a single overview (downscaled image)
 *   - an image can have 0 or multiple slices, depending on the image size
 *   - each slice can then be considered as a separate image
 *
 * for example:
 *
 * [overview] --> [slice 1] --> [slice 2]
 *                    |             |
 *                    +--> [slice 3] --> [slice 4]
 */
struct llava_uhd {
    // position and size of one slice inside the refined image
    struct slice_coordinates {
        int x;
        int y;
        clip_image_size size;
    };

    struct slice_instructions {
        clip_image_size overview_size; // size of downscaled image
        clip_image_size refined_size;  // size of image right before slicing (must be multiple of slice size)
        clip_image_size grid_size;     // grid_size.width * grid_size.height = number of slices
        std::vector<slice_coordinates> slices;

        img_tool::resize_algo interpolation_overview = img_tool::RESIZE_ALGO_BILINEAR;
        bool padding_overview = false; // if true, refine image will be padded to the grid size (e.g. llava-1.6)
        std::array<uint8_t, 3> pad_color_overview = {0, 0, 0};

        img_tool::resize_algo interpolation_refined = img_tool::RESIZE_ALGO_BICUBIC;
        bool padding_refined = false; // if true, refine image will be padded to the grid size (e.g. llava-1.6)
        std::array<uint8_t, 3> pad_color_refined = {0, 0, 0};
    };

    // decide overview/refined sizes and the slice grid for the given input size
    static slice_instructions get_slice_instructions(struct clip_ctx * ctx, const clip_image_size & original_size) {
        slice_instructions res;
        const int patch_size = clip_get_patch_size(ctx);
        const int slice_size = clip_get_image_size(ctx);
        const int original_width = original_size.width;
        const int original_height = original_size.height;

        const bool has_slices = original_size.width > slice_size || original_size.height > slice_size;
        const bool has_pinpoints = !ctx->model.hparams.image_res_candidates.empty();

        if (!has_slices) {
            // skip slicing logic
            res.overview_size = clip_image_size{slice_size, slice_size};
            res.refined_size = clip_image_size{0, 0};
            res.grid_size = clip_image_size{0, 0};

            return res;
        }

        if (has_pinpoints) {
            // has pinpoints, use them to calculate the grid size (e.g. llava-1.6)
            auto refine_size = llava_uhd::select_best_resolution(
                original_size,
                ctx->model.hparams.image_res_candidates);
            res.overview_size = clip_image_size{slice_size, slice_size};
            res.refined_size = refine_size;
            res.grid_size = clip_image_size{0, 0};
            res.padding_refined = true;
            res.interpolation_refined = img_tool::RESIZE_ALGO_BILINEAR; // preserve old behavior when padding

            LOG_DBG("%s: using pinpoints for slicing\n", __func__);
            LOG_DBG("%s: original size: %d x %d, overview size: %d x %d, refined size: %d x %d\n",
                    __func__, original_width, original_height,
                    res.overview_size.width, res.overview_size.height,
                    res.refined_size.width, res.refined_size.height);

            for (int y = 0; y < refine_size.height; y += slice_size) {
                for (int x = 0; x < refine_size.width; x += slice_size) {
                    slice_coordinates slice;
                    slice.x = x;
                    slice.y = y;
                    // the last column/row may be smaller than slice_size
                    slice.size.width = std::min(slice_size, refine_size.width - x);
                    slice.size.height = std::min(slice_size, refine_size.height - y);
                    res.slices.push_back(slice);
                    LOG_DBG("%s: slice %d: x=%d, y=%d, size=%dx%d\n",
                            __func__, (int)res.slices.size() - 1,
                            slice.x, slice.y, slice.size.width, slice.size.height);
                }
            }

            res.grid_size.height = refine_size.height / slice_size;
            res.grid_size.width = refine_size.width / slice_size;
            LOG_DBG("%s: grid size: %d x %d\n", __func__, res.grid_size.width, res.grid_size.height);

            return res;
        }

        // no pinpoints, dynamically calculate the grid size (e.g. minicpmv)

        auto best_size = get_best_resize(original_size, slice_size, patch_size, !has_slices);
        res.overview_size = best_size;

        {
            const int max_slice_nums = 9; // TODO: this is only used by minicpmv, maybe remove it
            const float log_ratio = log((float)original_width / original_height);
            const float ratio = (float)original_width * original_height / (slice_size * slice_size);
            const int multiple = fmin(ceil(ratio), max_slice_nums);

            auto best_grid = get_best_grid(max_slice_nums, multiple, log_ratio);
            auto refine_size = get_refine_size(original_size, best_grid, slice_size, patch_size, true);
            res.grid_size = best_grid;
            res.refined_size = refine_size;

            LOG_DBG("%s: original size: %d x %d, overview size: %d x %d, refined size: %d x %d, grid size: %d x %d\n",
                    __func__, original_width, original_height,
                    res.overview_size.width, res.overview_size.height,
                    res.refined_size.width, res.refined_size.height,
                    res.grid_size.width, res.grid_size.height);

            int width = refine_size.width;
            int height = refine_size.height;
            int grid_x = int(width / best_grid.width);
            int grid_y = int(height / best_grid.height);
            for (int patches_y = 0, ic = 0;
                 patches_y < refine_size.height && ic < best_grid.height;
                 patches_y += grid_y, ic += 1) {
                for (int patches_x = 0, jc = 0;
                     patches_x < refine_size.width && jc < best_grid.width;
                     patches_x += grid_x, jc += 1) {
                    slice_coordinates slice;
                    slice.x = patches_x;
                    slice.y = patches_y;
                    slice.size.width = grid_x;
                    slice.size.height = grid_y;
                    res.slices.push_back(slice);
                    LOG_DBG("%s: slice %d: x=%d, y=%d, size=%dx%d\n",
                            __func__, (int)res.slices.size() - 1,
                            slice.x, slice.y, slice.size.width, slice.size.height);
                }
            }
        }

        return res;
    }

    // produce [overview, slice 1, slice 2, ...] images according to inst
    static std::vector<clip_image_u8_ptr> slice_image(const clip_image_u8 * img, const slice_instructions & inst) {
        std::vector<clip_image_u8_ptr> output;

        // resize to overview size
        clip_image_u8_ptr resized_img(clip_image_u8_init());
        img_tool::resize(*img, *resized_img, inst.overview_size, inst.interpolation_overview,
                         inst.padding_overview, inst.pad_color_overview);
        output.push_back(std::move(resized_img));

        if (inst.slices.empty()) {
            // no slices, just return the resized image
            return output;
        }

        // resize to refined size
        clip_image_u8_ptr refined_img(clip_image_u8_init());
        img_tool::resize(*img, *refined_img, inst.refined_size, inst.interpolation_refined,
                         inst.padding_refined, inst.pad_color_refined);

        // create slices
        for (const auto & slice : inst.slices) {
            int x = slice.x;
            int y = slice.y;
            int w = slice.size.width;
            int h = slice.size.height;

            clip_image_u8_ptr img_slice(clip_image_u8_init());
            img_tool::crop(*refined_img, *img_slice, x, y, w, h);
            output.push_back(std::move(img_slice));
        }

        return output;
    }

private:
    // scale down to roughly scale_resolution^2 pixels (or always when allow_upscale),
    // then snap both sides to multiples of patch_size
    static clip_image_size get_best_resize(const clip_image_size & original_size, int scale_resolution, int patch_size, bool allow_upscale = false) {
        int width = original_size.width;
        int height = original_size.height;
        if ((width * height > scale_resolution * scale_resolution) || allow_upscale) {
            float r = static_cast<float>(width) / height;
            height = static_cast<int>(scale_resolution / std::sqrt(r));
            width = static_cast<int>(height * r);
        }
        clip_image_size res;
        res.width = ensure_divide(width, patch_size);
        res.height = ensure_divide(height, patch_size);
        return res;
    }

    // shrink orig to fit inside target_max while keeping its aspect ratio
    static clip_image_size resize_maintain_aspect_ratio(const clip_image_size & orig, const clip_image_size & target_max) {
        float scale_width = static_cast<float>(target_max.width) / orig.width;
        float scale_height = static_cast<float>(target_max.height) / orig.height;
        float scale = std::min(scale_width, scale_height);
        return clip_image_size{
            static_cast<int>(orig.width * scale),
            static_cast<int>(orig.height * scale),
        };
    }

    /**
     * Selects the best resolution from a list of possible resolutions based on the original size.
     *
     * For example, when given a list of resolutions:
     *  - 100x100
     *  - 200x100
     *  - 100x200
     *  - 200x200
     *
     * And an input image of size 111x200, then 100x200 is the best fit (least wasted resolution).
     *
     * @param original_size The original size of the image
     * @param possible_resolutions A list of possible resolutions
     * @return The best fit resolution
     */
    static clip_image_size select_best_resolution(const clip_image_size & original_size, const std::vector<clip_image_size> & possible_resolutions) {
        clip_image_size best_fit;
        int min_wasted_area = std::numeric_limits<int>::max();
        int max_effective_resolution = 0;

        for (const clip_image_size & candidate : possible_resolutions) {
            auto target_size = resize_maintain_aspect_ratio(original_size, candidate);
            int effective_resolution = std::min(
                target_size.width * target_size.height,
                original_size.width * original_size.height);
            int wasted_area = (candidate.width * candidate.height) - effective_resolution;

            // maximize effective resolution; tie-break on least wasted area
            if (effective_resolution > max_effective_resolution || (effective_resolution == max_effective_resolution && wasted_area < min_wasted_area)) {
                max_effective_resolution = effective_resolution;
                min_wasted_area = wasted_area;
                best_fit = candidate;
            }

            LOG_DBG("%s: candidate: %d x %d, target: %d x %d, wasted: %d, effective: %d\n", __func__, candidate.width, candidate.height, target_size.width, target_size.height, wasted_area, effective_resolution);
        }

        return best_fit;
    }

    // round length to the nearest multiple of patch_size, but never below patch_size
    static int ensure_divide(int length, int patch_size) {
        return std::max(static_cast<int>(std::round(static_cast<float>(length) / patch_size) * patch_size), patch_size);
    }

    // compute the refined size: per-cell size is computed like get_best_resize,
    // then multiplied back up by the grid dimensions
    static clip_image_size get_refine_size(const clip_image_size & original_size, const clip_image_size & grid, int scale_resolution, int patch_size, bool allow_upscale = false) {
        int width = original_size.width;
        int height = original_size.height;
        int grid_x = grid.width;
        int grid_y = grid.height;

        int refine_width = ensure_divide(width, grid_x);
        int refine_height = ensure_divide(height, grid_y);

        clip_image_size grid_size;
        grid_size.width = refine_width / grid_x;
        grid_size.height = refine_height / grid_y;

        auto best_grid_size = get_best_resize(grid_size, scale_resolution, patch_size, allow_upscale);
        int best_grid_width = best_grid_size.width;
        int best_grid_height = best_grid_size.height;

        clip_image_size refine_size;
        refine_size.width = best_grid_width * grid_x;
        refine_size.height = best_grid_height * grid_y;
        return refine_size;
    }

    // pick the W x H grid whose log aspect ratio is closest to log_ratio,
    // considering slice counts of multiple-1, multiple and multiple+1
    static clip_image_size get_best_grid(const int max_slice_nums, const int multiple, const float log_ratio) {
        std::vector<int> candidate_split_grids_nums;
        for (int i : {multiple - 1, multiple, multiple + 1}) {
            if (i == 1 || i > max_slice_nums) {
                continue;
            }
            candidate_split_grids_nums.push_back(i);
        }

        std::vector<clip_image_size> candidate_grids;
        for (int split_grids_nums : candidate_split_grids_nums) {
            int m = 1;
            while (m <= split_grids_nums) {
                if (split_grids_nums % m == 0) {
                    candidate_grids.push_back(clip_image_size{m, split_grids_nums / m});
                }
                ++m;
            }
        }

        clip_image_size best_grid{1, 1};
        float min_error = std::numeric_limits<float>::infinity();
        for (const auto& grid : candidate_grids) {
            float error = std::abs(log_ratio - std::log(1.0 * grid.width / grid.height));
            if (error < min_error) {
                best_grid = grid;
                min_error = error;
            }
        }
        return best_grid;
    }
};

// ref:
// https://github.com/huggingface/transformers/blob/v5.1.0/src/transformers/models/lfm2_vl/image_processing_lfm2_vl_fast.py
// some of the logic is similar to llava_uhd, but with different hyperparameters and some logic is unique (e.g. grid layout)
struct lfm2_vl_image_processor {
    // ref: https://huggingface.co/LiquidAI/LFM2.5-VL-1.6B/blob/main/processor_config.json
    static constexpr int min_tiles = 2;
    static constexpr int max_tiles = 10;
    static constexpr float max_pixels_tolerance = 2.0f;
    static constexpr int tile_size = 512;

    // build llava_uhd-style slicing instructions for the LFM2-VL preprocessor
    static llava_uhd::slice_instructions get_slice_instructions(struct clip_ctx * ctx, const clip_image_size & original_size) {
        llava_uhd::slice_instructions inst;
        const auto & params = ctx->model.hparams;
        const int align_size = params.patch_size * params.n_merge;

        inst.interpolation_overview = img_tool::RESIZE_ALGO_BILINEAR;
        inst.interpolation_refined = img_tool::RESIZE_ALGO_BILINEAR;
        inst.overview_size = img_tool::calc_size_preserved_ratio(original_size, align_size, params.image_min_pixels, params.image_max_pixels);

        // tile if either dimension exceeds tile_size with tolerance
        const bool needs_tiling = original_size.width > tile_size * max_pixels_tolerance || original_size.height > tile_size * max_pixels_tolerance;

        if (!needs_tiling) {
            inst.refined_size = clip_image_size{0, 0};
            inst.grid_size = clip_image_size{0, 0};
            return inst;
        }

        const clip_image_size grid = get_grid_layout(original_size.height, original_size.width);

        inst.grid_size = grid;
        inst.refined_size = clip_image_size{tile_size * grid.width, tile_size * grid.height};

        LOG_DBG("%s: original size: %d x %d, overview size: %d x %d, refined size: %d x %d, grid size: %d x %d\n",
                __func__,
                original_size.width, original_size.height,
                inst.overview_size.width, inst.overview_size.height,
                inst.refined_size.width, inst.refined_size.height,
                grid.width, grid.height);

        // every tile is exactly tile_size x tile_size, laid out row-major
        for (int row = 0; row < grid.height; row++) {
            for (int col = 0; col < grid.width; col++) {
                llava_uhd::slice_coordinates slice;
                slice.x = col * tile_size;
                slice.y = row * tile_size;
                slice.size = clip_image_size{tile_size, tile_size};
                inst.slices.push_back(slice);
                LOG_DBG("%s: slice %d: x=%d, y=%d, size=%d x %d\n",
                        __func__, (int)inst.slices.size() - 1,
                        slice.x, slice.y, slice.size.width, slice.size.height);
            }
        }

        return inst;
    }

private:
    // pick the candidate grid whose aspect ratio is closest to aspect_ratio;
    // on ties, prefer the larger grid when the image covers more than half its area
    static clip_image_size find_closest_aspect_ratio(
            float aspect_ratio,
            const std::vector<clip_image_size> & target_ratios,
            int width, int height) {
        float best_ratio_diff = std::numeric_limits<float>::max();
        clip_image_size best_ratio = {1, 1};
        const float area = static_cast<float>(width * height);

        for (const auto & ratio : target_ratios) {
            const float target_aspect_ratio = static_cast<float>(ratio.width) / ratio.height;
            const float ratio_diff = std::abs(aspect_ratio - target_aspect_ratio);
            if (ratio_diff < best_ratio_diff) {
                best_ratio_diff = ratio_diff;
                best_ratio = ratio;
            } else if (ratio_diff == best_ratio_diff) {
                const float target_area = static_cast<float>(tile_size * tile_size * ratio.width * ratio.height);
                if (area > 0.5f * target_area) {
                    best_ratio = ratio;
                }
            }
        }
        return best_ratio;
    }

    // enumerate all unique W x H grids with min_tiles <= W*H <= max_tiles,
    // sorted by total tile count
    static std::vector<clip_image_size> get_target_ratios() {
        std::vector<clip_image_size> ratios;
        for (int n = min_tiles; n <= max_tiles; n++) {
            for (int w = 1; w <= n; w++) {
                for (int h = 1; h <= n; h++) {
                    if (w * h >= min_tiles && w * h <= max_tiles) {
                        bool found = false;
                        for (const auto & r : ratios) {
                            if (r.width == w && r.height == h) {
                                found = true;
                                break;
                            }
                        }
                        if (!found) {
                            ratios.push_back({w, h});
                        }
                    }
                }
            }
        }
        std::sort(ratios.begin(), ratios.end(), [](const clip_image_size & a, const clip_image_size & b) {
            return a.width * a.height < b.width * b.height;
        });
        return ratios;
    }

    static clip_image_size get_grid_layout(int height, int width) {
        const float aspect_ratio = static_cast<float>(width) / height;
        const auto ratios = get_target_ratios();
        return find_closest_aspect_ratio(aspect_ratio, ratios, width, height);
    }
};

// returns the normalized float tensor for llava-1.5, for spatial_unpad with anyres processing for llava-1.6 it returns the normalized image patch tensors as a vector
// res_imgs memory is being allocated here, previous allocations will be freed if found
bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, struct clip_image_f32_batch * res_imgs) {
    clip_image_size original_size{img->nx, img->ny};
    auto & params = ctx->model.hparams;

    // each projector family has its own preprocessing recipe
    switch (ctx->proj_type()) {
        case PROJECTOR_TYPE_MINICPMV:
            {
                auto const inst = llava_uhd::get_slice_instructions(ctx, original_size);
                std::vector<clip_image_u8_ptr> imgs = llava_uhd::slice_image(img, inst);

                for (size_t i = 0; i < imgs.size(); ++i) {
                    // clip_image_save_to_bmp(*imgs[i], "slice_" + std::to_string(i) + ".bmp");
                    clip_image_f32_ptr res(clip_image_f32_init());
                    normalize_image_u8_to_f32(*imgs[i], *res, params.image_mean, params.image_std);
                    res_imgs->entries.push_back(std::move(res));
                }

                res_imgs->grid_x = inst.grid_size.width;
                res_imgs->grid_y = inst.grid_size.height;
            } break;

        case PROJECTOR_TYPE_QWEN2VL:
        case PROJECTOR_TYPE_QWEN25VL:
        case PROJECTOR_TYPE_QWEN3VL:
        case PROJECTOR_TYPE_GLM4V:
            {
                GGML_ASSERT(params.image_min_pixels > 0 && params.image_max_pixels > 0);
                clip_image_u8 resized;
                const clip_image_size new_size = img_tool::calc_size_preserved_ratio(
                    original_size,
                    params.patch_size * 2,
                    params.image_min_pixels,
                    params.image_max_pixels);
                img_tool::resize(*img, resized, new_size, img_tool::RESIZE_ALGO_BILINEAR, false);
                // clip_image_save_to_bmp(resized, "preproc.bmp");
                clip_image_f32_ptr img_f32(clip_image_f32_init());
                // clip_image_f32_ptr res(clip_image_f32_init());
                normalize_image_u8_to_f32(resized, *img_f32, params.image_mean, params.image_std);
                //
// res_imgs->data[0] = *res;
                res_imgs->entries.push_back(std::move(img_f32));
            } break;
        case PROJECTOR_TYPE_YOUTUVL:
            {
                const int patch_size = params.patch_size;       // typically 16
                const int merge_size = params.n_merge;          // typically 2
                const int align_size = patch_size * merge_size; // 32

                const int max_num_patches = params.image_max_pixels > 0 ?
                    params.image_max_pixels / (patch_size * patch_size) : 256;

                // Linear search for optimal scale to fit within max_num_patches
                float scale = 1.0f;
                int target_height = original_size.height;
                int target_width = original_size.width;

                auto get_scaled_image_size = [align_size](float scale, int size) -> int {
                    float scaled_size = size * scale;
                    // Round up to nearest multiple of align_size
                    int aligned = static_cast<int>(std::ceil(scaled_size / align_size)) * align_size;
                    // Ensure at least one patch
                    return std::max(align_size, aligned);
                };

                // Linear search with 0.02 step size
                while (scale > 0.0f) {
                    target_height = get_scaled_image_size(scale, original_size.height);
                    target_width = get_scaled_image_size(scale, original_size.width);

                    int num_patches_h = target_height / patch_size;
                    int num_patches_w = target_width / patch_size;
                    int num_patches = num_patches_h * num_patches_w;

                    if (num_patches > max_num_patches) {
                        scale -= 0.02f;
                    } else {
                        break;
                    }
                }

                clip_image_size new_size = {target_width, target_height};

                // Resize the image
                clip_image_u8 resized;
                img_tool::resize(*img, resized, new_size, img_tool::RESIZE_ALGO_BILINEAR, false);

                // Normalize to float32
                clip_image_f32_ptr img_f32(clip_image_f32_init());
                normalize_image_u8_to_f32(resized, *img_f32, params.image_mean, params.image_std);

                // Add to results
                res_imgs->entries.push_back(std::move(img_f32));
            } break;

        case PROJECTOR_TYPE_IDEFICS3:
            {
                // The refined size has two steps:
                // 1. Resize w/ aspect-ratio preserving such that the longer side is
                //    the preprocessor longest size
                // 2. Resize w/out preserving aspect ratio such that both sides are
                //    multiples of image_size (always rounding up)
                //
                // CITE: https://github.com/huggingface/transformers/blob/main/src/transformers/models/idefics3/image_processing_idefics3.py#L737
                const clip_image_size refined_size = img_tool::calc_size_preserved_ratio(
                    original_size, params.image_size, params.image_longest_edge);
                // LOG_INF("%s: original size: %d x %d, refined size: %d x %d\n",
                //         __func__, original_size.width, original_size.height,
                //         refined_size.width, refined_size.height);

                llava_uhd::slice_instructions instructions;
                instructions.overview_size = clip_image_size{params.image_size, params.image_size};
                instructions.refined_size = refined_size;
                instructions.grid_size = clip_image_size{
                    static_cast<int>(std::ceil(static_cast<float>(refined_size.width) / params.image_size)),
                    static_cast<int>(std::ceil(static_cast<float>(refined_size.height) / params.image_size)),
                };
                for (int y = 0; y < refined_size.height; y += params.image_size) {
                    for (int x = 0; x < refined_size.width; x += params.image_size) {
                        // LOG_INF("%s: adding slice at x=%d, y=%d\n", __func__, x, y);
                        instructions.slices.push_back(llava_uhd::slice_coordinates{
                            /* x */x,
                            /* y */y,
                            /* size */clip_image_size{
                                std::min(params.image_size, refined_size.width - x),
                                std::min(params.image_size, refined_size.height - y)
                            }
                        });
                    }
                }
                auto imgs = llava_uhd::slice_image(img, instructions);

                // cast and normalize to f32
                for (size_t i = 0; i < imgs.size(); ++i) {
                    // clip_image_save_to_bmp(*imgs[i], "slice_" + std::to_string(i) + ".bmp");
                    clip_image_f32_ptr res(clip_image_f32_init());
                    normalize_image_u8_to_f32(*imgs[i], *res, params.image_mean, params.image_std);
                    res_imgs->entries.push_back(std::move(res));
                }

                res_imgs->grid_x = instructions.grid_size.width;
                res_imgs->grid_y = instructions.grid_size.height;
            } break;

        case PROJECTOR_TYPE_GLM_EDGE:
        case PROJECTOR_TYPE_GEMMA3:
        case PROJECTOR_TYPE_INTERNVL: // TODO @ngxson : support dynamic resolution
            {
                // fixed square input, aspect-ratio preserving resize with padding (default)
                clip_image_u8 resized_image;
                int sz = params.image_size;
                img_tool::resize(*img, resized_image, {sz, sz}, img_tool::RESIZE_ALGO_BILINEAR);
                clip_image_f32_ptr img_f32(clip_image_f32_init());
                //clip_image_save_to_bmp(resized_image, "resized.bmp");
                normalize_image_u8_to_f32(resized_image, *img_f32, params.image_mean, params.image_std);
                res_imgs->entries.push_back(std::move(img_f32));
            } break;

        case PROJECTOR_TYPE_GEMMA3NV:
            {
                // fixed square input, stretched resize (no padding)
                clip_image_u8 resized_image;
                int sz = params.image_size;
                img_tool::resize(*img, resized_image, {sz, sz}, img_tool::RESIZE_ALGO_BILINEAR, false);
                clip_image_f32_ptr img_f32(clip_image_f32_init());
                normalize_image_u8_to_f32(resized_image, *img_f32, params.image_mean, params.image_std);
                res_imgs->entries.push_back(std::move(img_f32));
            } break;

        case PROJECTOR_TYPE_JANUS_PRO:
            {
                // Janus Pro preprocessing: pad to square with gray(127), resize to 384x384
                const std::array<uint8_t, 3> pad_color = {127, 127, 127};
                clip_image_u8 resized_image;
                int sz = params.image_size;
                img_tool::resize(*img, resized_image, {sz, sz}, img_tool::RESIZE_ALGO_BILINEAR, true, pad_color);
                clip_image_f32_ptr img_f32(clip_image_f32_init());
                normalize_image_u8_to_f32(resized_image, *img_f32, params.image_mean, params.image_std);
                res_imgs->entries.push_back(std::move(img_f32));
            } break;

        case PROJECTOR_TYPE_PIXTRAL:
        case PROJECTOR_TYPE_LIGHTONOCR:
            {
                GGML_ASSERT(params.image_min_pixels > 0 && params.image_max_pixels > 0);
                clip_image_u8 resized_image;
                // the original pixtral model doesn't have n_merge
                const int cur_merge = params.n_merge == 0 ? 1 : params.n_merge;
                const clip_image_size target_size = img_tool::calc_size_preserved_ratio(
                    original_size,
                    params.patch_size * cur_merge,
                    params.image_min_pixels,
                    params.image_max_pixels);
                img_tool::resize(*img, resized_image, target_size, img_tool::RESIZE_ALGO_BILINEAR);
                clip_image_f32_ptr img_f32(clip_image_f32_init());
                normalize_image_u8_to_f32(resized_image, *img_f32, params.image_mean, params.image_std);
                res_imgs->entries.push_back(std::move(img_f32));
            } break;

        case PROJECTOR_TYPE_LLAMA4:
            {
                GGML_ASSERT(!params.image_res_candidates.empty());
                auto const inst = llava_uhd::get_slice_instructions(ctx, original_size);
                std::vector<clip_image_u8_ptr> imgs = llava_uhd::slice_image(img, inst);

                for (size_t i = 0; i < imgs.size(); ++i) {
                    clip_image_f32_ptr res(clip_image_f32_init());
                    normalize_image_u8_to_f32(*imgs[i], *res, params.image_mean, params.image_std);
                    res_imgs->entries.push_back(std::move(res));
                }

                res_imgs->grid_x = inst.grid_size.width;
                res_imgs->grid_y = inst.grid_size.height;
            } break;

        case PROJECTOR_TYPE_LFM2:
            {
                auto const inst = lfm2_vl_image_processor::get_slice_instructions(ctx, original_size);
                std::vector<clip_image_u8_ptr> imgs = llava_uhd::slice_image(img, inst);

                for (size_t i = 0; i < imgs.size(); ++i) {
                    clip_image_f32_ptr res(clip_image_f32_init());
                    normalize_image_u8_to_f32(*imgs[i], *res, params.image_mean, params.image_std);
                    res_imgs->entries.push_back(std::move(res));
                }

                res_imgs->grid_x = inst.grid_size.width;
                res_imgs->grid_y = inst.grid_size.height;
            } break;

        case PROJECTOR_TYPE_KIMIVL:
            {
                GGML_ASSERT(params.image_min_pixels > 0 && params.image_max_pixels > 0);
                const clip_image_size target_size = img_tool::calc_size_preserved_ratio(
                    original_size,
                    params.patch_size * params.n_merge,
                    params.image_min_pixels,
                    params.image_max_pixels);
                const std::array<uint8_t, 3> pad_color = {122, 116, 104};

                clip_image_u8 resized_img;
                img_tool::resize(*img, resized_img, target_size, img_tool::RESIZE_ALGO_BILINEAR, true, pad_color);
                clip_image_f32_ptr res(clip_image_f32_init());
                normalize_image_u8_to_f32(resized_img, *res, params.image_mean, params.image_std);
                res_imgs->entries.push_back(std::move(res));
            } break;

        case PROJECTOR_TYPE_KIMIK25:
            {
                GGML_ASSERT(params.image_min_pixels > 0 && params.image_max_pixels > 0);
                const clip_image_size target_size = img_tool::calc_size_preserved_ratio(
                    original_size,
                    params.patch_size * params.n_merge,
                    params.image_min_pixels,
                    params.image_max_pixels);
                const std::array<uint8_t, 3> pad_color = {0, 0, 0};

                clip_image_u8 resized_img;
                img_tool::resize(*img, resized_img, target_size, img_tool::RESIZE_ALGO_BICUBIC, true, pad_color);
                clip_image_f32_ptr res(clip_image_f32_init());
                normalize_image_u8_to_f32(resized_img, *res, params.image_mean, params.image_std);
                res_imgs->entries.push_back(std::move(res));
            } break;

        case PROJECTOR_TYPE_MLP:
        case PROJECTOR_TYPE_MLP_NORM:
        case PROJECTOR_TYPE_LDP:
        case PROJECTOR_TYPE_LDPV2:
        case PROJECTOR_TYPE_COGVLM: // TODO @ngxson : is this correct for cogvlm?
            {
                // TODO @ngxson : refactor the code below to avoid duplicated logic

                // the logic below is to pad the shorter side to the longer side with a background color: rgb(122, 116, 104)
                // see https://github.com/haotian-liu/LLaVA/blob/e854a2bf85118c504f6f16bf5c3c7c92f8fa8c6b/llava/conversation.py#L113-L156

                clip_image_u8_ptr temp(clip_image_u8_init()); // we will keep the input image data here temporarily

                // The model config actually contains all we need to decide on how to preprocess, here we automatically switch to the new llava-1.6 preprocessing
                if (params.image_res_candidates.empty()) { // pad_to_square
                    // for llava-1.5, we resize image to a square, and pad the shorter side with a background color
                    // see https://github.com/haotian-liu/LLaVA/blob/e854a2bf85118c504f6f16bf5c3c7c92f8fa8c6b/llava/conversation.py#L113-L156
                    const int longer_side = std::max(img->nx, img->ny);
                    temp->nx = longer_side;
                    temp->ny = longer_side;
                    temp->buf.resize(3 * longer_side * longer_side);

                    // background color in RGB from LLaVA (this is the mean rgb color * 255)
                    const std::array<uint8_t, 3> pad_color = {122, 116, 104};

                    // resize the image to the target_size
                    img_tool::resize(*img, *temp, clip_image_size{params.image_size, params.image_size}, img_tool::RESIZE_ALGO_BILINEAR, true, pad_color);

                    clip_image_f32_ptr res(clip_image_f32_init());
                    normalize_image_u8_to_f32(*temp, *res, params.image_mean, params.image_std);
                    res_imgs->entries.push_back(std::move(res));

                } else {
                    // "spatial_unpad" with "anyres" processing for llava-1.6
                    auto const inst = llava_uhd::get_slice_instructions(ctx, original_size);
                    std::vector<clip_image_u8_ptr> imgs = llava_uhd::slice_image(img, inst);

                    for (size_t i = 0; i < imgs.size(); ++i) {
                        // clip_image_save_to_bmp(*imgs[i], "slice_" + std::to_string(i) + ".bmp");
                        clip_image_f32_ptr res(clip_image_f32_init());
                        normalize_image_u8_to_f32(*imgs[i], *res, params.image_mean, params.image_std);
                        res_imgs->entries.push_back(std::move(res));
                    }
                }
            } break;

        default:
            LOG_ERR("%s: unsupported projector type %d\n", __func__, ctx->proj_type());
            return false;
    }

    return true;
}

ggml_tensor * clip_get_newline_tensor(const struct clip_ctx * ctx) {
    return ctx->model.image_newline;
}

void clip_free(clip_ctx * ctx) {
    if (ctx == nullptr) {
        return;
    }
    delete ctx;
}

// deprecated
size_t clip_embd_nbytes(const struct clip_ctx * ctx) {
    const int32_t nx = ctx->model.hparams.image_size;
    const int32_t ny = ctx->model.hparams.image_size;
    return clip_embd_nbytes_by_img(ctx, nx, ny);
}

// size in bytes of the embedding buffer for an img_w x img_h input
size_t clip_embd_nbytes_by_img(const struct clip_ctx * ctx, int img_w, int img_h) {
    clip_image_f32 img;
    img.nx = img_w;
    img.ny = img_h;
    return clip_n_output_tokens(ctx, &img) * clip_n_mmproj_embd(ctx) * sizeof(float);
}

int32_t clip_get_image_size(const struct clip_ctx * ctx) {
    return ctx->model.hparams.image_size;
}

int32_t clip_get_patch_size(const struct clip_ctx * ctx) {
    return ctx->model.hparams.patch_size;
}

int32_t clip_get_hidden_size(const struct clip_ctx * ctx) {
    return ctx->model.hparams.n_embd;
}

const char * clip_patch_merge_type(const struct clip_ctx * ctx) {
    return ctx->model.hparams.mm_patch_merge_type == PATCH_MERGE_SPATIAL_UNPAD ?
"spatial_unpad" : "flat"; +} + +int clip_n_output_tokens_x(const struct clip_ctx * ctx, struct clip_image_f32 * img) { + const auto & params = ctx->model.hparams; + const int n_total = clip_n_output_tokens(ctx, img); + const auto & proj = ctx->proj_type(); + switch (proj) { + case PROJECTOR_TYPE_QWEN2VL: + case PROJECTOR_TYPE_QWEN25VL: + case PROJECTOR_TYPE_QWEN3VL: + case PROJECTOR_TYPE_GLM4V: + case PROJECTOR_TYPE_YOUTUVL: + return (img->nx / params.patch_size) / 2; + default: + break; + } + return n_total; +} + +int clip_n_output_tokens_y(const struct clip_ctx * ctx, struct clip_image_f32 * img) { + const auto & params = ctx->model.hparams; + const auto & proj = ctx->proj_type(); + switch (proj) { + case PROJECTOR_TYPE_QWEN2VL: + case PROJECTOR_TYPE_QWEN25VL: + case PROJECTOR_TYPE_QWEN3VL: + case PROJECTOR_TYPE_GLM4V: + case PROJECTOR_TYPE_YOUTUVL: + return (img->ny / params.patch_size) / 2; + default: + break; + } + return 1; +} + +int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * img) { + const auto & params = ctx->model.hparams; + + // for models with fixed size image, the input image is already pre-processed and resized to square + int patch_size = params.patch_size; + int n_patches = (img->nx / patch_size) * (img->ny / patch_size); + + projector_type proj = ctx->proj_type(); + + switch (proj) { + case PROJECTOR_TYPE_MLP: + case PROJECTOR_TYPE_MLP_NORM: + case PROJECTOR_TYPE_JANUS_PRO: + { + // do nothing + } break; + case PROJECTOR_TYPE_LDP: + case PROJECTOR_TYPE_LDPV2: + case PROJECTOR_TYPE_GLM_EDGE: + { + n_patches /= 4; + if (ctx->model.mm_boi) { + n_patches += 2; // for BOI and EOI token embeddings + } + } break; + case PROJECTOR_TYPE_MINICPMV: + { + // Use actual config value if available, otherwise fall back to hardcoded values + if (params.minicpmv_query_num > 0) { + n_patches = params.minicpmv_query_num; + } else { + // Fallback to hardcoded values for legacy models + if (params.minicpmv_version == 2) { + n_patches = 96; 
+ } else if (params.minicpmv_version == 3) { + n_patches = 64; + } else if (params.minicpmv_version == 4) { + n_patches = 64; + } else if (params.minicpmv_version == 5) { + // MiniCPM-V 4.0 + n_patches = 64; + } else if (params.minicpmv_version == 6) { + // MiniCPM-V 4.5 + n_patches = 64; + } else if (params.minicpmv_version == 100045) { + // MiniCPM-o 4.5 + n_patches = 64; + } else { + GGML_ABORT("Unknown minicpmv version"); + } + } + } break; + case PROJECTOR_TYPE_QWEN2VL: + case PROJECTOR_TYPE_QWEN25VL: + case PROJECTOR_TYPE_QWEN3VL: + case PROJECTOR_TYPE_GLM4V: + case PROJECTOR_TYPE_YOUTUVL: + { + // dynamic size (2 conv, so double patch size) + int x_patch = img->nx / (params.patch_size * 2); + int y_patch = img->ny / (params.patch_size * 2); + n_patches = x_patch * y_patch; + } break; + case PROJECTOR_TYPE_GEMMA3: + case PROJECTOR_TYPE_IDEFICS3: + case PROJECTOR_TYPE_INTERNVL: + case PROJECTOR_TYPE_LLAMA4: + { + // both X and Y are downscaled by the scale factor + int scale_factor = ctx->model.hparams.n_merge; + n_patches /= (scale_factor * scale_factor); + } break; + case PROJECTOR_TYPE_GEMMA3NV: + { + // MobileNetV5 MSFA adapter always outputs fixed 16x16 resolution + // regardless of input size (see architecture description) + n_patches = ctx->model.hparams.image_size / ctx->model.hparams.patch_size; + } break; + case PROJECTOR_TYPE_LFM2: + case PROJECTOR_TYPE_KIMIVL: + case PROJECTOR_TYPE_KIMIK25: + { + // dynamic size + int out_patch_size = params.patch_size * ctx->model.hparams.n_merge; + int x_patch = CLIP_ALIGN(img->nx, out_patch_size) / out_patch_size; + int y_patch = CLIP_ALIGN(img->ny, out_patch_size) / out_patch_size; + n_patches = x_patch * y_patch; + } break; + case PROJECTOR_TYPE_PIXTRAL: + case PROJECTOR_TYPE_LIGHTONOCR: + { + // dynamic size + int n_merge = ctx->model.hparams.n_merge; + int n_patches_x = img->nx / patch_size / (n_merge > 0 ? n_merge : 1); + int n_patches_y = img->ny / patch_size / (n_merge > 0 ? 
n_merge : 1); + if (ctx->model.token_embd_img_break) { + n_patches = n_patches_y * n_patches_x + n_patches_y - 1; // + one [IMG_BREAK] per row, except the last row + } else { + n_patches = n_patches_y * n_patches_x; + } + } break; + case PROJECTOR_TYPE_VOXTRAL: + case PROJECTOR_TYPE_ULTRAVOX: + case PROJECTOR_TYPE_QWEN2A: + case PROJECTOR_TYPE_MUSIC_FLAMINGO: + { + n_patches = img->nx; + + const int proj_stack_factor = ctx->model.hparams.proj_stack_factor; + if (ctx->model.audio_has_stack_frames()) { + GGML_ASSERT(proj_stack_factor > 0); + const int n_len = CLIP_ALIGN(n_patches, proj_stack_factor); + n_patches = n_len / proj_stack_factor; + } + + // whisper downscales input token by half after conv1d + n_patches /= 2; + + if (ctx->model.audio_has_avgpool()) { + // divide by 2 because of nn.AvgPool1d(2, stride=2) + n_patches /= 2; + } + } break; + case PROJECTOR_TYPE_GLMA: + { + n_patches = img->nx; + // whisper downscales input token by half after conv1d + n_patches /= 2; + // reshape by merge_factor + n_patches /= ctx->model.hparams.proj_stack_factor; + // for BOI and EOI token embeddings + n_patches += 2; + } break; + case PROJECTOR_TYPE_COGVLM: + { + n_patches += 2; // for BOI and EOI token embeddings + } break; + case PROJECTOR_TYPE_LFM2A: + { + n_patches = ((((img->nx + 1) / 2) + 1) / 2 + 1) / 2; + } break; + default: + GGML_ABORT("unsupported projector type"); + } + + return n_patches; +} + +bool clip_image_encode(struct clip_ctx * ctx, const int n_threads, clip_image_f32 * img, float * vec) { + clip_image_f32_batch imgs; + clip_image_f32_ptr img_copy(clip_image_f32_init()); + *img_copy = *img; + imgs.entries.push_back(std::move(img_copy)); + + return clip_image_batch_encode(ctx, n_threads, &imgs, vec); +} + +bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_image_f32_batch * imgs_c_ptr, float * vec) { + const clip_image_f32_batch & imgs = *imgs_c_ptr; + int batch_size = imgs.entries.size(); + + // TODO @ngxson : implement batch 
size > 1 as a loop + // we don't need true batching support because the cgraph will gonna be big anyway + if (batch_size != 1) { + return false; // only support batch size of 1 + } + + // if buffers are not allocated, we need to do a warmup run to allocate them + if (!ctx->is_allocated) { + clip_model_loader::warmup(*ctx, *imgs_c_ptr); + } + + // build the inference graph + ggml_backend_sched_reset(ctx->sched.get()); + ggml_cgraph * gf = clip_image_build_graph(ctx, imgs); + ggml_backend_sched_alloc_graph(ctx->sched.get(), gf); + + // set inputs + const auto & model = ctx->model; + const auto & hparams = model.hparams; + + const int image_size_width = imgs.entries[0]->nx; + const int image_size_height = imgs.entries[0]->ny; + + const int patch_size = hparams.patch_size; + const int num_patches = ((image_size_width / patch_size) * (image_size_height / patch_size)); + const int n_pos = num_patches + (model.class_embedding ? 1 : 0); + const int pos_w = image_size_width / patch_size; + const int pos_h = image_size_height / patch_size; + + + auto get_inp_tensor = [&gf](const char * name) { + ggml_tensor * inp = ggml_graph_get_tensor(gf, name); + if (inp == nullptr) { + GGML_ABORT("Failed to get tensor %s", name); + } + if (!(inp->flags & GGML_TENSOR_FLAG_INPUT)) { + GGML_ABORT("Tensor %s is not an input tensor", name); + } + return inp; + }; + + auto set_input_f32 = [&get_inp_tensor](const char * name, std::vector<float> & values) { + ggml_tensor * cur = get_inp_tensor(name); + GGML_ASSERT(cur->type == GGML_TYPE_F32); + GGML_ASSERT(ggml_nelements(cur) == (int64_t)values.size()); + ggml_backend_tensor_set(cur, values.data(), 0, ggml_nbytes(cur)); + }; + + auto set_input_i32 = [&get_inp_tensor](const char * name, std::vector<int32_t> & values) { + ggml_tensor * cur = get_inp_tensor(name); + GGML_ASSERT(cur->type == GGML_TYPE_I32); + GGML_ASSERT(ggml_nelements(cur) == (int64_t)values.size()); + ggml_backend_tensor_set(cur, values.data(), 0, ggml_nbytes(cur)); + }; + + // 
set input pixel values + if (!imgs.is_audio) { + size_t nelem = 0; + for (const auto & img : imgs.entries) { + nelem += img->nx * img->ny * 3; + } + std::vector<float> inp_raw(nelem); + + // layout of data (note: the channel dim is unrolled to better visualize the layout): + // + // ┌──W──┐ + // │ H │ channel = R + // ├─────┤ │ + // │ H │ channel = G + // ├─────┤ │ + // │ H │ channel = B + // └─────┘ │ + // ──────┘ x B + + for (size_t i = 0; i < imgs.entries.size(); i++) { + const int nx = imgs.entries[i]->nx; + const int ny = imgs.entries[i]->ny; + const int n = nx * ny; + + for (int b = 0; b < batch_size; b++) { + float * batch_entry = inp_raw.data() + b * (3*n); + for (int y = 0; y < ny; y++) { + for (int x = 0; x < nx; x++) { + size_t base_src = 3*(y * nx + x); // idx of the first channel + size_t base_dst = y * nx + x; // idx of the first channel + batch_entry[ base_dst] = imgs.entries[b]->buf[base_src ]; + batch_entry[1*n + base_dst] = imgs.entries[b]->buf[base_src + 1]; + batch_entry[2*n + base_dst] = imgs.entries[b]->buf[base_src + 2]; + } + } + } + } + set_input_f32("inp_raw", inp_raw); + + } else { + // audio input + GGML_ASSERT(imgs.entries.size() == 1); + const auto & mel_inp = imgs.entries[0]; + const int n_step = mel_inp->nx; + const int n_mel = mel_inp->ny; + std::vector<float> inp_raw(n_step * n_mel); + std::memcpy(inp_raw.data(), mel_inp->buf.data(), n_step * n_mel * sizeof(float)); + set_input_f32("inp_raw", inp_raw); + } + + // set input per projector + switch (ctx->model.proj_type) { + case PROJECTOR_TYPE_MINICPMV: + { + // inspired from siglip: + // -> https://huggingface.co/HuggingFaceM4/siglip-so400m-14-980-flash-attn2-navit + // -> https://huggingface.co/HuggingFaceM4/siglip-so400m-14-980-flash-attn2-navit/blob/d66538faeba44480d0bfaa42145eef26f9423199/modeling_siglip.py#L316 + std::vector<int32_t> positions(pos_h * pos_w); + int bucket_coords_h[1024]; + int bucket_coords_w[1024]; + for (int i = 0; i < pos_h; i++){ + bucket_coords_h[i] = 
std::floor(70.0*i/pos_h); + } + for (int i = 0; i < pos_w; i++){ + bucket_coords_w[i] = std::floor(70.0*i/pos_w); + } + for (int i = 0, id = 0; i < pos_h; i++){ + for (int j = 0; j < pos_w; j++){ + positions[id++] = bucket_coords_h[i]*70 + bucket_coords_w[j]; + } + } + set_input_i32("positions", positions); + + // inputs for resampler projector + // set the 2D positions (using float for sinusoidal embedding) + int n_patches_per_col = image_size_width / patch_size; + std::vector<float> pos_data(n_pos); + // dimension H + for (int i = 0; i < n_pos; i++) { + pos_data[i] = static_cast<float>(i / n_patches_per_col); + } + set_input_f32("pos_h", pos_data); + // dimension W + for (int i = 0; i < n_pos; i++) { + pos_data[i] = static_cast<float>(i % n_patches_per_col); + } + set_input_f32("pos_w", pos_data); + // base frequency omega + const float base_freq = 10000.0f; + const int n_embd_proj = clip_n_mmproj_embd(ctx); + std::vector<float> omega(n_embd_proj / 4); + for (int i = 0; i < n_embd_proj / 4; ++i) { + omega[i] = 1.0f / std::pow(base_freq, static_cast<float>(i) / (n_embd_proj / 4)); + } + set_input_f32("omega", omega); + } break; + case PROJECTOR_TYPE_QWEN2VL: + case PROJECTOR_TYPE_QWEN3VL: + case PROJECTOR_TYPE_GLM4V: + { + const int merge_ratio = hparams.n_merge; + const int pw = image_size_width / patch_size; + const int ph = image_size_height / patch_size; + std::vector<int> positions(n_pos * 4); + int ptr = 0; + for (int y = 0; y < ph; y += merge_ratio) { + for (int x = 0; x < pw; x += merge_ratio) { + for (int dy = 0; dy < 2; dy++) { + for (int dx = 0; dx < 2; dx++) { + positions[ ptr] = y + dy; + positions[ num_patches + ptr] = x + dx; + positions[2 * num_patches + ptr] = y + dy; + positions[3 * num_patches + ptr] = x + dx; + ptr++; + } + } + } + } + + set_input_i32("positions", positions); + } break; + case PROJECTOR_TYPE_QWEN25VL: + case PROJECTOR_TYPE_YOUTUVL: + { + // pw * ph = number of tokens output by ViT after apply patch merger + // ipw * ipw = 
number of vision token been processed inside ViT + const bool use_window_attn = ctx->model.proj_type == PROJECTOR_TYPE_QWEN25VL ? hparams.n_wa_pattern > 0 : !hparams.wa_layer_indexes.empty(); + const int merge_ratio = 2; + const int pw = image_size_width / patch_size / merge_ratio; + const int ph = image_size_height / patch_size / merge_ratio; + const int ipw = image_size_width / patch_size; + const int iph = image_size_height / patch_size; + + std::vector<int> idx (ph * pw); + std::vector<int> inv_idx(ph * pw); + + if (use_window_attn) { + const int attn_window_size = hparams.attn_window_size > 0 ? hparams.attn_window_size : 112; + const int grid_window = attn_window_size / patch_size / merge_ratio; + int dst = 0; + // [num_vision_tokens, num_vision_tokens] attention mask tensor + std::vector<float> mask(pow(ipw * iph, 2), std::numeric_limits<float>::lowest()); + int mask_row = 0; + + for (int y = 0; y < ph; y += grid_window) { + for (int x = 0; x < pw; x += grid_window) { + const int win_h = std::min(grid_window, ph - y); + const int win_w = std::min(grid_window, pw - x); + const int dst_0 = dst; + // group all tokens belong to the same window togather (to a continue range) + for (int dy = 0; dy < win_h; dy++) { + for (int dx = 0; dx < win_w; dx++) { + const int src = (y + dy) * pw + (x + dx); + GGML_ASSERT(src < (int)idx.size()); + GGML_ASSERT(dst < (int)inv_idx.size()); + idx [src] = dst; + inv_idx[dst] = src; + dst++; + } + } + + for (int r=0; r < win_h * win_w * merge_ratio * merge_ratio; r++) { + int row_offset = mask_row * (ipw * iph); + std::fill( + mask.begin() + row_offset + (dst_0 * merge_ratio * merge_ratio), + mask.begin() + row_offset + (dst * merge_ratio * merge_ratio), + 0.0); + mask_row++; + } + } + } + + set_input_i32("window_idx", idx); + set_input_i32("inv_window_idx", inv_idx); + set_input_f32("window_mask", mask); + } else { + for (int i = 0; i < ph * pw; i++) { + idx[i] = i; + } + } + + const int mpow = merge_ratio * merge_ratio; + 
std::vector<int> positions(n_pos * 4); + + int ptr = 0; + for (int y = 0; y < iph; y += merge_ratio) { + for (int x = 0; x < ipw; x += merge_ratio) { + for (int dy = 0; dy < 2; dy++) { + for (int dx = 0; dx < 2; dx++) { + auto remap = idx[ptr / mpow]; + remap = (remap * mpow) + (ptr % mpow); + + positions[ remap] = y + dy; + positions[ num_patches + remap] = x + dx; + positions[2 * num_patches + remap] = y + dy; + positions[3 * num_patches + remap] = x + dx; + ptr++; + } + } + } + } + + set_input_i32("positions", positions); + } break; + case PROJECTOR_TYPE_PIXTRAL: + case PROJECTOR_TYPE_KIMIVL: + case PROJECTOR_TYPE_KIMIK25: + case PROJECTOR_TYPE_LIGHTONOCR: + { + // set the 2D positions + int n_patches_per_col = image_size_width / patch_size; + std::vector<int> pos_data(n_pos); + // dimension H + for (int i = 0; i < n_pos; i++) { + pos_data[i] = i / n_patches_per_col; + } + set_input_i32("pos_h", pos_data); + // dimension W + for (int i = 0; i < n_pos; i++) { + pos_data[i] = i % n_patches_per_col; + } + set_input_i32("pos_w", pos_data); + } break; + case PROJECTOR_TYPE_GLM_EDGE: + { + // llava and other models + std::vector<int32_t> positions(n_pos); + for (int i = 0; i < n_pos; i++) { + positions[i] = i; + } + set_input_i32("positions", positions); + } break; + case PROJECTOR_TYPE_MLP: + case PROJECTOR_TYPE_MLP_NORM: + case PROJECTOR_TYPE_LDP: + case PROJECTOR_TYPE_LDPV2: + { + // llava and other models + std::vector<int32_t> positions(n_pos); + for (int i = 0; i < n_pos; i++) { + positions[i] = i; + } + set_input_i32("positions", positions); + + // The patches vector is used to get rows to index into the embeds with; + // we should skip dim 0 only if we have CLS to avoid going out of bounds + // when retrieving the rows. + int patch_offset = model.class_embedding ? 
1 : 0; + std::vector<int32_t> patches(num_patches); + for (int i = 0; i < num_patches; i++) { + patches[i] = i + patch_offset; + } + set_input_i32("patches", patches); + } break; + case PROJECTOR_TYPE_GEMMA3: + case PROJECTOR_TYPE_GEMMA3NV: + case PROJECTOR_TYPE_IDEFICS3: + case PROJECTOR_TYPE_INTERNVL: + case PROJECTOR_TYPE_QWEN2A: + case PROJECTOR_TYPE_GLMA: + case PROJECTOR_TYPE_ULTRAVOX: + case PROJECTOR_TYPE_LFM2: + case PROJECTOR_TYPE_VOXTRAL: + case PROJECTOR_TYPE_MUSIC_FLAMINGO: + case PROJECTOR_TYPE_JANUS_PRO: + case PROJECTOR_TYPE_COGVLM: + { + // do nothing + } break; + case PROJECTOR_TYPE_LLAMA4: + { + // set the 2D positions + int n_patches_per_col = image_size_width / patch_size; + std::vector<int> pos_data(num_patches + 1, 0); // +1 for the [CLS] token + // last pos is always kept 0, it's for CLS + // dimension H + for (int i = 0; i < num_patches; i++) { + pos_data[i] = (i / n_patches_per_col) + 1; + } + set_input_i32("pos_h", pos_data); + // dimension W + for (int i = 0; i < num_patches; i++) { + pos_data[i] = (i % n_patches_per_col) + 1; + } + set_input_i32("pos_w", pos_data); + } break; + case PROJECTOR_TYPE_LFM2A: + { + GGML_ASSERT(imgs.entries.size() == 1); + const auto n_frames = clip_n_output_tokens(ctx, imgs.entries.front().get()); + + auto d_model = 512; + auto seq_len = n_frames * 2 - 1; + std::vector<float> pos_emb(d_model*seq_len); + std::vector<double> inv_freq(d_model / 2); + for (size_t i = 0; i < inv_freq.size(); ++i) { + inv_freq[i] = std::exp(-(std::log(10000.0) / (float)d_model) * (2.0f * (float)(i))); + } + for (int64_t pos = 0; pos < seq_len; ++pos) { + for (size_t i = 0; i < inv_freq.size(); ++i) { + const float ang = (n_frames - pos - 1) * inv_freq[i]; + pos_emb[pos*d_model + 2*i + 0] = sinf(ang); // even + pos_emb[pos*d_model + 2*i + 1] = cosf(ang); // odd + } + } + set_input_f32("pos_emb", pos_emb); + } break; + default: + GGML_ABORT("Unknown projector type"); + } + + // ggml_backend_cpu_set_n_threads(ctx->backend_cpu, 
n_threads); + ggml_backend_dev_t dev = ggml_backend_get_device(ctx->backend_cpu); + ggml_backend_reg_t reg = dev ? ggml_backend_dev_backend_reg(dev) : nullptr; + if (reg) { + auto ggml_backend_set_n_threads_fn = (ggml_backend_set_n_threads_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_set_n_threads"); + if (ggml_backend_set_n_threads_fn) { + ggml_backend_set_n_threads_fn(ctx->backend_cpu, n_threads); + } + } + + auto status = ggml_backend_sched_graph_compute(ctx->sched.get(), gf); + if (status != GGML_STATUS_SUCCESS) { + LOG_ERR("%s: ggml_backend_sched_graph_compute failed with error %d\n", __func__, status); + return false; + } + + // the last node is the embedding tensor + ggml_tensor * embeddings = ggml_graph_node(gf, -1); + + // sanity check (only support batch size of 1 for now) + const int n_tokens_out = embeddings->ne[1]; + const int expected_n_tokens_out = clip_n_output_tokens(ctx, imgs.entries[0].get()); + if (n_tokens_out != expected_n_tokens_out) { + LOG_ERR("%s: expected output %d tokens, got %d\n", __func__, expected_n_tokens_out, n_tokens_out); + GGML_ABORT("Invalid number of output tokens"); + } + + // copy the embeddings to the location passed by the user + if (vec != nullptr) { + ggml_backend_tensor_get(embeddings, vec, 0, ggml_nbytes(embeddings)); + } + + // Debug: dump final embeddings if MTMD_DEBUG_EMBEDDINGS is set + if (std::getenv("MTMD_DEBUG_EMBEDDINGS") != nullptr) { + const int64_t n_embd = embeddings->ne[0]; + const int64_t n_tokens = embeddings->ne[1]; + std::vector<float> emb_data(n_embd * n_tokens); + ggml_backend_tensor_get(embeddings, emb_data.data(), 0, ggml_nbytes(embeddings)); + + LOG_INF("\n=== MTMD_DEBUG_EMBEDDINGS ===\n"); + LOG_INF("Shape: [%lld, %lld]\n", (long long)n_embd, (long long)n_tokens); + + // Print first few values of first token + LOG_INF("Token 0 (first 16 values): "); + for (int i = 0; i < std::min((int64_t)16, n_embd); i++) { + LOG_INF("%.6f ", emb_data[i]); + } + LOG_INF("\n"); + + // Print last few 
values of first token + if (n_embd > 16) { + LOG_INF("Token 0 (last 16 values): "); + for (int64_t i = n_embd - 16; i < n_embd; i++) { + LOG_INF("%.6f ", emb_data[i]); + } + LOG_INF("\n"); + } + + // Compute and print statistics + float sum = 0.0f, sum_sq = 0.0f, min_val = emb_data[0], max_val = emb_data[0]; + for (size_t i = 0; i < emb_data.size(); i++) { + sum += emb_data[i]; + sum_sq += emb_data[i] * emb_data[i]; + min_val = std::min(min_val, emb_data[i]); + max_val = std::max(max_val, emb_data[i]); + } + float mean = sum / emb_data.size(); + float variance = (sum_sq / emb_data.size()) - (mean * mean); + LOG_INF("Stats: mean=%.6f, std=%.6f, min=%.6f, max=%.6f, sum=%.6f\n", + mean, sqrtf(variance), min_val, max_val, sum); + LOG_INF("=== END MTMD_DEBUG_EMBEDDINGS ===\n\n"); + } + + return true; +} + +int clip_n_mmproj_embd(const struct clip_ctx * ctx) { + switch (ctx->model.proj_type) { + case PROJECTOR_TYPE_LDP: + return ctx->model.mm_model_block_1_block_2_1_b->ne[0]; + case PROJECTOR_TYPE_LDPV2: + return ctx->model.mm_model_peg_0_b->ne[0]; + case PROJECTOR_TYPE_MLP: + case PROJECTOR_TYPE_PIXTRAL: + case PROJECTOR_TYPE_LIGHTONOCR: + return ctx->model.mm_2_w->ne[1]; + case PROJECTOR_TYPE_MLP_NORM: + return ctx->model.mm_3_b->ne[0]; + case PROJECTOR_TYPE_MINICPMV: + return ctx->model.mm_model_proj->ne[0]; + case PROJECTOR_TYPE_GLM_EDGE: + return ctx->model.mm_model_mlp_3_w->ne[1]; + case PROJECTOR_TYPE_QWEN2VL: + case PROJECTOR_TYPE_QWEN25VL: + case PROJECTOR_TYPE_JANUS_PRO: + case PROJECTOR_TYPE_YOUTUVL: + return ctx->model.mm_1_b->ne[0]; + case PROJECTOR_TYPE_QWEN3VL: + // main path + deepstack paths + return ctx->model.mm_1_b->ne[0] * (1 + ctx->model.n_deepstack_layers); + case PROJECTOR_TYPE_GEMMA3: + case PROJECTOR_TYPE_GEMMA3NV: + return ctx->model.mm_input_proj_w->ne[0]; + case PROJECTOR_TYPE_IDEFICS3: + return ctx->model.projection->ne[1]; + case PROJECTOR_TYPE_ULTRAVOX: + case PROJECTOR_TYPE_VOXTRAL: + case PROJECTOR_TYPE_MUSIC_FLAMINGO: + return 
ctx->model.mm_2_w->ne[1]; + case PROJECTOR_TYPE_INTERNVL: + return ctx->model.mm_3_w->ne[1]; + case PROJECTOR_TYPE_LLAMA4: + return ctx->model.mm_model_proj->ne[1]; + case PROJECTOR_TYPE_QWEN2A: + return ctx->model.mm_fc_w->ne[1]; + case PROJECTOR_TYPE_GLMA: + return ctx->model.mm_2_w->ne[1]; + case PROJECTOR_TYPE_LFM2: + case PROJECTOR_TYPE_KIMIVL: + case PROJECTOR_TYPE_KIMIK25: + return ctx->model.mm_2_w->ne[1]; + case PROJECTOR_TYPE_COGVLM: + return ctx->model.mm_4h_to_h_w->ne[1]; + case PROJECTOR_TYPE_LFM2A: + return ctx->model.position_embeddings->ne[0]; + case PROJECTOR_TYPE_GLM4V: + return ctx->model.mm_ffn_down_w->ne[1]; + default: + GGML_ABORT("Unknown projector type"); + } +} + +int clip_is_minicpmv(const struct clip_ctx * ctx) { + // TODO: remove this function + if (ctx->proj_type() == PROJECTOR_TYPE_MINICPMV) { + return ctx->model.hparams.minicpmv_version; + } + return 0; +} + +bool clip_is_glm(const struct clip_ctx * ctx) { + // TODO: remove this function + return ctx->proj_type() == PROJECTOR_TYPE_GLM_EDGE; +} + +bool clip_is_llava(const struct clip_ctx * ctx) { + return ctx->model.hparams.has_llava_projector; +} + +bool clip_has_vision_encoder(const struct clip_ctx * ctx) { + return ctx->model.modality == CLIP_MODALITY_VISION; +} + +bool clip_has_audio_encoder(const struct clip_ctx * ctx) { + return ctx->model.modality == CLIP_MODALITY_AUDIO; +} + +bool clip_has_whisper_encoder(const struct clip_ctx * ctx) { + switch (ctx->proj_type()) { + case PROJECTOR_TYPE_ULTRAVOX: + case PROJECTOR_TYPE_QWEN2A: + case PROJECTOR_TYPE_GLMA: + case PROJECTOR_TYPE_VOXTRAL: + case PROJECTOR_TYPE_MUSIC_FLAMINGO: + return true; + default: + return false; + } +} + +bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec) { + clip_image_f32 clip_img; + clip_img.buf.resize(h * w * 3); + for (int i = 0; i < h*w*3; i++) + { + clip_img.buf[i] = img[i]; + } + clip_img.nx = w; + clip_img.ny = h; + clip_image_encode(ctx, 
n_threads, &clip_img, vec); + return true; +} + +// +// API used internally with mtmd +// + +projector_type clip_get_projector_type(const struct clip_ctx * ctx) { + return ctx->proj_type(); +} + +void clip_image_f32_batch_add_mel(struct clip_image_f32_batch * batch, int n_mel, int n_frames, float * mel) { + clip_image_f32 * audio = new clip_image_f32; + audio->nx = n_frames; + audio->ny = n_mel; + audio->buf.resize(n_frames * n_mel); + std::memcpy(audio->buf.data(), mel, n_frames * n_mel * sizeof(float)); + + batch->entries.push_back(clip_image_f32_ptr(audio)); + batch->is_audio = true; +} + +const clip_hparams * clip_get_hparams(const struct clip_ctx * ctx) { + return &ctx->model.hparams; +} + +// +// API for debugging +// +void clip_debug_encode(clip_ctx * ctx, int h, int w, float fill_value) { + clip_image_f32 img; + img.nx = w; + img.ny = h; + img.buf.resize(h * w * 3); + for (int i = 0; i < h * w * 3; i++) { + img.buf[i] = static_cast<float>(fill_value); + } + clip_image_encode(ctx, 1, &img, nullptr); + GGML_ASSERT(img.buf.empty() && "expected, always stop here"); +} diff --git a/llama.cpp/tools/mtmd/clip.h b/llama.cpp/tools/mtmd/clip.h new file mode 100644 index 0000000..71b5848 --- /dev/null +++ b/llama.cpp/tools/mtmd/clip.h @@ -0,0 +1,121 @@ +#pragma once + +#include "ggml.h" +#include "mtmd.h" + +#include <stddef.h> +#include <stdint.h> + +// !!! Internal header, to be used by mtmd only !!! 
+ +#define MTMD_INTERNAL_HEADER + +struct clip_ctx; + +struct clip_image_size { + int width; + int height; +}; + +struct clip_image_f32; +struct clip_image_u8_batch; +struct clip_image_f32_batch; + +enum clip_modality { + CLIP_MODALITY_VISION, + CLIP_MODALITY_AUDIO, +}; + +enum clip_flash_attn_type { + CLIP_FLASH_ATTN_TYPE_AUTO = -1, + CLIP_FLASH_ATTN_TYPE_DISABLED = 0, + CLIP_FLASH_ATTN_TYPE_ENABLED = 1, +}; + +struct clip_context_params { + bool use_gpu; + enum clip_flash_attn_type flash_attn_type; + int image_min_tokens; + int image_max_tokens; + bool warmup; + ggml_backend_sched_eval_callback cb_eval; + void * cb_eval_user_data; +}; + +struct clip_init_result { + struct clip_ctx * ctx_v; // vision context + struct clip_ctx * ctx_a; // audio context +}; + +struct clip_init_result clip_init(const char * fname, struct clip_context_params ctx_params); + +void clip_free(struct clip_ctx * ctx); + +size_t clip_embd_nbytes(const struct clip_ctx * ctx); +size_t clip_embd_nbytes_by_img(const struct clip_ctx * ctx, int img_w, int img_h); + +int32_t clip_get_image_size (const struct clip_ctx * ctx); +int32_t clip_get_patch_size (const struct clip_ctx * ctx); +int32_t clip_get_hidden_size(const struct clip_ctx * ctx); + +// TODO: should be enum, not string +const char * clip_patch_merge_type(const struct clip_ctx * ctx); + +int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * img); + +// for M-RoPE, this will be the number of token positions in X and Y directions +// for other models, X will be the total number of tokens and Y will be 1 +int clip_n_output_tokens_x(const struct clip_ctx * ctx, struct clip_image_f32 * img); +int clip_n_output_tokens_y(const struct clip_ctx * ctx, struct clip_image_f32 * img); + +// this should be equal to the embedding dimension of the text model +int clip_n_mmproj_embd(const struct clip_ctx * ctx); + +struct clip_image_size * clip_image_size_init(void); +struct clip_image_u8 * clip_image_u8_init (void); +struct 
clip_image_f32 * clip_image_f32_init(void); +struct clip_image_f32_batch * clip_image_f32_batch_init(void); // only used by libllava + +// nx, ny are the output image dimensions +unsigned char * clip_image_u8_get_data(struct clip_image_u8 * img, uint32_t * nx, uint32_t * ny); + +void clip_image_size_free (struct clip_image_size * img_size); +void clip_image_u8_free (struct clip_image_u8 * img); +void clip_image_f32_free(struct clip_image_f32 * img); +void clip_image_u8_batch_free (struct clip_image_u8_batch * batch); +void clip_image_f32_batch_free(struct clip_image_f32_batch * batch); + +// use for accessing underlay data of clip_image_f32_batch +size_t clip_image_f32_batch_n_images(const struct clip_image_f32_batch * batch); // equivalent to batch->size() +size_t clip_image_f32_batch_nx(const struct clip_image_f32_batch * batch, int idx); // equivalent to batch[idx]->nx +size_t clip_image_f32_batch_ny(const struct clip_image_f32_batch * batch, int idx); // equivalent to batch[idx]->ny +struct clip_image_f32 * clip_image_f32_get_img(const struct clip_image_f32_batch * batch, int idx); // equivalent to batch[idx]->data + +/** + * Build image from pixels decoded by other libraries instead of stb_image.h for better performance. 
+ * The memory layout is RGBRGBRGB..., input buffer length must be 3*nx*ny bytes + */ +void clip_build_img_from_pixels(const unsigned char * rgb_pixels, int nx, int ny, struct clip_image_u8 * img); + +/** preprocess img and store the result in res_imgs, pad_to_square may be overridden to false depending on model configuration */ +bool clip_image_preprocess(struct clip_ctx * ctx, const struct clip_image_u8 * img, struct clip_image_f32_batch * res_imgs ); + +struct ggml_tensor * clip_get_newline_tensor(const struct clip_ctx * ctx); + +bool clip_image_encode (struct clip_ctx * ctx, int n_threads, struct clip_image_f32 * img, float * vec); +bool clip_image_batch_encode(struct clip_ctx * ctx, int n_threads, const struct clip_image_f32_batch * imgs, float * vec); + +int clip_is_minicpmv(const struct clip_ctx * ctx); +bool clip_is_glm(const struct clip_ctx * ctx); +bool clip_is_llava(const struct clip_ctx * ctx); +// note for contributor: this clip_is_(model) pattern is deprecated +// do NOT add new functions like this + +bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec); + +// use by audio input +void clip_image_f32_batch_add_mel(struct clip_image_f32_batch * batch, int n_mel, int n_frames, float * mel); + +bool clip_has_vision_encoder(const struct clip_ctx * ctx); +bool clip_has_audio_encoder(const struct clip_ctx * ctx); +bool clip_has_whisper_encoder(const struct clip_ctx * ctx); diff --git a/llama.cpp/tools/mtmd/deprecation-warning.cpp b/llama.cpp/tools/mtmd/deprecation-warning.cpp new file mode 100644 index 0000000..dded0a5 --- /dev/null +++ b/llama.cpp/tools/mtmd/deprecation-warning.cpp @@ -0,0 +1,22 @@ +#include <cstdio> +#include <string> + +int main(int argc, char** argv) { + std::string filename = "main"; + if (argc >= 1) { + filename = argv[0]; + } + + // Get only the program name from the full path + size_t pos = filename.find_last_of("/\\"); + if (pos != std::string::npos) { + filename = 
filename.substr(pos+1); + } + + fprintf(stdout, "\n"); + fprintf(stdout, "WARNING: The binary '%s' is deprecated.\n", filename.c_str()); + fprintf(stdout, "Please use 'llama-mtmd-cli' instead.\n"); + fprintf(stdout, "\n"); + + return EXIT_FAILURE; +} diff --git a/llama.cpp/tools/mtmd/legacy-models/convert_image_encoder_to_gguf.py b/llama.cpp/tools/mtmd/legacy-models/convert_image_encoder_to_gguf.py new file mode 100644 index 0000000..2949fae --- /dev/null +++ b/llama.cpp/tools/mtmd/legacy-models/convert_image_encoder_to_gguf.py @@ -0,0 +1,412 @@ +import argparse +import os +import json +import re + +import torch +import numpy as np +from gguf import * +from transformers import CLIPModel, CLIPProcessor, CLIPVisionModel, SiglipVisionModel + +TEXT = "clip.text" +VISION = "clip.vision" + + +def k(raw_key: str, arch: str) -> str: + return raw_key.format(arch=arch) + + +def should_skip_tensor(name: str, has_text: bool, has_vision: bool, has_llava: bool) -> bool: + if name in ( + "logit_scale", + "text_model.embeddings.position_ids", + "vision_model.embeddings.position_ids", + ): + return True + + if has_llava and name in ["visual_projection.weight", "vision_model.post_layernorm.weight", "vision_model.post_layernorm.bias"]: + return True + + if name.startswith("v") and not has_vision: + return True + + if name.startswith("t") and not has_text: + return True + + return False + + +def get_tensor_name(name: str) -> str: + # Standardize the transformers llava next keys for + # image newline / mm projector with the classes in haotian-liu LLaVA + if name == "image_newline": + return "model.image_newline" + if name.startswith("multi_modal_projector"): + name = name.replace("multi_modal_projector", "mm") + if "linear_1" in name: + name = name.replace("linear_1", "0") + if "linear_2" in name: + name = name.replace("linear_2", "2") + return name + + if "projection" in name: + return name + if "mm_projector" in name: + name = name.replace("model.mm_projector", "mm") + name = 
re.sub(r'mm\.mlp\.mlp', 'mm.model.mlp', name, count=1) + name = re.sub(r'mm\.peg\.peg', 'mm.model.peg', name, count=1) + return name + + return name.replace("text_model", "t").replace("vision_model", "v").replace("encoder.layers", "blk").replace("embeddings.", "").replace("_proj", "").replace("self_attn.", "attn_").replace("layer_norm", "ln").replace("layernorm", "ln").replace("mlp.fc1", "ffn_down").replace("mlp.fc2", "ffn_up").replace("embedding", "embd").replace("final", "post").replace("layrnorm", "ln") + + +def bytes_to_unicode(): + """ + Returns list of utf-8 byte and a corresponding list of unicode strings. + The reversible bpe codes work on unicode strings. + This means you need a large # of unicode characters in your vocab if you want to avoid UNKs. + When you're at something like a 10B token dataset you end up needing around 5K for decent coverage. + This is a significant percentage of your normal, say, 32K bpe vocab. + To avoid that, we want lookup tables between utf-8 bytes and unicode strings. + And avoids mapping to whitespace/control characters the bpe code barfs on. + """ + bs = ( + list(range(ord("!"), ord("~") + 1)) + + list(range(ord("¡"), ord("¬") + 1)) + + list(range(ord("®"), ord("ÿ") + 1)) + ) + cs = bs[:] + n = 0 + for b in range(2**8): + if b not in bs: + bs.append(b) + cs.append(2**8 + n) + n += 1 + cs = [chr(n) for n in cs] + return dict(zip(bs, cs)) + + +ap = argparse.ArgumentParser() +ap.add_argument("-m", "--model-dir", help="Path to model directory cloned from HF Hub", required=True) +ap.add_argument("--use-f32", action="store_true", default=False, help="Use f32 instead of f16") +ap.add_argument('--bigendian', action="store_true", default=False, help="Model is executed on big-endian machine") +ap.add_argument("--text-only", action="store_true", required=False, + help="Save a text-only model. It can't be used to encode images") +ap.add_argument("--vision-only", action="store_true", required=False, + help="Save a vision-only model. 
It can't be used to encode texts") +ap.add_argument("--clip-model-is-vision", action="store_true", required=False, + help="The clip model is a pure vision model (ShareGPT4V vision extract for example)") + +# Selectable visual encoders that are compatible with this script +encoder_group = ap.add_mutually_exclusive_group() +encoder_group.add_argument("--clip-model-is-openclip", action="store_true", required=False, + help="The clip model is from openclip (for ViT-SO400M type))") +encoder_group.add_argument("--clip-model-is-siglip", action="store_true", required=False, + help="the visual encoder is Siglip.") + +ap.add_argument("--llava-projector", help="Path to llava.projector file. If specified, save an image encoder for LLaVA models.") +ap.add_argument("--projector-type", help="Type of projector. Possible values: mlp, ldp, ldpv2", choices=["mlp", "ldp", "ldpv2"], default="mlp") +ap.add_argument("-o", "--output-dir", help="Directory to save GGUF files. Default is the original model directory", default=None) +# Example --image_mean 0.48145466 0.4578275 0.40821073 --image_std 0.26862954 0.26130258 0.27577711 +# Example --image_mean 0.5 0.5 0.5 --image_std 0.5 0.5 0.5 +default_image_mean = [0.48145466, 0.4578275, 0.40821073] +default_image_std = [0.26862954, 0.26130258, 0.27577711] +ap.add_argument('--image-mean', type=float, nargs='+', help='Mean of the images for normalization (overrides processor) ', default=None) +ap.add_argument('--image-std', type=float, nargs='+', help='Standard deviation of the images for normalization (overrides processor)', default=None) + +# with proper +args = ap.parse_args() + + +if args.text_only and args.vision_only: + print("--text-only and --image-only arguments cannot be specified at the same time.") + exit(1) + +if args.use_f32: + print("WARNING: Weights for the convolution op is always saved in f16, as the convolution op in GGML does not support 32-bit kernel weights yet.") + +# output in the same directory as the model if output_dir 
is None +dir_model = args.model_dir + +if ( + args.clip_model_is_vision or + not os.path.exists(dir_model + "/vocab.json") or + args.clip_model_is_openclip or + args.clip_model_is_siglip +): + vocab = None + tokens = None +else: + with open(dir_model + "/vocab.json", "r", encoding="utf-8") as f: + vocab = json.load(f) + tokens = [key for key in vocab] + +with open(dir_model + "/config.json", "r", encoding="utf-8") as f: + config = json.load(f) + if args.clip_model_is_vision: + v_hparams = config + t_hparams = None + else: + v_hparams = config["vision_config"] + t_hparams = config["text_config"] + +# possible data types +# ftype == 0 -> float32 +# ftype == 1 -> float16 +# +# map from ftype to string +ftype_str = ["f32", "f16"] + +ftype = 1 +if args.use_f32: + ftype = 0 + +if args.clip_model_is_siglip: + model = SiglipVisionModel.from_pretrained(dir_model) + processor = None +elif args.clip_model_is_vision or args.clip_model_is_openclip: + model = CLIPVisionModel.from_pretrained(dir_model) + processor = None +else: + model = CLIPModel.from_pretrained(dir_model) + processor = CLIPProcessor.from_pretrained(dir_model) + +fname_middle = None +has_text_encoder = True +has_vision_encoder = True +has_llava_projector = False +if args.text_only: + fname_middle = "text-" + has_vision_encoder = False +elif args.llava_projector is not None: + fname_middle = "mmproj-" + has_text_encoder = False + has_llava_projector = True +elif args.vision_only: + fname_middle = "vision-" + has_text_encoder = False +else: + fname_middle = "" + +output_dir = args.output_dir if args.output_dir is not None else dir_model +os.makedirs(output_dir, exist_ok=True) +output_prefix = os.path.basename(output_dir).replace("ggml_", "") +fname_out = os.path.join(output_dir, f"{fname_middle}model-{ftype_str[ftype]}.gguf") +fout = GGUFWriter(path=fname_out, arch="clip", endianess=GGUFEndian.LITTLE if not args.bigendian else GGUFEndian.BIG) + +fout.add_bool("clip.has_text_encoder", has_text_encoder) 
+fout.add_bool("clip.has_vision_encoder", has_vision_encoder) +fout.add_bool("clip.has_llava_projector", has_llava_projector) +fout.add_file_type(ftype) +model_name = config["_name_or_path"] if "_name_or_path" in config else os.path.basename(dir_model) +fout.add_name(model_name) +if args.text_only: + fout.add_description("text-only CLIP model") +elif args.vision_only and not has_llava_projector: + fout.add_description("vision-only CLIP model") +elif has_llava_projector: + fout.add_description("image encoder for LLaVA") + # add projector type + fout.add_string("clip.projector_type", args.projector_type) +else: + fout.add_description("two-tower CLIP model") + +if has_text_encoder: + assert t_hparams is not None + assert tokens is not None + if args.clip_model_is_siglip: + text_projection_dim = 0 + else: + text_projection_dim = t_hparams.get("projection_dim", config["projection_dim"]) + # text_model hparams + fout.add_uint32(k(KEY_CONTEXT_LENGTH, TEXT), t_hparams["max_position_embeddings"]) + fout.add_uint32(k(KEY_EMBEDDING_LENGTH, TEXT), t_hparams["hidden_size"]) + fout.add_uint32(k(KEY_FEED_FORWARD_LENGTH, TEXT), t_hparams["intermediate_size"]) + fout.add_uint32("clip.text.projection_dim", text_projection_dim) + fout.add_uint32(k(KEY_ATTENTION_HEAD_COUNT, TEXT), t_hparams["num_attention_heads"]) + fout.add_float32(k(KEY_ATTENTION_LAYERNORM_EPS, TEXT), t_hparams["layer_norm_eps"]) + fout.add_uint32(k(KEY_BLOCK_COUNT, TEXT), t_hparams["num_hidden_layers"]) + fout.add_token_list(tokens) + + + +def get_non_negative_vision_feature_layers(v_hparams): + """ + Determine the vision feature layer(s) for the llava model, which are indices into the + hidden states of the visual encoder. Note that the hidden states array generally takes the + form: + + [<emb input>, <output of enc block 0>, ... <output of enc block num_hidden_layers>] + + so feature indices should be offset as n+1 to get the output of encoder block n. 
+ We convert all vision feature layers to non-negative so that -1 can be used in + the model as an unset value. If no vision feature layer is found, we leave it unset. + """ + num_hidden_layers = v_hparams["num_hidden_layers"] + to_non_negative = lambda layer_idx: layer_idx if layer_idx >= 0 else num_hidden_layers + layer_idx + 1 + feature_layers_key = None + # Key used for llava models in transformers + if "vision_feature_layer" in config: + feature_layers_key = "vision_feature_layer" + # Key used for llava models in the original format + elif "mm_vision_select_layer" in config: + feature_layers_key = "mm_vision_select_layer" + if feature_layers_key is not None: + feature_layers = config[feature_layers_key] + if isinstance(feature_layers, int): + feature_layers = [feature_layers] + return [to_non_negative(feature_layer) for feature_layer in feature_layers] + +# Determine if we have explicitly specified vision feature layers in our config +feature_layers = get_non_negative_vision_feature_layers(v_hparams) + +if has_vision_encoder: + # Siglip does not have a visual projector; set projection dim to 0 + if args.clip_model_is_siglip: + visual_projection_dim = 0 + else: + visual_projection_dim = v_hparams.get("projection_dim", config["projection_dim"]) + + # set vision_model hparams + fout.add_uint32("clip.vision.image_size", v_hparams["image_size"]) + fout.add_uint32("clip.vision.patch_size", v_hparams["patch_size"]) + fout.add_uint32(k(KEY_EMBEDDING_LENGTH, VISION), v_hparams["hidden_size"]) + fout.add_uint32(k(KEY_FEED_FORWARD_LENGTH, VISION), v_hparams["intermediate_size"]) + fout.add_uint32("clip.vision.projection_dim", visual_projection_dim) + fout.add_uint32(k(KEY_ATTENTION_HEAD_COUNT, VISION), v_hparams["num_attention_heads"]) + fout.add_float32(k(KEY_ATTENTION_LAYERNORM_EPS, VISION), v_hparams["layer_norm_eps"]) + if feature_layers: + block_count = max(feature_layers) + else: + block_count = v_hparams["num_hidden_layers"] - 1 if has_llava_projector else 
v_hparams["num_hidden_layers"] + fout.add_uint32(k(KEY_BLOCK_COUNT, VISION), block_count) + # /** + # "image_grid_pinpoints": [ + # [ + # 336, + # 672 + # ], + # [ + # 672, + # 336 + # ], + # [ + # 672, + # 672 + # ], + # [ + # 1008, + # 336 + # ], + # [ + # 336, + # 1008 + # ] + # ], + # Flattened: + # [ + # 336, 672, + # 672, 336, + # 672, 672, + # 1008, 336, + # 336, 1008 + # ] + # * + # */ + if "image_grid_pinpoints" in v_hparams: + # flatten it + image_grid_pinpoints = [] + for pinpoint in v_hparams["image_grid_pinpoints"]: + for p in pinpoint: + image_grid_pinpoints.append(p) + fout.add_array("clip.vision.image_grid_pinpoints", image_grid_pinpoints) + if "image_crop_resolution" in v_hparams: + fout.add_uint32("clip.vision.image_crop_resolution", v_hparams["image_crop_resolution"]) + if "image_aspect_ratio" in v_hparams: + fout.add_string("clip.vision.image_aspect_ratio", v_hparams["image_aspect_ratio"]) + if "image_split_resolution" in v_hparams: + fout.add_uint32("clip.vision.image_split_resolution", v_hparams["image_split_resolution"]) + if "mm_patch_merge_type" in v_hparams: + fout.add_string("clip.vision.mm_patch_merge_type", v_hparams["mm_patch_merge_type"]) + if "mm_projector_type" in v_hparams: + fout.add_string("clip.vision.mm_projector_type", v_hparams["mm_projector_type"]) + if feature_layers: + fout.add_array("clip.vision.feature_layer", feature_layers) + + if processor is not None: + image_mean = processor.image_processor.image_mean if args.image_mean is None or args.image_mean == default_image_mean else args.image_mean # pyright: ignore[reportAttributeAccessIssue] + image_std = processor.image_processor.image_std if args.image_std is None or args.image_std == default_image_std else args.image_std # pyright: ignore[reportAttributeAccessIssue] + else: + image_mean = args.image_mean if args.image_mean is not None else default_image_mean + image_std = args.image_std if args.image_std is not None else default_image_std + 
fout.add_array("clip.vision.image_mean", image_mean) + fout.add_array("clip.vision.image_std", image_std) + +use_gelu = v_hparams["hidden_act"] == "gelu" +fout.add_bool("clip.use_gelu", use_gelu) + + +if has_llava_projector: + # By default, we drop the last layer for llava projector + # models unless we have explicitly set vision feature layers + if feature_layers is None: + model.vision_model.encoder.layers.pop(-1) + else: + model.vision_model.encoder.layers = model.vision_model.encoder.layers[:max(feature_layers)] + + projector = torch.load(args.llava_projector) + for name, data in projector.items(): + name = get_tensor_name(name) + # pw and dw conv ndim==4 + if data.ndim == 2 or data.ndim == 4: + data = data.squeeze().numpy().astype(np.float16) + else: + data = data.squeeze().numpy().astype(np.float32) + + fout.add_tensor(name, data) + + print("Projector tensors added\n") + +state_dict = model.state_dict() +for name, data in state_dict.items(): + if should_skip_tensor(name, has_text_encoder, has_vision_encoder, has_llava_projector): + # we don't need this + print(f"skipping parameter: {name}") + continue + + name = get_tensor_name(name) + data = data.squeeze().numpy() + + n_dims = len(data.shape) + + # ftype == 0 -> float32, ftype == 1 -> float16 + ftype_cur = 0 + if n_dims == 4: + print(f"tensor {name} is always saved in f16") + data = data.astype(np.float16) + ftype_cur = 1 + elif ftype == 1: + if name[-7:] == ".weight" and n_dims == 2: + print(" Converting to float16") + data = data.astype(np.float16) + ftype_cur = 1 + else: + print(" Converting to float32") + data = data.astype(np.float32) + ftype_cur = 0 + else: + if data.dtype != np.float32: + print(" Converting to float32") + data = data.astype(np.float32) + ftype_cur = 0 + + print(f"{name} - {ftype_str[ftype_cur]} - shape = {data.shape}") + fout.add_tensor(name, data) + + +fout.write_header_to_file() +fout.write_kv_data_to_file() +fout.write_tensors_to_file() +fout.close() + +print("Done. 
Output file: " + fname_out) diff --git a/llama.cpp/tools/mtmd/legacy-models/glmedge-convert-image-encoder-to-gguf.py b/llama.cpp/tools/mtmd/legacy-models/glmedge-convert-image-encoder-to-gguf.py new file mode 100644 index 0000000..848ef1c --- /dev/null +++ b/llama.cpp/tools/mtmd/legacy-models/glmedge-convert-image-encoder-to-gguf.py @@ -0,0 +1,280 @@ +import argparse +import os +import json +import re + +import torch +import numpy as np +from gguf import * + +TEXT = "clip.text" +VISION = "clip.vision" +from transformers import SiglipVisionModel, SiglipVisionConfig + +def k(raw_key: str, arch: str) -> str: + return raw_key.format(arch=arch) + + +def should_skip_tensor(name: str, has_text: bool, has_vision: bool, has_llava: bool) -> bool: + if name in ( + "logit_scale", + "text_model.embeddings.position_ids", + "vision_model.embeddings.position_ids", + ): + return True + + if name in ( + "vision_model.head.probe", + "vision_model.head.attention.in_proj_weight", + "vision_model.head.attention.in_proj_bias", + "vision_model.head.attention.out_proj.weight", + "vision_model.head.attention.out_proj.bias", + "vision_model.head.layernorm.weight", + "vision_model.head.layernorm.bias", + "vision_model.head.mlp.fc1.weight", + "vision_model.head.mlp.fc1.bias", + "vision_model.head.mlp.fc2.weight", + "vision_model.head.mlp.fc2.bias" + ): + return True + + if name.startswith("v") and not has_vision: + return True + + if name.startswith("t") and not has_text: + return True + + return False + + +def get_tensor_name(name: str) -> str: + if "projection" in name: + return name + if "mm_projector" in name: + name = name.replace("model.mm_projector", "mm") + name = re.sub(r'mm\.mlp\.mlp', 'mm.model.mlp', name, count=1) + name = re.sub(r'mm\.peg\.peg', 'mm.model.peg', name, count=1) + return name + + return name.replace("text_model", "t").replace("vision_model", "v").replace("encoder.layers", "blk").replace("embeddings.", "").replace("_proj", "").replace("self_attn.", 
"attn_").replace("layer_norm", "ln").replace("layernorm", "ln").replace("mlp.fc1", "ffn_down").replace("mlp.fc2", "ffn_up").replace("embedding", "embd").replace("final", "post").replace("layrnorm", "ln") + + +def bytes_to_unicode(): + """ + Returns list of utf-8 byte and a corresponding list of unicode strings. + The reversible bpe codes work on unicode strings. + This means you need a large # of unicode characters in your vocab if you want to avoid UNKs. + When you're at something like a 10B token dataset you end up needing around 5K for decent coverage. + This is a significant percentage of your normal, say, 32K bpe vocab. + To avoid that, we want lookup tables between utf-8 bytes and unicode strings. + And avoids mapping to whitespace/control characters the bpe code barfs on. + """ + bs = ( + list(range(ord("!"), ord("~") + 1)) + + list(range(ord("¡"), ord("¬") + 1)) + + list(range(ord("®"), ord("ÿ") + 1)) + ) + cs = bs[:] + n = 0 + for b in range(2**8): + if b not in bs: + bs.append(b) + cs.append(2**8 + n) + n += 1 + cs = [chr(n) for n in cs] + return dict(zip(bs, cs)) + + +ap = argparse.ArgumentParser() +ap.add_argument("-m", "--model-dir", help="Path to model directory cloned from HF Hub", required=True) +ap.add_argument("--use-f32", action="store_true", default=False, help="Use f32 instead of f16") +ap.add_argument("--text-only", action="store_true", required=False, + help="Save a text-only model. It can't be used to encode images") +ap.add_argument("--vision-only", action="store_true", required=False, + help="Save a vision-only model. It can't be used to encode texts") +ap.add_argument("--clip-model-is-vision", action="store_true", required=False, + help="The clip model is a pure vision model (ShareGPT4V vision extract for example)") +ap.add_argument("--clip-model-is-openclip", action="store_true", required=False, + help="The clip model is from openclip (for ViT-SO400M type))") +ap.add_argument("--llava-projector", help="Path to llava.projector file. 
If specified, save an image encoder for LLaVA models.") +ap.add_argument("--projector-type", help="Type of projector. Possible values: mlp, ldp, ldpv2", choices=["mlp", "ldp", "ldpv2","adapter"], default="adapter") +ap.add_argument("-o", "--output-dir", help="Directory to save GGUF files. Default is the original model directory", default=None) +# Example --image_mean 0.48145466 0.4578275 0.40821073 --image_std 0.26862954 0.26130258 0.27577711 +# Example --image_mean 0.5 0.5 0.5 --image_std 0.5 0.5 0.5 +default_image_mean = [0.5, 0.5, 0.5] +default_image_std = [0.5, 0.5, 0.5] +ap.add_argument('--image-mean', type=float, nargs='+', help='Mean of the images for normalization (overrides processor) ', default=None) +ap.add_argument('--image-std', type=float, nargs='+', help='Standard deviation of the images for normalization (overrides processor)', default=None) + +# with proper +args = ap.parse_args() + + +if args.text_only and args.vision_only: + print("--text-only and --image-only arguments cannot be specified at the same time.") + exit(1) + +if args.use_f32: + print("WARNING: Weights for the convolution op is always saved in f16, as the convolution op in GGML does not support 32-bit kernel weights yet.") + +# output in the same directory as the model if output_dir is None +dir_model = args.model_dir + +if args.clip_model_is_vision or not os.path.exists(dir_model + "/vocab.json") or args.clip_model_is_openclip: + vocab = None + tokens = None +else: + with open(dir_model + "/vocab.json", "r", encoding="utf-8") as f: + vocab = json.load(f) + tokens = [key for key in vocab] + +with open(dir_model + "/config.json", "r", encoding="utf-8") as f: + config = json.load(f) + if args.clip_model_is_vision: + v_hparams = config + t_hparams = None + else: + v_hparams = config["vision_config"] + t_hparams = None + +# possible data types +# ftype == 0 -> float32 +# ftype == 1 -> float16 +# +# map from ftype to string +ftype_str = ["f32", "f16"] + +ftype = 1 +if args.use_f32: + ftype 
= 0 + +vision_config = SiglipVisionConfig(**v_hparams) +model = SiglipVisionModel(vision_config) +model.load_state_dict(torch.load(os.path.join(dir_model, "glm.clip"))) + +fname_middle = None +has_text_encoder = False +has_vision_encoder = True +has_glm_projector = True +if args.text_only: + fname_middle = "text-" + has_vision_encoder = False +elif args.llava_projector is not None: + fname_middle = "mmproj-" + has_text_encoder = False + has_glm_projector = True +elif args.vision_only: + fname_middle = "vision-" + has_text_encoder = False +else: + fname_middle = "" + +output_dir = args.output_dir if args.output_dir is not None else dir_model +os.makedirs(output_dir, exist_ok=True) +output_prefix = os.path.basename(output_dir).replace("ggml_", "") +fname_out = os.path.join(output_dir, f"{fname_middle}model-{ftype_str[ftype]}.gguf") +fout = GGUFWriter(path=fname_out, arch="clip") + +fout.add_bool("clip.has_text_encoder", has_text_encoder) +fout.add_bool("clip.has_vision_encoder", has_vision_encoder) +fout.add_bool("clip.has_glm_projector", has_glm_projector) +fout.add_file_type(ftype) +model_name = config["_name_or_path"] if "_name_or_path" in config else os.path.basename(dir_model) +fout.add_name(model_name) +if has_glm_projector: + fout.add_description("image encoder for glm4v") + fout.add_string("clip.projector_type", "adapter") +else: + fout.add_description("two-tower CLIP model") + +if has_text_encoder: + assert t_hparams is not None + assert tokens is not None + # text_model hparams + fout.add_uint32(k(KEY_CONTEXT_LENGTH, TEXT), t_hparams["max_position_embeddings"]) + fout.add_uint32(k(KEY_EMBEDDING_LENGTH, TEXT), t_hparams["hidden_size"]) + fout.add_uint32(k(KEY_FEED_FORWARD_LENGTH, TEXT), t_hparams["intermediate_size"]) + fout.add_uint32("clip.text.projection_dim", t_hparams.get("projection_dim", config["projection_dim"])) + fout.add_uint32(k(KEY_ATTENTION_HEAD_COUNT, TEXT), t_hparams["num_attention_heads"]) + fout.add_float32(k(KEY_ATTENTION_LAYERNORM_EPS, 
TEXT), t_hparams["layer_norm_eps"]) + fout.add_uint32(k(KEY_BLOCK_COUNT, TEXT), t_hparams["num_hidden_layers"]) + fout.add_token_list(tokens) + +if has_vision_encoder: + # vision_model hparams + fout.add_uint32("clip.vision.image_size", v_hparams["image_size"]) + fout.add_uint32("clip.vision.patch_size", v_hparams["patch_size"]) + fout.add_uint32(k(KEY_EMBEDDING_LENGTH, VISION), v_hparams["hidden_size"]) + fout.add_uint32(k(KEY_FEED_FORWARD_LENGTH, VISION), v_hparams["intermediate_size"]) + fout.add_uint32("clip.vision.projection_dim", 0) + fout.add_uint32(k(KEY_ATTENTION_HEAD_COUNT, VISION), v_hparams["num_attention_heads"]) + fout.add_float32(k(KEY_ATTENTION_LAYERNORM_EPS, VISION), 1e-6) + fout.add_uint32(k(KEY_BLOCK_COUNT, VISION), v_hparams["num_hidden_layers"]) + + image_mean = args.image_mean if args.image_mean is not None else default_image_mean + image_std = args.image_std if args.image_std is not None else default_image_std + fout.add_array("clip.vision.image_mean", image_mean) + fout.add_array("clip.vision.image_std", image_std) + +fout.add_bool("clip.use_gelu", True) + + +if has_glm_projector: + # model.vision_model.encoder.layers.pop(-1) # pyright: ignore[reportAttributeAccessIssue] + projector = torch.load(args.llava_projector) + for name, data in projector.items(): + name = get_tensor_name(name) + # pw and dw conv ndim==4 + if data.ndim == 2 or data.ndim == 4: + data = data.squeeze().numpy().astype(np.float16) + else: + data = data.squeeze().numpy().astype(np.float32) + if name.startswith("vision."): + name=name.replace("vision.","") + fout.add_tensor(name, data) + print(f"Projector {name} - {data.dtype} - shape = {data.shape}") + # print(f"Projector {name} tensors added\n") + +state_dict = model.state_dict() # pyright: ignore[reportAttributeAccessIssue] +for name, data in state_dict.items(): + if should_skip_tensor(name, has_text_encoder, has_vision_encoder, has_glm_projector): + # we don't need this + print(f"skipping parameter: {name}") + continue 
+ + name = get_tensor_name(name) + data = data.squeeze().numpy() + + n_dims = len(data.shape) + + # ftype == 0 -> float32, ftype == 1 -> float16 + ftype_cur = 0 + if n_dims == 4: + print(f"tensor {name} is always saved in f16") + data = data.astype(np.float16) + ftype_cur = 1 + elif ftype == 1: + if name[-7:] == ".weight" and n_dims == 2: + # print(" Converting to float16") + data = data.astype(np.float16) + ftype_cur = 1 + else: + # print(" Converting to float32") + data = data.astype(np.float32) + ftype_cur = 0 + else: + if data.dtype != np.float32: + # print(" Converting to float32") + data = data.astype(np.float32) + ftype_cur = 0 + print(f"siglip {name} - {data.dtype} - shape = {data.shape}") + # print(f"{name} - {ftype_str[ftype_cur]} - shape = {data.shape}") + fout.add_tensor(name, data) + + +fout.write_header_to_file() +fout.write_kv_data_to_file() +fout.write_tensors_to_file() +fout.close() + +print("Done. Output file: " + fname_out) diff --git a/llama.cpp/tools/mtmd/legacy-models/glmedge-surgery.py b/llama.cpp/tools/mtmd/legacy-models/glmedge-surgery.py new file mode 100644 index 0000000..16bb915 --- /dev/null +++ b/llama.cpp/tools/mtmd/legacy-models/glmedge-surgery.py @@ -0,0 +1,33 @@ +import argparse +import os +import torch +from transformers import AutoModel + +ap = argparse.ArgumentParser() +ap.add_argument("-m", "--model", help="Path to GLM model") +args = ap.parse_args() + +# find the model part that includes the the multimodal projector weights +model = AutoModel.from_pretrained(args.model, trust_remote_code=True, local_files_only=True) +checkpoint = model.state_dict() + +# get a list of mm tensor names +mm_tensors = [k for k, v in checkpoint.items() if k.startswith("vision.adapter.")] + +# store these tensors in a new dictionary and torch.save them +projector = {name: checkpoint[name].float() for name in mm_tensors} +torch.save(projector, f"{args.model}/glm.projector") + +clip_tensors = [k for k, v in checkpoint.items() if 
k.startswith("vision.vit.model.vision_model.")] +if len(clip_tensors) > 0: + clip = {name.replace("vision.vit.model.", ""): checkpoint[name].float() for name in clip_tensors} + torch.save(clip, f"{args.model}/glm.clip") + + # added tokens should be removed to be able to convert Mistral models + if os.path.exists(f"{args.model}/added_tokens.json"): + with open(f"{args.model}/added_tokens.json", "w") as f: + f.write("{}\n") + +print("Done!") +print(f"Now you can convert {args.model} to a regular LLaMA GGUF file.") +print(f"Also, use {args.model}glm.projector to prepare a glm-encoder.gguf file.") diff --git a/llama.cpp/tools/mtmd/legacy-models/llava_surgery.py b/llama.cpp/tools/mtmd/legacy-models/llava_surgery.py new file mode 100644 index 0000000..4f2da3b --- /dev/null +++ b/llama.cpp/tools/mtmd/legacy-models/llava_surgery.py @@ -0,0 +1,38 @@ +import argparse +import glob +import os +import torch + + +ap = argparse.ArgumentParser() +ap.add_argument("-m", "--model", help="Path to LLaVA v1.5 model") +args = ap.parse_args() + +# find the model part that includes the the multimodal projector weights +path = sorted(glob.glob(f"{args.model}/pytorch_model*.bin"))[-1] +checkpoint = torch.load(path) + +# get a list of mm tensor names +mm_tensors = [k for k, v in checkpoint.items() if k.startswith("model.mm_projector")] + +# store these tensors in a new dictionary and torch.save them +projector = {name: checkpoint[name].float() for name in mm_tensors} +torch.save(projector, f"{args.model}/llava.projector") + +# BakLLaVA models contain CLIP tensors in it +clip_tensors = [k for k, v in checkpoint.items() if k.startswith("model.vision_tower")] +if len(clip_tensors) > 0: + clip = {name.replace("vision_tower.vision_tower.", ""): checkpoint[name].float() for name in clip_tensors} + torch.save(clip, f"{args.model}/llava.clip") + + + # added tokens should be removed to be able to convert Mistral models + if os.path.exists(f"{args.model}/added_tokens.json"): + with 
open(f"{args.model}/added_tokens.json", "w") as f: + f.write("{}\n") + + + +print("Done!") +print(f"Now you can convert {args.model} to a regular LLaMA GGUF file.") +print(f"Also, use {args.model}/llava.projector to prepare a llava-encoder.gguf file.") diff --git a/llama.cpp/tools/mtmd/legacy-models/llava_surgery_v2.py b/llama.cpp/tools/mtmd/legacy-models/llava_surgery_v2.py new file mode 100644 index 0000000..b07c3e3 --- /dev/null +++ b/llama.cpp/tools/mtmd/legacy-models/llava_surgery_v2.py @@ -0,0 +1,180 @@ +import argparse +import glob +import os +import torch +from safetensors import safe_open +from safetensors.torch import save_file +from typing import Any, ContextManager, cast + +# Function to determine if file is a SafeTensor file +def is_safetensor_file(file_path): + return file_path.endswith('.safetensors') + + +# Unified loading function +def load_model(file_path): + if is_safetensor_file(file_path): + tensors = {} + with cast(ContextManager[Any], safe_open(file_path, framework="pt", device="cpu")) as f: + for key in f.keys(): + tensors[key] = f.get_tensor(key).clone() + # output shape + print(f"{key} : {tensors[key].shape}") + return tensors, 'safetensor' + else: + return torch.load(file_path, map_location=torch.device('cpu')), 'pytorch' + + +# Unified saving function +def save_model(model, file_path, file_type): + if file_type == 'safetensor': + # safe_save(model, file_path) + save_file(model, file_path) + else: + torch.save(model, file_path) + +# Helpers to match weight names from specific components or +# determine if a saved shard contains that component +def is_vision_tower(weight_name): + return ( + weight_name.startswith("model.vision_tower") or + weight_name.startswith("vit.") or + weight_name.startswith("vision_tower") + ) + +def is_newline(weight_name): + return ( + weight_name.startswith("model.image_newline") or + weight_name.startswith("image_newline") + ) + +def is_mm_projector(weight_name): + return ( + 
weight_name.startswith("model.mm_projector") or + weight_name.startswith("vision_proj.") or + weight_name.startswith("multi_modal_projector") + ) + +def newline_criteria(checkpoint): + return any(is_newline(k) for k in checkpoint.keys()) + +def proj_criteria(checkpoint): + return any(is_mm_projector(k) for k in checkpoint.keys()) + +# Adapted function to clean vision tower from checkpoint +def clean_vision_tower_from_checkpoint(checkpoint_path): + checkpoint, file_type = load_model(checkpoint_path) + # file_type = 'pytorch' + model_path = os.path.dirname(checkpoint_path) + print(f"Searching for vision tower tensors in {checkpoint_path}") + clip_tensors = [k for k, v in checkpoint.items() if is_vision_tower(k)] + + if len(clip_tensors) > 0: + print(f"Found {len(clip_tensors)} tensors to extract from {checkpoint_path}") + # Adapted for file type + clip_path = os.path.join(model_path, "llava.clip") + + if os.path.exists(clip_path): + print(f"Loading existing llava.clip from {clip_path}") + existing_clip, _ = load_model(clip_path) + else: + print(f"Creating new llava.clip at {clip_path}") + existing_clip = {} + # Update existing_clip with new tensors, avoid duplicates + for name in clip_tensors: + simple_name = name[name.index('vision_model.'):] if 'vision_model.' 
in name else name + print(f"Adding {simple_name} to llava.clip") + if simple_name not in existing_clip: + existing_clip[simple_name] = checkpoint[name] + + # Save the updated clip tensors back to llava.clip + save_model(existing_clip, clip_path, 'pytorch') + + # Remove the tensors from the original checkpoint + for name in clip_tensors: + del checkpoint[name] + + checkpoint_path = checkpoint_path + return True + return False + +def find_relevant_checkpoints(checkpoint_paths, newline_criteria, projector): + newline_checkpoint_path = None + projector_checkpoint_path = None + + for path in checkpoint_paths: + checkpoint, _ = load_model(path) + if newline_criteria(checkpoint) and newline_checkpoint_path is None: + newline_checkpoint_path = path + if projector(checkpoint): + projector_checkpoint_path = path + + return newline_checkpoint_path, projector_checkpoint_path + + +# Command-line interface setup +ap = argparse.ArgumentParser() +ap.add_argument("-m", "--model", required=True, help="Path to LLaVA v1.5+ model") +ap.add_argument("-C", "--clean-vision-tower", action="store_true", help="Remove any vision tower from the model files") +args = ap.parse_args() + +if args.clean_vision_tower: + # Generalized to handle both PyTorch and SafeTensors models + model_files = sorted(glob.glob(f"{args.model}/*"), key=os.path.getmtime, reverse=True) + # checkpoint_paths = [path for path in model_files if (path.endswith('.bin') and path.startswith('pytorch')) or (path.endswith('.safetensors') and path.startswith('model'))] + checkpoint_paths = [path for path in model_files if (path.endswith('.bin') and 'pytorch' in path.split('/')[-1].split('\\')[-1]) or (path.endswith('.safetensors') and 'model' in path.split('/')[-1].split('\\')[-1])] + for projector_checkpoint_path in checkpoint_paths: + print(f"Cleaning {projector_checkpoint_path}") + if not clean_vision_tower_from_checkpoint(projector_checkpoint_path): + print(f"No vision tower found in {projector_checkpoint_path}") + # we break 
once none is found, so far all models append them at the end + # break + print("Done! All vision tower tensors are removed from the model files and stored in llava.clip file.") + +# Now we look for the projector in the last checkpoint +model_files = sorted(glob.glob(f"{args.model}/*"), key=os.path.getmtime, reverse=True) +checkpoint_paths = [path for path in model_files if (path.endswith('.bin') and 'pytorch' in path.split('/')[-1].split('\\')[-1]) or (path.endswith('.safetensors') and 'model' in path.split('/')[-1].split('\\')[-1])] +# last_checkpoint_path = checkpoint_paths[0] +# first_checkpoint_path = checkpoint_paths[-1] +newline_checkpoint_path, projector_checkpoint_path = find_relevant_checkpoints(checkpoint_paths, newline_criteria, proj_criteria) + +print(f"Taking projector from {projector_checkpoint_path}") +first_mm_tensors = [] +first_checkpoint = None +if newline_checkpoint_path is not None: + print(f"Taking newline from {newline_checkpoint_path}") + first_checkpoint, file_type = load_model(newline_checkpoint_path) + first_mm_tensors = [k for k, v in first_checkpoint.items() if is_newline(k)] + +# Load the checkpoint +mm_tensors = [] +last_checkpoint = None +if projector_checkpoint_path is not None: + last_checkpoint, file_type = load_model(projector_checkpoint_path) + mm_tensors = [k for k, v in last_checkpoint.items() if is_mm_projector(k)] + +if len(mm_tensors) == 0: + if last_checkpoint is not None: + for k, v in last_checkpoint.items(): + print(k) + print(f"Found {len(mm_tensors)} tensors to extract out of {len(last_checkpoint) if last_checkpoint is not None else 0} tensors.") + print("No tensors found. 
Is this a LLaVA model?") + exit() + +print(f"Found {len(mm_tensors)} tensors to extract.") +print(f"Found additional {len(first_mm_tensors)} tensors to extract.") +# projector = {name: checkpoint.[name].float() for name in mm_tensors} +projector = {} +for name in mm_tensors: + assert last_checkpoint is not None + projector[name] = last_checkpoint[name].float() +for name in first_mm_tensors: + assert first_checkpoint is not None + projector[name] = first_checkpoint[name].float() + +if len(projector) > 0: + save_model(projector, f"{args.model}/llava.projector", 'pytorch') + +print("Done!") +print(f"Now you can convert {args.model} to a regular LLaMA GGUF file.") +print(f"Also, use {args.model}/llava.projector to prepare a llava-encoder.gguf file.") diff --git a/llama.cpp/tools/mtmd/legacy-models/minicpmv-convert-image-encoder-to-gguf.py b/llama.cpp/tools/mtmd/legacy-models/minicpmv-convert-image-encoder-to-gguf.py new file mode 100644 index 0000000..944037e --- /dev/null +++ b/llama.cpp/tools/mtmd/legacy-models/minicpmv-convert-image-encoder-to-gguf.py @@ -0,0 +1,892 @@ +# coding=utf-8 +# Copyright 2024 Google AI and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" PyTorch Siglip model. 
""" +# Copied from HuggingFaceM4/siglip-so400m-14-980-flash-attn2-navit and add tgt_sizes + + +import os +import math +import warnings + +import numpy as np +import torch +import torch.nn.functional as F +from torch import nn +from torch.nn.init import _calculate_fan_in_and_fan_out + +from transformers.activations import ACT2FN +from transformers.modeling_utils import PreTrainedModel +from transformers.configuration_utils import PretrainedConfig +from transformers.utils import ( + logging, +) +from transformers.utils import logging + +logger = logging.get_logger(__name__) + +class SiglipVisionConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`SiglipVisionModel`]. It is used to instantiate a + Siglip vision encoder according to the specified arguments, defining the model architecture. Instantiating a + configuration with the defaults will yield a similar configuration to that of the vision encoder of the Siglip + [google/siglip-base-patch16-224](https://huggingface.co/google/siglip-base-patch16-224) architecture. + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + Args: + hidden_size (`int`, *optional*, defaults to 768): + Dimensionality of the encoder layers and the pooler layer. + intermediate_size (`int`, *optional*, defaults to 3072): + Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. + num_hidden_layers (`int`, *optional*, defaults to 12): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 12): + Number of attention heads for each attention layer in the Transformer encoder. + num_channels (`int`, *optional*, defaults to 3): + Number of channels in the input images. + image_size (`int`, *optional*, defaults to 224): + The size (resolution) of each image. 
+ patch_size (`int`, *optional*, defaults to 16): + The size (resolution) of each patch. + hidden_act (`str` or `function`, *optional*, defaults to `"gelu_pytorch_tanh"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"selu"` and `"gelu_new"` ``"quick_gelu"` are supported. + layer_norm_eps (`float`, *optional*, defaults to 1e-06): + The epsilon used by the layer normalization layers. + attention_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. + Example: + ```python + >>> from transformers import SiglipVisionConfig, SiglipVisionModel + >>> # Initializing a SiglipVisionConfig with google/siglip-base-patch16-224 style configuration + >>> configuration = SiglipVisionConfig() + >>> # Initializing a SiglipVisionModel (with random weights) from the google/siglip-base-patch16-224 style configuration + >>> model = SiglipVisionModel(configuration) + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "siglip_vision_model" + + def __init__( + self, + hidden_size=768, + intermediate_size=3072, + num_hidden_layers=12, + num_attention_heads=12, + num_channels=3, + image_size=224, + patch_size=16, + hidden_act="gelu_pytorch_tanh", + layer_norm_eps=1e-6, + attention_dropout=0.0, + **kwargs, + ): + super().__init__(**kwargs) + + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.num_channels = num_channels + self.patch_size = patch_size + self.image_size = image_size + self.attention_dropout = attention_dropout + self.layer_norm_eps = layer_norm_eps + self.hidden_act = hidden_act + +_CHECKPOINT_FOR_DOC = "google/siglip-base-patch16-224" + +SIGLIP_PRETRAINED_MODEL_ARCHIVE_LIST = [ + "google/siglip-base-patch16-224", + # See all SigLIP models at https://huggingface.co/models?filter=siglip 
+] + +# Copied from transformers.models.llama.modeling_llama._get_unpad_data +def _get_unpad_data(attention_mask): + seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32) + indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten() + max_seqlen_in_batch = seqlens_in_batch.max().item() + cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0)) + return ( + indices, + cu_seqlens, + max_seqlen_in_batch, + ) + + +def _trunc_normal_(tensor, mean, std, a, b): + # Cut & paste from PyTorch official master until it's in a few official releases - RW + # Method based on https://people.sc.fsu.edu/~jburkardt/presentations/truncated_normal.pdf + def norm_cdf(x): + # Computes standard normal cumulative distribution function + return (1.0 + math.erf(x / math.sqrt(2.0))) / 2.0 + + if (mean < a - 2 * std) or (mean > b + 2 * std): + warnings.warn( + "mean is more than 2 std from [a, b] in nn.init.trunc_normal_. " + "The distribution of values may be incorrect.", + stacklevel=2, + ) + + # Values are generated by using a truncated uniform distribution and + # then using the inverse CDF for the normal distribution. + # Get upper and lower cdf values + l = norm_cdf((a - mean) / std) + u = norm_cdf((b - mean) / std) + + # Uniformly fill tensor with values from [l, u], then translate to + # [2l-1, 2u-1]. + tensor.uniform_(2 * l - 1, 2 * u - 1) + + # Use inverse cdf transform for normal distribution to get truncated + # standard normal + if tensor.dtype in [torch.float16, torch.bfloat16]: + # The `erfinv_` op is not (yet?) defined in float16+cpu, bfloat16+gpu + og_dtype = tensor.dtype + tensor = tensor.to(torch.float32) + tensor.erfinv_() + tensor = tensor.to(og_dtype) + else: + tensor.erfinv_() + + # Transform to proper mean, std + tensor.mul_(std * math.sqrt(2.0)) + tensor.add_(mean) + + # Clamp to ensure it's in the proper range + if tensor.dtype == torch.float16: + # The `clamp_` op is not (yet?) 
defined in float16+cpu + tensor = tensor.to(torch.float32) + tensor.clamp_(min=a, max=b) + tensor = tensor.to(torch.float16) + else: + tensor.clamp_(min=a, max=b) + + +def trunc_normal_tf_( + tensor: torch.Tensor, mean: float = 0.0, std: float = 1.0, a: float = -2.0, b: float = 2.0 +): + """Fills the input Tensor with values drawn from a truncated + normal distribution. The values are effectively drawn from the + normal distribution :math:`\\mathcal{N}(\text{mean}, \text{std}^2)` + with values outside :math:`[a, b]` redrawn until they are within + the bounds. The method used for generating the random values works + best when :math:`a \\leq \text{mean} \\leq b`. + NOTE: this 'tf' variant behaves closer to Tensorflow / JAX impl where the + bounds [a, b] are applied when sampling the normal distribution with mean=0, std=1.0 + and the result is subsquently scaled and shifted by the mean and std args. + Args: + tensor: an n-dimensional `torch.Tensor` + mean: the mean of the normal distribution + std: the standard deviation of the normal distribution + a: the minimum cutoff value + b: the maximum cutoff value + """ + with torch.no_grad(): + _trunc_normal_(tensor, 0, 1.0, a, b) + tensor.mul_(std).add_(mean) + + +def variance_scaling_(tensor, scale=1.0, mode="fan_in", distribution="normal"): + fan_in, fan_out = _calculate_fan_in_and_fan_out(tensor) + denom = fan_in + if mode == "fan_in": + denom = fan_in + elif mode == "fan_out": + denom = fan_out + elif mode == "fan_avg": + denom = (fan_in + fan_out) / 2 + + variance = scale / denom + + if distribution == "truncated_normal": + # constant is stddev of standard normal truncated to (-2, 2) + trunc_normal_tf_(tensor, std=math.sqrt(variance) / 0.87962566103423978) + elif distribution == "normal": + with torch.no_grad(): + tensor.normal_(std=math.sqrt(variance)) + elif distribution == "uniform": + bound = math.sqrt(3 * variance) + with torch.no_grad(): + tensor.uniform_(-bound, bound) + else: + raise ValueError(f"invalid 
distribution {distribution}") + + +def lecun_normal_(tensor): + variance_scaling_(tensor, mode="fan_in", distribution="truncated_normal") + + +def default_flax_embed_init(tensor): + variance_scaling_(tensor, mode="fan_in", distribution="normal") + +class SiglipVisionEmbeddings(nn.Module): + def __init__(self, config: SiglipVisionConfig): + super().__init__() + self.config = config + self.embed_dim = config.hidden_size + self.image_size = config.image_size + self.patch_size = config.patch_size + + self.patch_embedding = nn.Conv2d( + in_channels=config.num_channels, + out_channels=self.embed_dim, + kernel_size=self.patch_size, + stride=self.patch_size, + padding="valid", + ) + + self.num_patches_per_side = self.image_size // self.patch_size + self.num_patches = self.num_patches_per_side**2 + self.num_positions = self.num_patches + self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim) + +class SiglipAttention(nn.Module): + """Multi-headed attention from 'Attention Is All You Need' paper""" + + # Copied from transformers.models.clip.modeling_clip.CLIPAttention.__init__ + def __init__(self, config): + super().__init__() + self.config = config + self.embed_dim = config.hidden_size + self.num_heads = config.num_attention_heads + self.head_dim = self.embed_dim // self.num_heads + if self.head_dim * self.num_heads != self.embed_dim: + raise ValueError( + f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:" + f" {self.num_heads})." 
+ ) + self.scale = self.head_dim**-0.5 + self.dropout = config.attention_dropout + + self.k_proj = nn.Linear(self.embed_dim, self.embed_dim) + self.v_proj = nn.Linear(self.embed_dim, self.embed_dim) + self.q_proj = nn.Linear(self.embed_dim, self.embed_dim) + self.out_proj = nn.Linear(self.embed_dim, self.embed_dim) + +# Copied from transformers.models.clip.modeling_clip.CLIPMLP with CLIP->Siglip +class SiglipMLP(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.activation_fn = ACT2FN[config.hidden_act] + self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size) + self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size) + + +# Copied from transformers.models.clip.modeling_clip.CLIPEncoderLayer with CLIP->Siglip +class SiglipEncoderLayer(nn.Module): + def __init__(self, config: SiglipVisionConfig): + super().__init__() + self.embed_dim = config.hidden_size + self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2" + self.self_attn = ( + SiglipAttention(config) + ) + self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps) + self.mlp = SiglipMLP(config) + self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps) + +class SiglipPreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. 
+ """ + + config_class = SiglipVisionConfig + base_model_prefix = "siglip" + supports_gradient_checkpointing = True + + def _init_weights(self, module): + """Initialize the weights""" + + if isinstance(module, SiglipVisionEmbeddings): + width = self.config.hidden_size + nn.init.normal_(module.position_embedding.weight, std=1 / np.sqrt(width)) + elif isinstance(module, nn.Embedding): + default_flax_embed_init(module.weight) + elif isinstance(module, SiglipAttention): + nn.init.normal_(module.q_proj.weight) + nn.init.normal_(module.k_proj.weight) + nn.init.normal_(module.v_proj.weight) + nn.init.normal_(module.out_proj.weight) + nn.init.zeros_(module.q_proj.bias) + nn.init.zeros_(module.k_proj.bias) + nn.init.zeros_(module.v_proj.bias) + nn.init.zeros_(module.out_proj.bias) + elif isinstance(module, SiglipMLP): + nn.init.normal_(module.fc1.weight) + nn.init.normal_(module.fc2.weight) + nn.init.normal_(module.fc1.bias, std=1e-6) + nn.init.normal_(module.fc2.bias, std=1e-6) + elif isinstance(module, (nn.Linear, nn.Conv2d)): + lecun_normal_(module.weight) + if module.bias is not None: + nn.init.zeros_(module.bias) + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + + +SIGLIP_START_DOCSTRING = r""" + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. + Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage + and behavior. + Parameters: + config ([`SiglipVisionConfig`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. 
Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + + +SIGLIP_VISION_INPUTS_DOCSTRING = r""" + Args: + pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): + Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using + [`AutoImageProcessor`]. See [`CLIPImageProcessor.__call__`] for details. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + + +# Copied from transformers.models.clip.modeling_clip.CLIPEncoder with CLIP->Siglip +class SiglipEncoder(nn.Module): + """ + Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a + [`SiglipEncoderLayer`]. 
+ Args: + config: SiglipConfig + """ + + def __init__(self, config: SiglipVisionConfig): + super().__init__() + self.config = config + self.layers = nn.ModuleList([SiglipEncoderLayer(config) for _ in range(config.num_hidden_layers)]) + self.gradient_checkpointing = False + +class SiglipVisionTransformer(SiglipPreTrainedModel): + config_class = SiglipVisionConfig + main_input_name = "pixel_values" + _supports_flash_attn_2 = True + + def __init__(self, config: SiglipVisionConfig): + super().__init__(config) + self.config = config + embed_dim = config.hidden_size + + self.embeddings = SiglipVisionEmbeddings(config) + self.encoder = SiglipEncoder(config) + self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps) + self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2" + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self) -> nn.Module: + return self.embeddings.patch_embedding + +import argparse +import json +import re + +import numpy as np +from gguf import * +from transformers.models.idefics2.modeling_idefics2 import Idefics2VisionTransformer +from transformers.models.idefics2.configuration_idefics2 import Idefics2VisionConfig + +TEXT = "clip.text" +VISION = "clip.vision" + + +def add_key_str(raw_key: str, arch: str) -> str: + return raw_key.format(arch=arch) + + +def should_skip_tensor(name: str, has_text: bool, has_vision: bool, has_minicpmv: bool) -> bool: + if name in ( + "logit_scale", + "text_model.embeddings.position_ids", + "vision_model.embeddings.position_ids", + ): + return True + + if has_minicpmv and name in ["visual_projection.weight"]: + return True + + if name.startswith("v") and not has_vision: + return True + + if name.startswith("t") and not has_text: + return True + + return False + + +def get_tensor_name(name: str) -> str: + if "projection" in name: + return name + if "mm_projector" in name: + name = name.replace("model.mm_projector", "mm") + name = 
re.sub(r'mm\.mlp\.mlp', 'mm.model.mlp', name, count=1) + name = re.sub(r'mm\.peg\.peg', 'mm.model.peg', name, count=1) + return name + + return name.replace("text_model", "t").replace("vision_model", "v").replace("encoder.layers", "blk").replace("embeddings.", "").replace("_proj", "").replace("self_attn.", "attn_").replace("layer_norm", "ln").replace("layernorm", "ln").replace("mlp.fc1", "ffn_down").replace("mlp.fc2", "ffn_up").replace("embedding", "embd").replace("final", "post").replace("layrnorm", "ln") + + +def bytes_to_unicode(): + """ + Returns list of utf-8 byte and a corresponding list of unicode strings. + The reversible bpe codes work on unicode strings. + This means you need a large # of unicode characters in your vocab if you want to avoid UNKs. + When you're at something like a 10B token dataset you end up needing around 5K for decent coverage. + This is a significant percentage of your normal, say, 32K bpe vocab. + To avoid that, we want lookup tables between utf-8 bytes and unicode strings. + And avoids mapping to whitespace/control characters the bpe code barfs on. + """ + bs = ( + list(range(ord("!"), ord("~") + 1)) + + list(range(ord("¡"), ord("¬") + 1)) + + list(range(ord("®"), ord("ÿ") + 1)) + ) + cs = bs[:] + n = 0 + for b in range(2**8): + if b not in bs: + bs.append(b) + cs.append(2**8 + n) + n += 1 + cs = [chr(n) for n in cs] + return dict(zip(bs, cs)) + + +ap = argparse.ArgumentParser() +ap.add_argument("-m", "--model-dir", help="Path to model directory cloned from HF Hub", required=True) +ap.add_argument("--use-f32", action="store_true", default=False, help="Use f32 instead of f16") +ap.add_argument("--text-only", action="store_true", required=False, + help="Save a text-only model. It can't be used to encode images") +ap.add_argument("--vision-only", action="store_true", required=False, + help="Save a vision-only model. 
It can't be used to encode texts") +ap.add_argument("--clip-model-is-vision", action="store_true", required=False, + help="The clip model is a pure vision model (ShareGPT4V vision extract for example)") +ap.add_argument("--clip-model-is-openclip", action="store_true", required=False, + help="The clip model is from openclip (for ViT-SO400M type))") +ap.add_argument("--minicpmv-projector", help="Path to minicpmv.projector file. If specified, save an image encoder for MiniCPM-V models.") +ap.add_argument("--projector-type", help="Type of projector. Possible values: mlp, ldp, ldpv2", choices=["mlp", "ldp", "ldpv2"], default="mlp") +ap.add_argument("-o", "--output-dir", help="Directory to save GGUF files. Default is the original model directory", default=None) +# Example --image_mean 0.48145466 0.4578275 0.40821073 --image_std 0.26862954 0.26130258 0.27577711 +# Example --image_mean 0.5 0.5 0.5 --image_std 0.5 0.5 0.5 +default_image_mean = [0.5, 0.5, 0.5] +default_image_std = [0.5, 0.5, 0.5] +ap.add_argument('--image-mean', type=float, nargs='+', help='Mean of the images for normalization (overrides processor) ', default=None) +ap.add_argument('--image-std', type=float, nargs='+', help='Standard deviation of the images for normalization (overrides processor)', default=None) +ap.add_argument('--minicpmv_version', type=int, help='minicpmv_version: MiniCPM-V-2 use 1; MiniCPM-V-2.5 use 2; MiniCPM-V-2.6 use 3; MiniCPM-o-2.6 use 4; MiniCPM-V 4.0 use 5; MiniCPM-o-4.0 use 6; MiniCPM-o-4.5 use 100045', default=2) + +# with proper +args = ap.parse_args() + + +if args.text_only and args.vision_only: + print("--text-only and --image-only arguments cannot be specified at the same time.") + exit(1) + +if args.use_f32: + print("WARNING: Weights for the convolution op is always saved in f16, as the convolution op in GGML does not support 32-bit kernel weights yet.") + +# output in the same directory as the model if output_dir is None +dir_model = args.model_dir + +# Read config.json to 
get actual model configuration +config_path = os.path.join(dir_model, "config.json") +model_config = {} +if os.path.isfile(config_path): + with open(config_path, "r", encoding="utf-8") as f: + model_config = json.load(f) + print(f"Loaded config from {config_path}") +else: + print(f"Warning: config.json not found at {config_path}") + +# If minicpmv_projector is not specified but the default path exists, use the default path +if args.minicpmv_projector is None: + default_projector_path = os.path.join(dir_model, "minicpmv.projector") + if os.path.isfile(default_projector_path): + args.minicpmv_projector = default_projector_path + print(f"Found default projector file: {default_projector_path}") + +# If output_dir is not specified, use model_dir as the default value +if args.output_dir is None: + args.output_dir = dir_model + +if args.clip_model_is_vision or not os.path.exists(dir_model + "/vocab.json") or args.clip_model_is_openclip: + vocab = None + tokens = None +else: + with open(dir_model + "/vocab.json", "r", encoding="utf-8") as f: + vocab = json.load(f) + tokens = [key for key in vocab] + +# possible data types +# ftype == 0 -> float32 +# ftype == 1 -> float16 +# +# map from ftype to string +ftype_str = ["f32", "f16"] + +ftype = 1 +if args.use_f32: + ftype = 0 + +# if args.clip_model_is_vision or args.clip_model_is_openclip: +# model = CLIPVisionModel.from_pretrained(dir_model) +# processor = None +# else: +# model = CLIPModel.from_pretrained(dir_model) +# processor = CLIPProcessor.from_pretrained(dir_model) + +minicpmv_version = args.minicpmv_version + +# Use actual config values instead of hardcoded ones +if model_config: + # For the projector/resampler, use the main model's hidden_size + emb_dim = model_config.get("hidden_size", 1536) + + # For the vision model, use vision_config values + vision_config_dict = model_config.get("vision_config", {}) + default_vision_config = { + "hidden_size": vision_config_dict.get("hidden_size", 1152), + "image_size": 
vision_config_dict.get("image_size", 980), + "intermediate_size": vision_config_dict.get("intermediate_size", 4304), + "model_type": vision_config_dict.get("model_type", "siglip"), + "num_attention_heads": vision_config_dict.get("num_attention_heads", 16), + "num_hidden_layers": vision_config_dict.get("num_hidden_layers", 27), + "patch_size": vision_config_dict.get("patch_size", 14), + } + + # Use vision model's num_hidden_layers for block_count + block_count = vision_config_dict.get("num_hidden_layers", 27) + + print(f"Using config values: emb_dim={emb_dim}, block_count={block_count}") + print(f"Vision config: {default_vision_config}") +else: + # Fallback to original hardcoded logic if config.json not found + emb_dim = 4096 + block_count = 26 + if minicpmv_version == 1: + emb_dim = 2304 + block_count = 26 + elif minicpmv_version == 2: + emb_dim = 4096 + block_count = 27 + elif minicpmv_version == 3: + emb_dim = 3584 + block_count = 27 + elif minicpmv_version == 4: + emb_dim = 3584 + block_count = 27 + elif minicpmv_version == 5: + emb_dim = 2560 + block_count = 27 + elif minicpmv_version == 6: + emb_dim = 4096 + block_count = 27 + elif minicpmv_version == 100045: + emb_dim = 4096 + block_count = 27 + + default_vision_config = { + "hidden_size": 1152, + "image_size": 980, + "intermediate_size": 4304, + "model_type": "idefics2", + "num_attention_heads": 16, + "num_hidden_layers": 27, + "patch_size": 14, + } + +vision_config = Idefics2VisionConfig(**default_vision_config) +model = Idefics2VisionTransformer(vision_config) +if minicpmv_version == 3 or (model_config and model_config.get("vision_config", {}).get("model_type") == "siglip"): + vision_config = SiglipVisionConfig(**default_vision_config) + model = SiglipVisionTransformer(vision_config) +elif minicpmv_version == 4: + vision_config = SiglipVisionConfig(**default_vision_config) + model = SiglipVisionTransformer(vision_config) +elif minicpmv_version == 5: + default_vision_config["model_type"] = 
"siglip_vision_model" + vision_config = SiglipVisionConfig(**default_vision_config) + model = SiglipVisionTransformer(vision_config) +elif minicpmv_version == 6: + default_vision_config["model_type"] = "siglip_vision_model" + vision_config = SiglipVisionConfig(**default_vision_config) + model = SiglipVisionTransformer(vision_config) +elif minicpmv_version == 100045: + default_vision_config["model_type"] = "siglip_vision_model" + vision_config = SiglipVisionConfig(**default_vision_config) + model = SiglipVisionTransformer(vision_config) + +processor = None +# if model.attn_pool is not None: +# model.attn_pool = torch.nn.Identity() + +# model.blocks = model.blocks[:-1] +model.load_state_dict(torch.load(os.path.join(dir_model, "minicpmv.clip"))) + +fname_middle = None +has_text_encoder = True +has_vision_encoder = True +has_minicpmv_projector = False + +if args.text_only: + fname_middle = "text-" + has_vision_encoder = False +elif args.minicpmv_projector is not None: + fname_middle = "mmproj-" + has_text_encoder = False + has_minicpmv_projector = True +elif args.vision_only: + fname_middle = "vision-" + has_text_encoder = False +else: + fname_middle = "" + +output_dir = args.output_dir +os.makedirs(output_dir, exist_ok=True) +output_prefix = os.path.basename(output_dir).replace("ggml_", "") +fname_out = os.path.join(output_dir, f"{fname_middle}model-{ftype_str[ftype]}.gguf") +fout = GGUFWriter(path=fname_out, arch="clip") + +fout.add_bool("clip.has_text_encoder", has_text_encoder) +fout.add_bool("clip.has_vision_encoder", has_vision_encoder) +fout.add_bool("clip.has_minicpmv_projector", has_minicpmv_projector) +fout.add_file_type(ftype) +if args.text_only: + fout.add_description("text-only CLIP model") +elif args.vision_only and not has_minicpmv_projector: + fout.add_description("vision-only CLIP model") +elif has_minicpmv_projector: + fout.add_description("image encoder for MiniCPM-V") + # add projector type + fout.add_string("clip.projector_type", "resampler") + 
fout.add_int32("clip.minicpmv_version", minicpmv_version) +else: + fout.add_description("two-tower CLIP model") + +if has_vision_encoder: + # vision_model hparams - use actual config values + vision_image_size = model_config.get("image_size", 448) if model_config else 448 + vision_patch_size = default_vision_config.get("patch_size", 14) + vision_hidden_size = default_vision_config.get("hidden_size", 1152) + vision_intermediate_size = default_vision_config.get("intermediate_size", 4304) + vision_attention_heads = default_vision_config.get("num_attention_heads", 16) + + fout.add_uint32("clip.vision.image_size", vision_image_size) + fout.add_uint32("clip.vision.patch_size", vision_patch_size) + fout.add_uint32(add_key_str(KEY_EMBEDDING_LENGTH, VISION), vision_hidden_size) + fout.add_uint32(add_key_str(KEY_FEED_FORWARD_LENGTH, VISION), vision_intermediate_size) + fout.add_uint32("clip.vision.projection_dim", 0) + fout.add_uint32(add_key_str(KEY_ATTENTION_HEAD_COUNT, VISION), vision_attention_heads) + fout.add_float32(add_key_str(KEY_ATTENTION_LAYERNORM_EPS, VISION), 1e-6) + fout.add_uint32(add_key_str(KEY_BLOCK_COUNT, VISION), block_count) + + # Add MiniCPM-V specific parameters + query_num = model_config.get("query_num", 0) if model_config else 0 + resampler_emb_dim = model_config.get("hidden_size", 0) if model_config else 0 + fout.add_uint32("clip.minicpmv_query_num", query_num) + + if processor is not None: + image_mean = processor.image_processor.image_mean if args.image_mean is None or args.image_mean == default_image_mean else args.image_mean + image_std = processor.image_processor.image_std if args.image_std is None or args.image_std == default_image_std else args.image_std + else: + image_mean = args.image_mean if args.image_mean is not None else default_image_mean + image_std = args.image_std if args.image_std is not None else default_image_std + fout.add_array("clip.vision.image_mean", image_mean) + fout.add_array("clip.vision.image_std", image_std) + 
use_gelu = True
fout.add_bool("clip.use_gelu", use_gelu)

def get_1d_sincos_pos_embed_from_grid(embed_dim, pos):
    """
    embed_dim: output dimension for each position
    pos: a list of positions to be encoded: size (M,)
    out: (M, D)
    """
    assert embed_dim % 2 == 0
    omega = np.arange(embed_dim // 2, dtype=np.float32)
    omega /= embed_dim / 2.
    omega = 1. / 10000 ** omega  # (D/2,)

    pos = pos.reshape(-1)  # (M,)
    out = np.einsum('m,d->md', pos, omega)  # (M, D/2), outer product

    emb_sin = np.sin(out)  # (M, D/2)
    emb_cos = np.cos(out)  # (M, D/2)

    emb = np.concatenate([emb_sin, emb_cos], axis=1)  # (M, D)
    return emb

def get_2d_sincos_pos_embed_from_grid(embed_dim, grid):
    # first half of channels encodes the H coordinate, second half the W coordinate
    assert embed_dim % 2 == 0

    # use half of dimensions to encode grid_h
    emb_h = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[0])  # (H*W, D/2)
    emb_w = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[1])  # (H*W, D/2)

    emb = np.concatenate([emb_h, emb_w], axis=1)  # (H*W, D)
    return emb


# https://github.com/facebookresearch/mae/blob/efb2a8062c206524e35e47d04501ed4f544c0ae8/util/pos_embed.py#L20
def get_2d_sincos_pos_embed(embed_dim, grid_size, cls_token=False):
    """
    grid_size: int of the grid height and width
    return:
    pos_embed: [grid_size*grid_size, embed_dim] or [1+grid_size*grid_size, embed_dim] (w/ or w/o cls_token)
    """
    if isinstance(grid_size, int):
        grid_h_size, grid_w_size = grid_size, grid_size
    else:
        grid_h_size, grid_w_size = grid_size[0], grid_size[1]

    grid_h = np.arange(grid_h_size, dtype=np.float32)
    grid_w = np.arange(grid_w_size, dtype=np.float32)
    grid = np.meshgrid(grid_w, grid_h)  # here w goes first
    grid = np.stack(grid, axis=0)

    grid = grid.reshape([2, 1, grid_h_size, grid_w_size])
    pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid)
    if cls_token:
        pos_embed = np.concatenate([np.zeros([1, embed_dim]), pos_embed], axis=0)
    return pos_embed

def _replace_name_resampler(s, v):
    # Map resampler tensor names from the HF checkpoint to the GGUF layout;
    # synthesizes a fixed 70x70 sincos position table (pos_embed_k) and
    # splits the fused attention in_proj into separate q/k/v tensors.
    if re.match("resampler.pos_embed", s):
        return {
            s: v,
            re.sub("pos_embed", "pos_embed_k", s): torch.from_numpy(get_2d_sincos_pos_embed(emb_dim, (70, 70))),
        }
    if re.match("resampler.proj", s):
        return {
            re.sub("proj", "pos_embed_k", s): torch.from_numpy(get_2d_sincos_pos_embed(emb_dim, (70, 70))),
            re.sub("proj", "proj.weight", s): v.transpose(-1, -2).contiguous(),
        }
    if re.match("resampler.attn.in_proj_.*", s):
        return {
            re.sub("attn.in_proj_", "attn.q.", s): v.chunk(3, dim=0)[0],
            re.sub("attn.in_proj_", "attn.k.", s): v.chunk(3, dim=0)[1],
            re.sub("attn.in_proj_", "attn.v.", s): v.chunk(3, dim=0)[2],
        }
    return {s: v}

if has_minicpmv_projector:
    projector = torch.load(args.minicpmv_projector)
    new_state_dict = {}
    for k, v in projector.items():
        kvs = _replace_name_resampler(k, v)
        for nk, nv in kvs.items():
            new_state_dict[nk] = nv
    projector = new_state_dict
    ftype_cur = 0
    for name, data in projector.items():
        name = get_tensor_name(name)
        data = data.squeeze().numpy()

        n_dims = len(data.shape)
        if ftype == 1:
            # f16 output: only 2D .weight matrices are converted; everything else stays f32
            if name[-7:] == ".weight" and n_dims == 2:
                print(" Converting to float16")
                data = data.astype(np.float16)
                ftype_cur = 1
            else:
                print(" Converting to float32")
                data = data.astype(np.float32)
                ftype_cur = 0
        else:
            if data.dtype != np.float32:
                print(" Converting to float32")
                data = data.astype(np.float32)
                ftype_cur = 0

        fout.add_tensor(name, data)
        print(f"{name} - {ftype_str[ftype_cur]} - shape = {data.shape}")

    print("Projector tensors added\n")

def _replace_name(s, v):
    # prefix checkpoint names with "vision_model." (statement continues in next fragment)
    s = "vision_model."
             + s
    if re.match("vision_model.embeddings.position_embedding", s):
        # position_embedding gets an extra leading batch dim in the GGUF layout
        v = v.unsqueeze(0)
        return {s: v}

    return {s: v}

state_dict = model.state_dict()
new_state_dict = {}
for k, v in state_dict.items():
    kvs = _replace_name(k, v)
    for nk, nv in kvs.items():
        new_state_dict[nk] = nv
state_dict = new_state_dict
for name, data in state_dict.items():
    if should_skip_tensor(name, has_text_encoder, has_vision_encoder, has_minicpmv_projector):
        # we don't need this
        print(f"skipping parameter: {name}")
        continue

    name = get_tensor_name(name)
    data = data.squeeze().numpy()

    n_dims = len(data.shape)

    # ftype == 0 -> float32, ftype == 1 -> float16
    ftype_cur = 0
    if n_dims == 4:
        print(f"tensor {name} is always saved in f16")
        data = data.astype(np.float16)
        ftype_cur = 1
    elif ftype == 1:
        if name[-7:] == ".weight" and n_dims == 2:
            print(" Converting to float16")
            data = data.astype(np.float16)
            ftype_cur = 1
        else:
            print(" Converting to float32")
            data = data.astype(np.float32)
            ftype_cur = 0
    else:
        if data.dtype != np.float32:
            print(" Converting to float32")
            data = data.astype(np.float32)
            ftype_cur = 0

    print(f"{name} - {ftype_str[ftype_cur]} - shape = {data.shape}")
    fout.add_tensor(name, data)


fout.write_header_to_file()
fout.write_kv_data_to_file()
fout.write_tensors_to_file()
fout.close()

print("Done. Output file: " + fname_out)
diff --git a/llama.cpp/tools/mtmd/legacy-models/minicpmv-surgery.py b/llama.cpp/tools/mtmd/legacy-models/minicpmv-surgery.py
new file mode 100644
index 0000000..5352662
--- /dev/null
+++ b/llama.cpp/tools/mtmd/legacy-models/minicpmv-surgery.py
@@ -0,0 +1,47 @@
import argparse
import os
import torch
from transformers import AutoModel, AutoTokenizer

ap = argparse.ArgumentParser()
ap.add_argument("-m", "--model", help="Path to MiniCPM-V model")
args = ap.parse_args()

# find the model part that includes the multimodal projector weights
model = AutoModel.from_pretrained(args.model, trust_remote_code=True, local_files_only=True, torch_dtype=torch.bfloat16)
checkpoint = model.state_dict()

# get a list of mm tensor names
mm_tensors = [k for k, v in checkpoint.items() if k.startswith("resampler")]

# store these tensors in a new dictionary and torch.save them
projector = {name: checkpoint[name].float() for name in mm_tensors}
if 'resampler.proj' in projector.keys() and hasattr(model.llm.config,'scale_emb') is True:
    # undo the embedding scale baked into resampler.proj
    projector['resampler.proj'] = projector['resampler.proj'] / model.llm.config.scale_emb
torch.save(projector, f"{args.model}/minicpmv.projector")

clip_tensors = [k for k, v in checkpoint.items() if k.startswith("vpm")]
if len(clip_tensors) > 0:
    clip = {name.replace("vpm.", ""): checkpoint[name].float() for name in clip_tensors}
    torch.save(clip, f"{args.model}/minicpmv.clip")

    # added tokens should be removed to be able to convert Mistral models
    if os.path.exists(f"{args.model}/added_tokens.json"):
        with open(f"{args.model}/added_tokens.json", "w") as f:
            f.write("{}\n")

config = model.llm.config
config.auto_map = {
    "AutoConfig": "configuration_minicpm.MiniCPMConfig",
    "AutoModel": "modeling_minicpm.MiniCPMModel",
    "AutoModelForCausalLM": "modeling_minicpm.MiniCPMForCausalLM",
    "AutoModelForSeq2SeqLM": "modeling_minicpm.MiniCPMForCausalLM",
    "AutoModelForSequenceClassification":
        "modeling_minicpm.MiniCPMForSequenceClassification"
}
model.llm.save_pretrained(f"{args.model}/model")
tok = AutoTokenizer.from_pretrained(args.model, trust_remote_code=True)
tok.save_pretrained(f"{args.model}/model")

print("Done!")
print(f"Now you can convert {args.model} to a regular LLaMA GGUF file.")
print(f"Also, use {args.model}/minicpmv.projector to prepare a minicpmv-encoder.gguf file.")
diff --git a/llama.cpp/tools/mtmd/models/cogvlm.cpp b/llama.cpp/tools/mtmd/models/cogvlm.cpp
new file mode 100644
index 0000000..d5b739c
--- /dev/null
+++ b/llama.cpp/tools/mtmd/models/cogvlm.cpp
@@ -0,0 +1,98 @@
#include "models.h"

// Builds the ggml compute graph for the CogVLM vision encoder + visual adapter.
ggml_cgraph * clip_graph_cogvlm::build() {
    GGML_ASSERT(model.class_embedding != nullptr);
    GGML_ASSERT(model.position_embeddings != nullptr);

    const int n_pos = n_patches + 1; // +1 for [CLS]

    // build input and concatenate class embedding
    ggml_tensor * inp = build_inp();
    inp = ggml_concat(ctx0, inp, model.class_embedding, 1);

    inp = ggml_add(ctx0, inp, model.position_embeddings);
    cb(inp, "inp_pos", -1);

    ggml_tensor * inpL = inp;

    for (int il = 0; il < n_layer; il++) {
        auto & layer = model.layers[il];
        ggml_tensor * cur = inpL;

        // fused QKV projection
        cur = ggml_mul_mat(ctx0, layer.qkv_w, cur);

        cur = ggml_add(ctx0, cur, layer.qkv_b);

        // slice the fused QKV result into per-head Q/K/V views
        ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, d_head, n_head, n_pos, d_head*sizeof(float),
            cur->nb[1], 0);
        ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, d_head, n_head, n_pos, d_head*sizeof(float),
            cur->nb[1], n_embd * sizeof(float));
        ggml_tensor * Vcur = ggml_view_3d(ctx0, cur, d_head, n_head, n_pos, d_head*sizeof(float),
            cur->nb[1], 2 * n_embd * sizeof(float));

        cb(Qcur, "Qcur", il);
        cb(Kcur, "Kcur", il);
        cb(Vcur, "Vcur", il);

        cur = build_attn(layer.o_w, layer.o_b,
            Qcur, Kcur, Vcur, nullptr, kq_scale, il);
        cb(cur, "attn_out", il);

        // post-norm: layernorm after attention, before the residual add
        cur = build_norm(cur, layer.ln_1_w, layer.ln_1_b, NORM_TYPE_NORMAL, eps, il);
        cb(cur, "attn_post_norm", il);

        cur = ggml_add(ctx0, cur, inpL);
        inpL = cur;

        cur = build_ffn(cur,
            layer.ff_up_w, layer.ff_up_b,
            layer.ff_gate_w, layer.ff_gate_b,
            layer.ff_down_w, layer.ff_down_b,
            hparams.ffn_op, il);

        cb(cur, "ffn_out", il);

        cur = build_norm(cur, layer.ln_2_w, layer.ln_2_b, NORM_TYPE_NORMAL, eps, il);
        cb(cur, "ffn_post_norm", il);

        cur = ggml_add(ctx0, cur, inpL);
        cb(cur, "layer_out", il);
        inpL = cur;

    }

    // remove CLS token (like build_llama4 does)
    ggml_tensor * cur = ggml_view_2d(ctx0, inpL,
        n_embd, n_patches,
        ggml_row_size(inpL->type, n_embd), 0);

    // Multiply with mm_model_proj
    cur = ggml_mul_mat(ctx0, model.mm_model_proj, cur);

    // Apply layernorm, weight, bias
    cur = build_norm(cur, model.mm_post_fc_norm_w, model.mm_post_fc_norm_b, NORM_TYPE_NORMAL, 1e-5, -1);

    // Apply GELU
    cur = ggml_gelu_inplace(ctx0, cur);

    // Branch 1: multiply with mm_h_to_4h_w
    ggml_tensor * h_to_4h = ggml_mul_mat(ctx0, model.mm_h_to_4h_w, cur);

    // Branch 2: multiply with mm_gate_w
    ggml_tensor * gate = ggml_mul_mat(ctx0, model.mm_gate_w, cur);

    // Apply silu
    gate = ggml_swiglu_split(ctx0, gate, h_to_4h);

    // Apply mm_4h_to_h_w
    cur = ggml_mul_mat(ctx0, model.mm_4h_to_h_w, gate);

    // Concatenate with boi and eoi
    cur = ggml_concat(ctx0, model.mm_boi, cur, 1);
    cur = ggml_concat(ctx0, cur, model.mm_eoi, 1);

    // build the graph
    ggml_build_forward_expand(gf, cur);

    return gf;
}
diff --git a/llama.cpp/tools/mtmd/models/conformer.cpp b/llama.cpp/tools/mtmd/models/conformer.cpp
new file mode 100644
index 0000000..9b1fab4
--- /dev/null
+++ b/llama.cpp/tools/mtmd/models/conformer.cpp
@@ -0,0 +1,216 @@
#include "models.h"

// Builds the ggml compute graph for the Conformer audio encoder (continued below).
ggml_cgraph * clip_graph_conformer::build() {
    const int n_frames = img.nx;
    const int n_pos = n_frames / 2;
    // relative-position table length after the stride-2 subsampling convolutions
    const int n_pos_embd = (((((n_frames + 1) / 2) + 1) / 2 + 1) / 2) * 2 - 1;
    GGML_ASSERT(model.position_embeddings->ne[1] >= n_pos);

    ggml_tensor * pos_emb = ggml_new_tensor_2d(ctx0,
                                               GGML_TYPE_F32, 512, n_pos_embd);
    ggml_set_name(pos_emb, "pos_emb");
    ggml_set_input(pos_emb);
    ggml_build_forward_expand(gf, pos_emb);

    ggml_tensor * inp = build_inp_raw(1);

    auto * cur = ggml_cont(ctx0, ggml_transpose(ctx0, inp));

    // pre encode, conv subsampling
    {
        // layer.0 - conv2d
        cur = ggml_conv_2d(ctx0, model.pre_encode_conv_X_w[0], cur, 2, 2, 1, 1, 1, 1);
        cur = ggml_add(ctx0, cur, model.pre_encode_conv_X_b[0]);
        cb(cur, "conformer.pre_encode.conv.{}", 0);

        // layer.1 - relu
        cur = ggml_relu_inplace(ctx0, cur);

        // layer.2 conv2d dw
        cur = ggml_conv_2d_dw_direct(ctx0, model.pre_encode_conv_X_w[2], cur, 2, 2, 1, 1, 1, 1);
        cur = ggml_add(ctx0, cur, model.pre_encode_conv_X_b[2]);
        cb(cur, "conformer.pre_encode.conv.{}", 2);

        // layer.3 conv2d
        cur = ggml_conv_2d_direct(ctx0, model.pre_encode_conv_X_w[3], cur, 1, 1, 0, 0, 1, 1);
        cur = ggml_add(ctx0, cur, model.pre_encode_conv_X_b[3]);
        cb(cur, "conformer.pre_encode.conv.{}", 3);

        // layer.4 - relu
        cur = ggml_relu_inplace(ctx0, cur);

        // layer.5 conv2d dw
        cur = ggml_conv_2d_dw_direct(ctx0, model.pre_encode_conv_X_w[5], cur, 2, 2, 1, 1, 1, 1);
        cur = ggml_add(ctx0, cur, model.pre_encode_conv_X_b[5]);
        cb(cur, "conformer.pre_encode.conv.{}", 5);

        // layer.6 conv2d
        cur = ggml_conv_2d_direct(ctx0, model.pre_encode_conv_X_w[6], cur, 1, 1, 0, 0, 1, 1);
        cur = ggml_add(ctx0, cur, model.pre_encode_conv_X_b[6]);
        cb(cur, "conformer.pre_encode.conv.{}", 6);

        // layer.7 - relu
        cur = ggml_relu_inplace(ctx0, cur);

        // flatten channel and frequency axis
        cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 0, 2, 1, 3));
        cur = ggml_reshape_2d(ctx0, cur, cur->ne[0] * cur->ne[1], cur->ne[2]);

        // calculate out
        cur = ggml_mul_mat(ctx0, model.pre_encode_out_w, cur);
        cur = ggml_add(ctx0, cur, model.pre_encode_out_b);
        cb(cur, "conformer.pre_encode.out", -1);
    }

    // pos_emb
    cb(pos_emb, "pos_emb", -1);

    // conformer blocks: FFN/2 -> rel-pos self-attention -> conv module -> FFN/2 -> norm
    for (int il = 0; il < hparams.n_layer; il++) {
        const auto & layer = model.layers[il];

        auto * residual = cur;

        cb(cur, "layer.in", il);

        // feed_forward1
        cur = build_norm(cur, layer.ff_norm_w, layer.ff_norm_b, NORM_TYPE_NORMAL, 1e-5, il);
        cb(cur, "conformer.layers.{}.norm_feed_forward1", il);

        cur = build_ffn(cur, layer.ff_up_w, layer.ff_up_b, nullptr, nullptr, layer.ff_down_w, layer.ff_down_b, FFN_SILU,
                        il);
        cb(cur, "conformer.layers.{}.feed_forward1.linear2", il);

        // both macaron FFN halves are scaled by 0.5 before the residual add
        const auto fc_factor = 0.5f;
        residual = ggml_add(ctx0, residual, ggml_scale(ctx0, cur, fc_factor));

        // self-attention
        {
            cur = build_norm(residual, layer.ln_1_w, layer.ln_1_b, NORM_TYPE_NORMAL, 1e-5, il);
            cb(cur, "conformer.layers.{}.norm_self_att", il);

            ggml_tensor * Qcur = ggml_mul_mat(ctx0, layer.q_w, cur);
            Qcur = ggml_add(ctx0, Qcur, layer.q_b);
            Qcur = ggml_reshape_3d(ctx0, Qcur, d_head, n_head, Qcur->ne[1]);
            // Transformer-XL style biases: u for content term, v for position term
            ggml_tensor * Q_bias_u = ggml_add(ctx0, Qcur, layer.pos_bias_u);
            Q_bias_u = ggml_permute(ctx0, Q_bias_u, 0, 2, 1, 3);
            ggml_tensor * Q_bias_v = ggml_add(ctx0, Qcur, layer.pos_bias_v);
            Q_bias_v = ggml_permute(ctx0, Q_bias_v, 0, 2, 1, 3);

            // TODO @ngxson : some cont can/should be removed when ggml_mul_mat support these cases
            ggml_tensor * Kcur = ggml_mul_mat(ctx0, layer.k_w, cur);
            Kcur = ggml_add(ctx0, Kcur, layer.k_b);
            Kcur = ggml_reshape_3d(ctx0, Kcur, d_head, n_head, Kcur->ne[1]);
            Kcur = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 0, 2, 1, 3));

            ggml_tensor * Vcur = ggml_mul_mat(ctx0, layer.v_w, cur);
            Vcur = ggml_add(ctx0, Vcur, layer.v_b);
            Vcur = ggml_reshape_3d(ctx0, Vcur, d_head, n_head, Vcur->ne[1]);
            Vcur = ggml_cont(ctx0, ggml_permute(ctx0, Vcur, 1, 2, 0, 3));

            // build_attn won't fit due to matrix_ac and matrix_bd separation
            ggml_tensor * matrix_ac = ggml_mul_mat(ctx0, Q_bias_u, Kcur);
            matrix_ac = ggml_cont(ctx0, ggml_permute(ctx0, matrix_ac, 1, 0, 2, 3));
            cb(matrix_ac, "conformer.layers.{}.self_attn.id3", il);

            auto * p = ggml_mul_mat(ctx0, layer.linear_pos_w, pos_emb);
            cb(p, "conformer.layers.{}.self_attn.linear_pos", il);
            p = ggml_reshape_3d(ctx0, p, d_head, n_head, p->ne[1]);
            p = ggml_permute(ctx0, p, 0, 2, 1, 3);

            auto * matrix_bd = ggml_mul_mat(ctx0, Q_bias_v, p);
            matrix_bd = ggml_cont(ctx0, ggml_permute(ctx0, matrix_bd, 1, 0, 2, 3));

            // rel shift
            {
                const auto pos_len = matrix_bd->ne[0];
                const auto q_len = matrix_bd->ne[1];
                const auto h = matrix_bd->ne[2];
                matrix_bd = ggml_pad(ctx0, matrix_bd, 1, 0, 0, 0);
                matrix_bd = ggml_roll(ctx0, matrix_bd, 1, 0, 0, 0);
                matrix_bd = ggml_reshape_3d(ctx0, matrix_bd, q_len, pos_len + 1, h);
                matrix_bd = ggml_view_3d(ctx0, matrix_bd, q_len, pos_len, h, matrix_bd->nb[1],
                                         matrix_bd->nb[2], matrix_bd->nb[0] * q_len);
                matrix_bd = ggml_cont_3d(ctx0, matrix_bd, pos_len, q_len, h);
            }

            matrix_bd = ggml_view_3d(ctx0, matrix_bd, matrix_ac->ne[0], matrix_bd->ne[1],
                                     matrix_bd->ne[2], matrix_bd->nb[1], matrix_bd->nb[2], 0);
            auto * scores = ggml_add(ctx0, matrix_ac, matrix_bd);
            scores = ggml_scale(ctx0, scores, 1.0f / std::sqrt(d_head));
            cb(scores, "conformer.layers.{}.self_attn.id0", il);

            ggml_tensor * attn = ggml_soft_max(ctx0, scores);
            ggml_tensor * x = ggml_mul_mat(ctx0, attn, Vcur);
            x = ggml_permute(ctx0, x, 2, 0, 1, 3);
            x = ggml_cont_2d(ctx0, x, x->ne[0] * x->ne[1], x->ne[2]);

            ggml_tensor * out = ggml_mul_mat(ctx0, layer.o_w, x);
            out = ggml_add(ctx0, out, layer.o_b);
            cb(out, "conformer.layers.{}.self_attn.linear_out", il);

            cur = out;
        }

        residual = ggml_add(ctx0, residual, cur);
        cur = build_norm(residual, layer.norm_conv_w, layer.norm_conv_b, NORM_TYPE_NORMAL, 1e-5, il);
        cb(cur, "conformer.layers.{}.norm_conv", il);

        // conv
        {
            auto * x = cur;
            x = ggml_mul_mat(ctx0, layer.conv_pw1_w, x);
            x = ggml_add(ctx0, x, layer.conv_pw1_b);
            cb(x, "conformer.layers.{}.conv.pointwise_conv1", il);

            // ggml_glu doesn't support sigmoid
            // TODO @ngxson : support this ops in ggml
            {
                int64_t d = x->ne[0] / 2;
                ggml_tensor * gate = ggml_sigmoid(ctx0, ggml_view_2d(ctx0, x, d, x->ne[1], x->nb[1], d * x->nb[0]));
                x = ggml_mul(ctx0, ggml_view_2d(ctx0, x, d, x->ne[1], x->nb[1], 0), gate);
                x = ggml_cont(ctx0, ggml_transpose(ctx0, x));
            }

            // use ggml_ssm_conv for f32 precision
            x = ggml_pad(ctx0, x, 4, 0, 0, 0);
            x = ggml_roll(ctx0, x, 4, 0, 0, 0);
            x = ggml_pad(ctx0, x, 4, 0, 0, 0);
            x = ggml_ssm_conv(ctx0, x, layer.conv_dw_w);
            x = ggml_add(ctx0, x, layer.conv_dw_b);

            x = ggml_add(ctx0, ggml_mul(ctx0, x, layer.conv_norm_w), layer.conv_norm_b);
            x = ggml_silu(ctx0, x);

            // pointwise_conv2
            x = ggml_mul_mat(ctx0, layer.conv_pw2_w, x);
            x = ggml_add(ctx0, x, layer.conv_pw2_b);

            cur = x;
        }

        residual = ggml_add(ctx0, residual, cur);

        cur = build_norm(residual, layer.ff_norm_1_w, layer.ff_norm_1_b, NORM_TYPE_NORMAL, 1e-5, il);
        cb(cur, "conformer.layers.{}.norm_feed_forward2", il);

        cur = build_ffn(cur, layer.ff_up_1_w, layer.ff_up_1_b, nullptr, nullptr, layer.ff_down_1_w, layer.ff_down_1_b,
                        FFN_SILU, il); // TODO(tarek): read activation for ffn from hparams
        cb(cur, "conformer.layers.{}.feed_forward2.linear2", il);

        residual = ggml_add(ctx0, residual, ggml_scale(ctx0, cur, fc_factor));
        cb(residual, "conformer.layers.{}.conv.id", il);

        cur = build_norm(residual, layer.ln_2_w, layer.ln_2_b, NORM_TYPE_NORMAL, 1e-5, il);
        cb(cur, "conformer.layers.{}.norm_out", il);
    }

    // audio adapter
    cur = build_norm(cur, model.mm_0_w, model.mm_0_b, NORM_TYPE_NORMAL, 1e-5, -1);
    cb(cur, "audio_adapter.model.{}", 0);
    cur = build_ffn(cur, model.mm_1_w, model.mm_1_b, nullptr, nullptr, model.mm_3_w, model.mm_3_b, FFN_GELU_ERF, -1);

    cb(cur, "projected", -1);

    ggml_build_forward_expand(gf, cur);

    return gf;
}
diff --git a/llama.cpp/tools/mtmd/models/glm4v.cpp b/llama.cpp/tools/mtmd/models/glm4v.cpp
new file mode 100644
index 0000000..f39b692
--- /dev/null
+++ b/llama.cpp/tools/mtmd/models/glm4v.cpp
@@ -0,0 +1,120 @@
#include "models.h"

// Builds the ggml compute graph for the GLM-4V vision encoder + projector.
ggml_cgraph * clip_graph_glm4v::build() {
    GGML_ASSERT(model.patch_bias != nullptr);
    GGML_ASSERT(model.position_embeddings != nullptr);
    GGML_ASSERT(model.class_embedding == nullptr);

    const int batch_size = 1;

    norm_type norm_t = NORM_TYPE_RMS;

    ggml_tensor * inp_raw = build_inp_raw();
    ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_raw, patch_size, patch_size, 0, 0, 1, 1);

    // M-RoPE: split each head's rotary dims into 4 equal sections
    int mrope_sections[4] = {d_head/4, d_head/4, d_head/4, d_head/4};
    ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_patches * 4);
    ggml_set_name(positions, "positions");
    ggml_set_input(positions);

    GGML_ASSERT(img.nx % (patch_size * 2) == 0);
    GGML_ASSERT(img.ny % (patch_size * 2) == 0);

    // second conv dimension
    {
        auto inp_1 = ggml_conv_2d(ctx0, model.patch_embeddings_1, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
        inp = ggml_add(ctx0, inp, inp_1);

        inp = ggml_permute(ctx0, inp, 1, 2, 0, 3); // [w, h, c, b] -> [c, w, h, b]
        inp = ggml_cont_4d(
            ctx0, inp,
            n_embd * 2, n_patches_x / 2, n_patches_y, batch_size);
        inp = ggml_reshape_4d(
            ctx0, inp,
            n_embd * 2, n_patches_x / 2, 2, batch_size * (n_patches_y / 2));
        inp = ggml_permute(ctx0, inp, 0, 2, 1, 3);
        inp = ggml_cont_3d(
            ctx0, inp,
            n_embd, n_patches_x * n_patches_y, batch_size);
    }

    // add patch bias
    inp = ggml_add(ctx0, inp, model.patch_bias);
    cb(inp, "patch_bias", -1);

    // pos-conv norm
    inp = build_norm(inp, model.norm_embd_w, model.norm_embd_b, norm_t, eps, -1);

    // calculate absolute position embedding and apply
    // (same 2x2-interleave reordering as the patch embedding above)
    ggml_tensor * learned_pos_embd = resize_position_embeddings(GGML_SCALE_MODE_BICUBIC);
    learned_pos_embd = ggml_cont_4d(
        ctx0, learned_pos_embd,
        n_embd * 2, n_patches_x / 2, n_patches_y, batch_size);
    learned_pos_embd = ggml_reshape_4d(
        ctx0, learned_pos_embd,
        n_embd * 2, n_patches_x / 2, 2, batch_size * (n_patches_y / 2));
    learned_pos_embd = ggml_permute(ctx0, learned_pos_embd, 0, 2, 1, 3);
    learned_pos_embd = ggml_cont_3d(
        ctx0, learned_pos_embd,
        n_embd, n_patches_x * n_patches_y, batch_size);
    cb(learned_pos_embd, "learned_pos_embd", -1);

    auto add_pos = [&](ggml_tensor * cur, const clip_layer &) {
        return ggml_rope_multi(
            ctx0, cur, positions, nullptr,
            d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION,
            32768, hparams.rope_theta, 1, 0, 1, 32, 1);
    };

    ggml_tensor * cur = build_vit(
        inp, n_patches,
        norm_t,
        hparams.ffn_op,
        learned_pos_embd,
        add_pos);

    cb(cur, "vit_out", -1);
    // cb(ggml_sum(ctx0, cur), "vit_out_sum", -1);

    // GLM4V projector
    // ref: https://github.com/huggingface/transformers/blob/40dc11cd3eb4126652aa41ef8272525affd4a636/src/transformers/models/glm4v/modeling_glm4v.py#L116-L130

    // patch merger (downsample)
    {
        int n_merge = hparams.n_merge;
        GGML_ASSERT(n_merge > 0);

        int n_token_out = n_patches / n_merge / n_merge;
        cur = ggml_reshape_4d(ctx0, cur, n_embd, n_merge, n_merge, n_token_out);
        cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 2, 0, 1, 3)); // [n_merge, n_merge, n_embd, n_token_out]
        cur = ggml_conv_2d(ctx0, model.mm_patch_merger_w, cur, n_merge, n_merge, 0, 0, 1, 1);
        cur = ggml_reshape_2d(ctx0, cur, cur->ne[2], n_token_out); // [n_embd_out, n_token_out]

        cur = ggml_add(ctx0, cur, model.mm_patch_merger_b);
    }

    // FC projector
    {
        cur = ggml_mul_mat(ctx0, model.projection, cur);
        // default LayerNorm (post_projection_norm)
        cur = build_norm(cur, model.mm_post_norm_w, model.mm_post_norm_b, NORM_TYPE_NORMAL, 1e-5, -1);
        cur = ggml_gelu_erf(ctx0, cur);
        cb(cur, "after_fc_proj", -1);
    }

    // FFN projector
    {
        cur = build_ffn(cur,
            model.mm_ffn_up_w, model.mm_ffn_up_b,
            model.mm_ffn_gate_w, model.mm_ffn_gate_b,
            model.mm_ffn_down_w, model.mm_ffn_down_b,
            hparams.ffn_op, -1);
        cb(cur, "after_ffn_proj", -1);
        // cb(ggml_sum(ctx0, cur), "merged_sum", -1);
    }

    // build the graph
    ggml_build_forward_expand(gf, cur);

    return gf;
}
diff --git
a/llama.cpp/tools/mtmd/models/internvl.cpp b/llama.cpp/tools/mtmd/models/internvl.cpp
new file mode 100644
index 0000000..9aded3b
--- /dev/null
+++ b/llama.cpp/tools/mtmd/models/internvl.cpp
@@ -0,0 +1,69 @@
#include "models.h"

// Builds the ggml compute graph for the InternVL vision encoder + MLP projector.
ggml_cgraph * clip_graph_internvl::build() {
    GGML_ASSERT(model.class_embedding != nullptr);
    GGML_ASSERT(model.position_embeddings != nullptr);

    const int n_pos = n_patches + 1;
    ggml_tensor * inp = build_inp();

    // add CLS token
    inp = ggml_concat(ctx0, inp, model.class_embedding, 1);

    // The larger models use a different ViT, which uses RMS norm instead of layer norm
    // ref: https://github.com/ggml-org/llama.cpp/pull/13443#issuecomment-2869786188
    norm_type norm_t = (hparams.n_embd == 3200 && hparams.n_layer == 45)
        ? NORM_TYPE_RMS // 6B ViT (Used by InternVL 2.5/3 - 26B, 38B, 78B)
        : NORM_TYPE_NORMAL; // 300M ViT (Used by all smaller InternVL models)

    ggml_tensor * cur = build_vit(
        inp, n_pos,
        norm_t,
        hparams.ffn_op,
        model.position_embeddings,
        nullptr);

    // remove CLS token
    cur = ggml_view_2d(ctx0, cur,
        n_embd, n_patches,
        ggml_row_size(cur->type, n_embd), 0);

    // pixel shuffle
    {
        const int scale_factor = model.hparams.n_merge;
        const int bsz = 1; // batch size, always 1 for now since we don't support batching
        const int height = n_patches_y;
        const int width = n_patches_x;
        GGML_ASSERT(scale_factor > 0);
        cur = ggml_reshape_4d(ctx0, cur, n_embd * scale_factor, height / scale_factor, width, bsz);
        cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);
        cur = ggml_cont_4d(ctx0, cur,
            n_embd * scale_factor * scale_factor,
            height / scale_factor,
            width / scale_factor,
            bsz);
        cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);
        // flatten to 2D
        cur = ggml_cont_2d(ctx0, cur,
            n_embd * scale_factor * scale_factor,
            cur->ne[1] * cur->ne[2]);
    }

    // projector (always using GELU activation)
    {
        // projector LayerNorm uses pytorch's default eps = 1e-5
        // ref: https://huggingface.co/OpenGVLab/InternVL3-8B-Instruct/blob/a34d3e4e129a5856abfd6aa6de79776484caa14e/modeling_internvl_chat.py#L79
        cur = build_norm(cur, model.mm_0_w, model.mm_0_b, NORM_TYPE_NORMAL, 1e-5, -1);
        cur = build_ffn(cur,
            model.mm_1_w, model.mm_1_b,
            nullptr, nullptr,
            model.mm_3_w, model.mm_3_b,
            FFN_GELU,
            -1);
    }

    // build the graph
    ggml_build_forward_expand(gf, cur);

    return gf;
}
diff --git a/llama.cpp/tools/mtmd/models/kimik25.cpp b/llama.cpp/tools/mtmd/models/kimik25.cpp
new file mode 100644
index 0000000..cf9f27f
--- /dev/null
+++ b/llama.cpp/tools/mtmd/models/kimik25.cpp
@@ -0,0 +1,101 @@
#include "models.h"
#include <cstring>
#include <cmath>

// note: this is similar to clip_graph::resize_position_embeddings, major difference is having
// the w/h in ne[1] and ne[2] instead of assuming with sqrt. Could try storing the tensor in 2D instead
// with a w*h? Also the permute is a bit different at (2, 1, 0, 3) instead of (2, 0, 1, 3).
// Returns the learned position table flattened to [C, H*W], interpolated if the
// image grid differs from the stored grid.
ggml_tensor * clip_graph_kimik25::resize_position_embeddings_3d(uint32_t interpolation_mode) {
    ggml_tensor * pos_embd = model.position_embeddings;
    const int height = img.ny / patch_size;
    const int width = img.nx / patch_size;
    const uint32_t mode = interpolation_mode;

    GGML_ASSERT(pos_embd);

    const int64_t stored_c = pos_embd->ne[0]; // C = 1152
    const int64_t orig_w = pos_embd->ne[1]; // W = 64
    const int64_t orig_h = pos_embd->ne[2]; // H = 64

    GGML_ASSERT(stored_c == n_embd);

    if (height == (int)orig_h && width == (int)orig_w) {
        // No interpolation needed, just flatten to [C, H*W]
        return ggml_cont_2d(ctx0, pos_embd, n_embd, width * height);
    }

    pos_embd = ggml_permute(ctx0, pos_embd, 2, 1, 0, 3);
    pos_embd = ggml_interpolate(ctx0, pos_embd, height, width, n_embd, 1, mode);
    pos_embd = ggml_permute(ctx0, pos_embd, 2, 1, 0, 3);
    pos_embd = ggml_cont_2d(ctx0, pos_embd, n_embd, width * height);
    return pos_embd;
}

ggml_cgraph * clip_graph_kimik25::build() {
    ggml_tensor * pos_h = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_patches);
    ggml_set_name(pos_h, "pos_h");
    ggml_set_input(pos_h);

    ggml_tensor * pos_w = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_patches);
    ggml_set_name(pos_w, "pos_w");
    ggml_set_input(pos_w);

    ggml_tensor * learned_pos_embd = resize_position_embeddings_3d(GGML_SCALE_MODE_BICUBIC);

    // Kimi-K2.5 uses interleaved 2D RoPE pattern natively, but
    // Q / K are permuted during conversion to use split format.
    auto add_pos = [&](ggml_tensor * cur, const clip_layer &) {
        cur = build_rope_2d(ctx0, cur, pos_w, pos_h, hparams.rope_theta, false);
        return cur;
    };

    ggml_tensor * inp = build_inp();

    // I don't know why, but doing this in the build_vit lead to the ggml_add not occurring?
    // Doing it manually here does work.
    inp = ggml_add(ctx0, inp, learned_pos_embd);

    ggml_tensor * cur = build_vit(
        inp, n_patches,
        NORM_TYPE_NORMAL,
        hparams.ffn_op,
        nullptr,
        add_pos);

    cb(cur, "vit_out", -1);

    {
        // patch_merger
        const int scale_factor = model.hparams.n_merge;
        cur = build_patch_merge_permute(cur, scale_factor);

        // projection norm
        // the LayerNorm is applied per original patch (width n_embd), so temporarily
        // view the merged rows as scale_factor^2 rows of n_embd each
        int proj_inp_dim = cur->ne[0];
        int n_merged_patches = cur->ne[1];
        cur = ggml_view_2d(ctx0, cur,
            n_embd, n_merged_patches * scale_factor * scale_factor,
            ggml_row_size(cur->type, n_embd), 0);
        cur = ggml_norm(ctx0, cur, hparams.eps);
        cur = ggml_mul(ctx0, cur, model.mm_input_norm_w);
        cur = ggml_add(ctx0, cur, model.mm_input_norm_b);
        cur = ggml_view_2d(ctx0, cur,
            proj_inp_dim, n_merged_patches,
            ggml_row_size(cur->type, proj_inp_dim), 0);
        cb(cur, "proj_inp_normed", -1);

        // projection mlp
        cur = build_ffn(cur,
            model.mm_1_w, model.mm_1_b,
            nullptr, nullptr,
            model.mm_2_w, model.mm_2_b,
            FFN_GELU,
            -1);

        cb(cur, "proj_out", -1);
    }

    // build the graph
    ggml_build_forward_expand(gf, cur);

    return gf;
}
diff --git a/llama.cpp/tools/mtmd/models/kimivl.cpp b/llama.cpp/tools/mtmd/models/kimivl.cpp
new
file mode 100644
index 0000000..0a06f50
--- /dev/null
+++ b/llama.cpp/tools/mtmd/models/kimivl.cpp
@@ -0,0 +1,63 @@
#include "models.h"

// Builds the ggml compute graph for the Kimi-VL vision encoder + projector.
ggml_cgraph * clip_graph_kimivl::build() {
    // 2D input positions
    ggml_tensor * pos_h = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_patches);
    ggml_set_name(pos_h, "pos_h");
    ggml_set_input(pos_h);

    ggml_tensor * pos_w = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_patches);
    ggml_set_name(pos_w, "pos_w");
    ggml_set_input(pos_w);

    ggml_tensor * learned_pos_embd = resize_position_embeddings();

    // build ViT with 2D position embeddings
    auto add_pos = [&](ggml_tensor * cur, const clip_layer &) {
        // first half is X axis and second half is Y axis
        return build_rope_2d(ctx0, cur, pos_w, pos_h, hparams.rope_theta, false);
    };

    ggml_tensor * inp = build_inp();
    ggml_tensor * cur = build_vit(
        inp, n_patches,
        NORM_TYPE_NORMAL,
        hparams.ffn_op,
        learned_pos_embd,
        add_pos);

    cb(cur, "vit_out", -1);

    {
        // patch_merger
        const int scale_factor = model.hparams.n_merge;
        cur = build_patch_merge_permute(cur, scale_factor);

        // projection norm
        // LayerNorm runs per original patch (width n_embd), so view the merged
        // rows as scale_factor^2 rows of n_embd each, then view back
        int proj_inp_dim = cur->ne[0];
        cur = ggml_view_2d(ctx0, cur,
            n_embd, cur->ne[1] * scale_factor * scale_factor,
            ggml_row_size(cur->type, n_embd), 0);
        cur = ggml_norm(ctx0, cur, 1e-5); // default nn.LayerNorm
        cur = ggml_mul(ctx0, cur, model.mm_input_norm_w);
        cur = ggml_add(ctx0, cur, model.mm_input_norm_b);
        cur = ggml_view_2d(ctx0, cur,
            proj_inp_dim, cur->ne[1] / scale_factor / scale_factor,
            ggml_row_size(cur->type, proj_inp_dim), 0);
        cb(cur, "proj_inp_normed", -1);

        // projection mlp
        cur = build_ffn(cur,
            model.mm_1_w, model.mm_1_b,
            nullptr, nullptr,
            model.mm_2_w, model.mm_2_b,
            FFN_GELU,
            -1);
        cb(cur, "proj_out", -1);
    }

    // build the graph
    ggml_build_forward_expand(gf, cur);

    return gf;
}
diff --git a/llama.cpp/tools/mtmd/models/llama4.cpp b/llama.cpp/tools/mtmd/models/llama4.cpp
new file mode 100644
index
0000000..30d1df5
--- /dev/null
+++ b/llama.cpp/tools/mtmd/models/llama4.cpp
@@ -0,0 +1,96 @@
#include "models.h"

// Builds the ggml compute graph for the Llama 4 vision encoder + projector.
ggml_cgraph * clip_graph_llama4::build() {
    GGML_ASSERT(model.class_embedding != nullptr);
    GGML_ASSERT(model.position_embeddings != nullptr);

    const int n_pos = n_patches + 1; // +1 for [CLS]

    // 2D input positions
    ggml_tensor * pos_h = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos);
    ggml_set_name(pos_h, "pos_h");
    ggml_set_input(pos_h);

    ggml_tensor * pos_w = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos);
    ggml_set_name(pos_w, "pos_w");
    ggml_set_input(pos_w);

    ggml_tensor * inp = build_inp_raw();

    // Llama4UnfoldConvolution
    // (patch embedding implemented as im2col + matmul rather than conv2d)
    {
        ggml_tensor * kernel = ggml_reshape_4d(ctx0, model.patch_embeddings_0,
            patch_size, patch_size, 3, n_embd);
        inp = ggml_im2col(ctx0, kernel, inp, patch_size, patch_size, 0, 0, 1, 1, true, inp->type);
        inp = ggml_mul_mat(ctx0, model.patch_embeddings_0, inp);
        inp = ggml_reshape_2d(ctx0, inp, n_embd, n_patches);
        cb(inp, "patch_conv", -1);
    }

    // add CLS token
    inp = ggml_concat(ctx0, inp, model.class_embedding, 1);

    // build ViT with 2D position embeddings
    auto add_pos = [&](ggml_tensor * cur, const clip_layer &) {
        // first half is X axis and second half is Y axis
        // ref: https://github.com/huggingface/transformers/blob/40a493c7ed4f19f08eadb0639cf26d49bfa5e180/src/transformers/models/llama4/modeling_llama4.py#L1312
        // ref: https://github.com/Blaizzy/mlx-vlm/blob/a57156aa87b33cca6e5ee6cfc14dd4ef8f611be6/mlx_vlm/models/llama4/vision.py#L441
        return build_rope_2d(ctx0, cur, pos_w, pos_h, hparams.rope_theta, false);
    };
    ggml_tensor * cur = build_vit(
        inp, n_pos,
        NORM_TYPE_NORMAL,
        hparams.ffn_op,
        model.position_embeddings,
        add_pos);

    // remove CLS token
    cur = ggml_view_2d(ctx0, cur,
        n_embd, n_patches,
        ggml_row_size(cur->type, n_embd), 0);

    // pixel shuffle
    // based on Llama4VisionPixelShuffleMLP
    // https://github.com/huggingface/transformers/blob/2932f318a20d9e54cc7aea052e040164d85de7d6/src/transformers/models/llama4/modeling_llama4.py#L1151
    {
        const int scale_factor = model.hparams.n_merge;
        const int bsz = 1; // batch size, always 1 for now since we don't support batching
        GGML_ASSERT(scale_factor > 0);
        GGML_ASSERT(n_patches_x == n_patches_y); // llama4 only supports square images
        cur = ggml_reshape_4d(ctx0, cur,
            n_embd * scale_factor,
            n_patches_x / scale_factor,
            n_patches_y,
            bsz);
        cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);
        cur = ggml_cont_4d(ctx0, cur,
            n_embd * scale_factor * scale_factor,
            n_patches_x / scale_factor,
            n_patches_y / scale_factor,
            bsz);
        //cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);
        // flatten to 2D
        cur = ggml_cont_2d(ctx0, cur,
            n_embd * scale_factor * scale_factor,
            n_patches / scale_factor / scale_factor);
        cb(cur, "pixel_shuffle", -1);
    }

    // based on Llama4VisionMLP2 (always uses GELU activation, no bias)
    {
        cur = ggml_mul_mat(ctx0, model.mm_model_mlp_1_w, cur);
        cur = ggml_gelu(ctx0, cur);
        cur = ggml_mul_mat(ctx0, model.mm_model_mlp_2_w, cur);
        cur = ggml_gelu(ctx0, cur);
        cb(cur, "adapter_mlp", -1);
    }

    // Llama4MultiModalProjector
    cur = ggml_mul_mat(ctx0, model.mm_model_proj, cur);
    cb(cur, "projected", -1);

    // build the graph
    ggml_build_forward_expand(gf, cur);

    return gf;
}
diff --git a/llama.cpp/tools/mtmd/models/llava.cpp b/llama.cpp/tools/mtmd/models/llava.cpp
new file mode 100644
index 0000000..0bfb5f0
--- /dev/null
+++ b/llama.cpp/tools/mtmd/models/llava.cpp
@@ -0,0 +1,374 @@
#include "models.h"

// this graph is used by llava, granite and glm
// due to having embedding_stack (used by granite), we cannot reuse build_vit
ggml_cgraph * clip_graph_llava::build() {
    const int batch_size = 1;
    const int n_pos = n_patches + (model.class_embedding ?
1 : 0); + + GGML_ASSERT(n_patches_x == n_patches_y && "only square images supported"); + + // Calculate the deepest feature layer based on hparams and projector type + int max_feature_layer = n_layer; + { + // Get the index of the second to last layer; this is the default for models that have a llava projector + int il_last = hparams.n_layer - 1; + int deepest_feature_layer = -1; + + if (proj_type == PROJECTOR_TYPE_MINICPMV || proj_type == PROJECTOR_TYPE_GLM_EDGE) { + il_last += 1; + } + + // If we set explicit vision feature layers, only go up to the deepest one + // NOTE: only used by granite-vision models for now + for (const auto & feature_layer : hparams.vision_feature_layer) { + if (feature_layer > deepest_feature_layer) { + deepest_feature_layer = feature_layer; + } + } + max_feature_layer = deepest_feature_layer < 0 ? il_last : deepest_feature_layer; + } + + ggml_tensor * inp = build_inp(); + + // concat class_embeddings and patch_embeddings + if (model.class_embedding) { + inp = ggml_concat(ctx0, inp, model.class_embedding, 1); + } + + ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos); + ggml_set_name(positions, "positions"); + ggml_set_input(positions); + + inp = ggml_add(ctx0, inp, ggml_get_rows(ctx0, model.position_embeddings, positions)); + + ggml_tensor * inpL = inp; + + // pre-layernorm + if (model.pre_ln_w) { + inpL = build_norm(inpL, model.pre_ln_w, model.pre_ln_b, NORM_TYPE_NORMAL, eps, -1); + cb(inpL, "pre_ln", -1); + } + + std::vector<ggml_tensor *> embedding_stack; + const auto & vision_feature_layer = hparams.vision_feature_layer; + + // loop over layers + for (int il = 0; il < max_feature_layer; il++) { + auto & layer = model.layers[il]; + ggml_tensor * cur = inpL; // inpL = residual, cur = hidden_states + + // If this is an embedding feature layer, save the output. + // NOTE: 0 index here refers to the input to the encoder. 
+ if (vision_feature_layer.find(il) != vision_feature_layer.end()) { + embedding_stack.push_back(cur); + } + + // layernorm1 + cur = build_norm(cur, layer.ln_1_w, layer.ln_1_b, NORM_TYPE_NORMAL, eps, il); + cb(cur, "layer_inp_normed", il); + + // self-attention + { + ggml_tensor * Qcur = ggml_mul_mat(ctx0, layer.q_w, cur); + if (layer.q_b) { + Qcur = ggml_add(ctx0, Qcur, layer.q_b); + } + + ggml_tensor * Kcur = ggml_mul_mat(ctx0, layer.k_w, cur); + if (layer.k_b) { + Kcur = ggml_add(ctx0, Kcur, layer.k_b); + } + + ggml_tensor * Vcur = ggml_mul_mat(ctx0, layer.v_w, cur); + if (layer.v_b) { + Vcur = ggml_add(ctx0, Vcur, layer.v_b); + } + + Qcur = ggml_reshape_3d(ctx0, Qcur, d_head, n_head, n_pos); + Kcur = ggml_reshape_3d(ctx0, Kcur, d_head, n_head, n_pos); + Vcur = ggml_reshape_3d(ctx0, Vcur, d_head, n_head, n_pos); + + cb(Qcur, "Qcur", il); + cb(Kcur, "Kcur", il); + cb(Vcur, "Vcur", il); + + cur = build_attn(layer.o_w, layer.o_b, + Qcur, Kcur, Vcur, nullptr, kq_scale, il); + cb(cur, "attn_out", il); + } + + // re-add the layer input, e.g., residual + cur = ggml_add(ctx0, cur, inpL); + + inpL = cur; // inpL = residual, cur = hidden_states + + cb(cur, "ffn_inp", il); + + // layernorm2 + cur = build_norm(cur, layer.ln_2_w, layer.ln_2_b, NORM_TYPE_NORMAL, eps, il); + cb(cur, "ffn_inp_normed", il); + + // ffn + cur = build_ffn(cur, + layer.ff_up_w, layer.ff_up_b, + layer.ff_gate_w, layer.ff_gate_b, + layer.ff_down_w, layer.ff_down_b, + hparams.ffn_op, il); + + cb(cur, "ffn_out", il); + + // residual 2 + cur = ggml_add(ctx0, inpL, cur); + cb(cur, "layer_out", il); + + inpL = cur; + } + + // post-layernorm + if (model.post_ln_w) { + inpL = build_norm(inpL, model.post_ln_w, model.post_ln_b, NORM_TYPE_NORMAL, eps, -1); + } + + ggml_tensor * embeddings = inpL; + + // process vision feature layers (used by granite) + { + // final layer is a vision feature layer + if (vision_feature_layer.find(max_feature_layer) != vision_feature_layer.end()) { + 
embedding_stack.push_back(inpL); + } + + // If feature layers are explicitly set, stack them (if we have multiple) + if (!embedding_stack.empty()) { + embeddings = embedding_stack[0]; + for (size_t i = 1; i < embedding_stack.size(); i++) { + embeddings = ggml_concat(ctx0, embeddings, embedding_stack[i], 0); + } + } + } + + // llava projector (also used by granite) + if (hparams.has_llava_projector) { + embeddings = ggml_reshape_2d(ctx0, embeddings, embeddings->ne[0], embeddings->ne[1]); + + ggml_tensor * patches = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_patches); + ggml_set_name(patches, "patches"); + ggml_set_input(patches); + + // shape [1, 576, 1024] + // ne is whcn, ne = [1024, 576, 1, 1] + embeddings = ggml_get_rows(ctx0, embeddings, patches); + + // print_tensor_info(embeddings, "embeddings"); + + // llava projector + if (proj_type == PROJECTOR_TYPE_MLP) { + embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings); + embeddings = ggml_add(ctx0, embeddings, model.mm_0_b); + + embeddings = ggml_gelu(ctx0, embeddings); + if (model.mm_2_w) { + embeddings = ggml_mul_mat(ctx0, model.mm_2_w, embeddings); + embeddings = ggml_add(ctx0, embeddings, model.mm_2_b); + } + } + else if (proj_type == PROJECTOR_TYPE_MLP_NORM) { + embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings); + embeddings = ggml_add(ctx0, embeddings, model.mm_0_b); + // ggml_tensor_printf(embeddings, "mm_0_w",0,true,false); + // First LayerNorm + embeddings = ggml_norm(ctx0, embeddings, eps); + embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.mm_1_w), + model.mm_1_b); + + // GELU activation + embeddings = ggml_gelu(ctx0, embeddings); + + // Second linear layer + embeddings = ggml_mul_mat(ctx0, model.mm_3_w, embeddings); + embeddings = ggml_add(ctx0, embeddings, model.mm_3_b); + + // Second LayerNorm + embeddings = ggml_norm(ctx0, embeddings, eps); + embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.mm_4_w), + model.mm_4_b); + } + else if (proj_type == PROJECTOR_TYPE_LDP) 
{ + // MobileVLM projector + int n_patch = 24; + ggml_tensor * mlp_1 = ggml_mul_mat(ctx0, model.mm_model_mlp_1_w, embeddings); + mlp_1 = ggml_add(ctx0, mlp_1, model.mm_model_mlp_1_b); + mlp_1 = ggml_gelu(ctx0, mlp_1); + ggml_tensor * mlp_3 = ggml_mul_mat(ctx0, model.mm_model_mlp_3_w, mlp_1); + mlp_3 = ggml_add(ctx0, mlp_3, model.mm_model_mlp_3_b); + // mlp_3 shape = [1, 576, 2048], ne = [2048, 576, 1, 1] + + // block 1 + ggml_tensor * block_1 = nullptr; + { + // transpose from [1, 576, 2048] --> [1, 2048, 576] --> [1, 2048, 24, 24] + mlp_3 = ggml_permute(ctx0, mlp_3, 1, 0, 2, 3); + mlp_3 = ggml_cont_4d(ctx0, mlp_3, n_patch, n_patch, mlp_3->ne[1], mlp_3->ne[2]); + // stride = 1, padding = 1, bias is nullptr + block_1 = ggml_conv_2d_dw(ctx0, model.mm_model_block_1_block_0_0_w, mlp_3, 1, 1, 1, 1, 1, 1); + + // layer norm + // // block_1 shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1] + block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 2, 0, 3)); + // block_1 shape = [1, 24, 24, 2048], ne = [2048, 24, 24, 1] + block_1 = ggml_norm(ctx0, block_1, eps); + block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_1_block_0_1_w), model.mm_model_block_1_block_0_1_b); + block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 2, 0, 1, 3)); + + // block_1 shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1] + // hardswish + ggml_tensor * block_1_hw = ggml_hardswish(ctx0, block_1); + + block_1 = ggml_pool_2d(ctx0, block_1_hw, GGML_OP_POOL_AVG, block_1_hw->ne[0], block_1_hw->ne[1], block_1_hw->ne[0], block_1_hw->ne[1], 0, 0); + // block_1 shape = [1, 2048, 1, 1], ne = [1, 1, 2048, 1] + // pointwise conv + block_1 = ggml_reshape_2d(ctx0, block_1, block_1->ne[0]*block_1->ne[1]*block_1->ne[2], block_1->ne[3]); + block_1 = ggml_mul_mat(ctx0, model.mm_model_block_1_block_1_fc1_w, block_1); + block_1 = ggml_add(ctx0, block_1, model.mm_model_block_1_block_1_fc1_b); + block_1 = ggml_relu(ctx0, block_1); + block_1 = ggml_mul_mat(ctx0, model.mm_model_block_1_block_1_fc2_w, 
block_1); + block_1 = ggml_add(ctx0, block_1, model.mm_model_block_1_block_1_fc2_b); + block_1 = ggml_hardsigmoid(ctx0, block_1); + // block_1_hw shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1], block_1 shape = [1, 2048], ne = [2048, 1, 1, 1] + block_1 = ggml_reshape_4d(ctx0, block_1, 1, 1, block_1->ne[0], block_1->ne[1]); + block_1 = ggml_mul(ctx0, block_1_hw, block_1); + + int w = block_1->ne[0], h = block_1->ne[1]; + block_1 = ggml_reshape_3d(ctx0, block_1, w*h, block_1->ne[2], block_1->ne[3]); + block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 0, 2, 3)); + + // block_1 shape = [1, 24*24, 2048], ne = [24*24, 2048, 1] + block_1 = ggml_mul_mat(ctx0, model.mm_model_block_1_block_2_0_w, block_1); + block_1 = ggml_reshape_4d(ctx0, block_1, block_1->ne[0], w, h, block_1->ne[3]); + + // block_1 shape = [1, 24, 24, 2048], ne = [2048, 24, 24, 1] + block_1 = ggml_norm(ctx0, block_1, eps); + block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_1_block_2_1_w), model.mm_model_block_1_block_2_1_b); + block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 2, 0, 1, 3)); + // block1 shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1] + // residual + block_1 = ggml_add(ctx0, mlp_3, block_1); + } + + // block_2 + { + // stride = 2 + block_1 = ggml_conv_2d_dw(ctx0, model.mm_model_block_2_block_0_0_w, block_1, 2, 2, 1, 1, 1, 1); + + // block_1 shape = [1, 2048, 12, 12], ne = [12, 12, 2048, 1] + // layer norm + block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 2, 0, 3)); + // block_1 shape = [1, 12, 12, 2048], ne = [2048, 12, 12, 1] + block_1 = ggml_norm(ctx0, block_1, eps); + block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_2_block_0_1_w), model.mm_model_block_2_block_0_1_b); + block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 2, 0, 1, 3)); + // block_1 shape = [1, 2048, 12, 12], ne = [12, 12, 2048, 1] + // hardswish + ggml_tensor * block_1_hw = ggml_hardswish(ctx0, block_1); + + // not sure the parameters is right for 
globalAvgPooling + block_1 = ggml_pool_2d(ctx0, block_1_hw, GGML_OP_POOL_AVG, block_1_hw->ne[0], block_1_hw->ne[1], block_1_hw->ne[0], block_1_hw->ne[1], 0, 0); + // block_1 shape = [1, 2048, 1, 1], ne = [1, 1, 2048, 1] + // pointwise conv + block_1 = ggml_reshape_2d(ctx0, block_1, block_1->ne[0]*block_1->ne[1]*block_1->ne[2], block_1->ne[3]); + block_1 = ggml_mul_mat(ctx0, model.mm_model_block_2_block_1_fc1_w, block_1); + block_1 = ggml_add(ctx0, block_1, model.mm_model_block_2_block_1_fc1_b); + block_1 = ggml_relu(ctx0, block_1); + block_1 = ggml_mul_mat(ctx0, model.mm_model_block_2_block_1_fc2_w, block_1); + block_1 = ggml_add(ctx0, block_1, model.mm_model_block_2_block_1_fc2_b); + block_1 = ggml_hardsigmoid(ctx0, block_1); + + // block_1_hw shape = [1, 2048, 12, 12], ne = [12, 12, 2048, 1], block_1 shape = [1, 2048, 1, 1], ne = [1, 1, 2048, 1] + block_1 = ggml_reshape_4d(ctx0, block_1, 1, 1, block_1->ne[0], block_1->ne[1]); + block_1 = ggml_mul(ctx0, block_1_hw, block_1); + + int w = block_1->ne[0], h = block_1->ne[1]; + block_1 = ggml_reshape_3d(ctx0, block_1, w*h, block_1->ne[2], block_1->ne[3]); + block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 0, 2, 3)); + // block_1 shape = [1, 24*24, 2048], ne = [24*24, 2048, 1] + block_1 = ggml_mul_mat(ctx0, model.mm_model_block_2_block_2_0_w, block_1); + block_1 = ggml_reshape_4d(ctx0, block_1, block_1->ne[0], w, h, block_1->ne[3]); + + + // block_1 shape = [1, 12, 12, 2048], ne = [2048, 12, 12, 1] + block_1 = ggml_norm(ctx0, block_1, eps); + block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_2_block_2_1_w), model.mm_model_block_2_block_2_1_b); + block_1 = ggml_reshape_3d(ctx0, block_1, block_1->ne[0], block_1->ne[1] * block_1->ne[2], block_1->ne[3]); + // block_1 shape = [1, 144, 2048], ne = [2048, 144, 1] + } + embeddings = block_1; + } + else if (proj_type == PROJECTOR_TYPE_LDPV2) + { + int n_patch = 24; + ggml_tensor * mlp_0 = ggml_mul_mat(ctx0, model.mm_model_mlp_0_w, embeddings); + 
mlp_0 = ggml_add(ctx0, mlp_0, model.mm_model_mlp_0_b); + mlp_0 = ggml_gelu(ctx0, mlp_0); + ggml_tensor * mlp_2 = ggml_mul_mat(ctx0, model.mm_model_mlp_2_w, mlp_0); + mlp_2 = ggml_add(ctx0, mlp_2, model.mm_model_mlp_2_b); + // mlp_2 ne = [2048, 576, 1, 1] + // // AVG Pool Layer 2*2, strides = 2 + mlp_2 = ggml_permute(ctx0, mlp_2, 1, 0, 2, 3); + // mlp_2 ne = [576, 2048, 1, 1] + mlp_2 = ggml_cont_4d(ctx0, mlp_2, n_patch, n_patch, mlp_2->ne[1], mlp_2->ne[2]); + // mlp_2 ne [24, 24, 2048, 1] + mlp_2 = ggml_pool_2d(ctx0, mlp_2, GGML_OP_POOL_AVG, 2, 2, 2, 2, 0, 0); + // weight ne = [3, 3, 2048, 1] + ggml_tensor * peg_0 = ggml_conv_2d_dw(ctx0, model.mm_model_peg_0_w, mlp_2, 1, 1, 1, 1, 1, 1); + peg_0 = ggml_cont(ctx0, ggml_permute(ctx0, peg_0, 1, 2, 0, 3)); + peg_0 = ggml_add(ctx0, peg_0, model.mm_model_peg_0_b); + mlp_2 = ggml_cont(ctx0, ggml_permute(ctx0, mlp_2, 1, 2, 0, 3)); + peg_0 = ggml_add(ctx0, peg_0, mlp_2); + peg_0 = ggml_reshape_3d(ctx0, peg_0, peg_0->ne[0], peg_0->ne[1] * peg_0->ne[2], peg_0->ne[3]); + embeddings = peg_0; + } + else { + GGML_ABORT("fatal error"); + } + } + + // glm projector + else if (proj_type == PROJECTOR_TYPE_GLM_EDGE) { + size_t gridsz = (size_t)sqrt(embeddings->ne[1]); + embeddings = ggml_permute(ctx0,embeddings,1,0,2,3); + embeddings = ggml_cont_3d(ctx0, embeddings, gridsz, gridsz, embeddings->ne[1]); + embeddings = ggml_conv_2d(ctx0, model.mm_model_adapter_conv_w, embeddings, 2, 2, 0, 0, 1, 1); + embeddings = ggml_reshape_3d(ctx0, embeddings,embeddings->ne[0]*embeddings->ne[1] , embeddings->ne[2], batch_size); + embeddings = ggml_cont(ctx0, ggml_permute(ctx0,embeddings, 1, 0, 2, 3)); + embeddings = ggml_add(ctx0, embeddings, model.mm_model_adapter_conv_b); + // GLU + { + embeddings = ggml_mul_mat(ctx0, model.mm_model_mlp_0_w, embeddings); + embeddings = ggml_norm(ctx0, embeddings, eps); + embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.mm_model_ln_q_w), model.mm_model_ln_q_b); + embeddings = ggml_gelu_inplace(ctx0, 
embeddings); + ggml_tensor * x = embeddings; + embeddings = ggml_mul_mat(ctx0, model.mm_model_mlp_2_w, embeddings); + x = ggml_mul_mat(ctx0, model.mm_model_mlp_1_w,x); + embeddings = ggml_swiglu_split(ctx0, embeddings, x); + embeddings = ggml_mul_mat(ctx0, model.mm_model_mlp_3_w, embeddings); + } + // arrangement of BOI/EOI token embeddings + // note: these embeddings are not present in text model, hence we cannot process them as text tokens + // see: https://huggingface.co/THUDM/glm-edge-v-2b/blob/main/siglip.py#L53 + { + embeddings = ggml_concat(ctx0, model.mm_boi, embeddings, 1); // BOI + embeddings = ggml_concat(ctx0, embeddings, model.mm_eoi, 1); // EOI + } + } + + else { + GGML_ABORT("llava: unknown projector type"); + } + + // build the graph + ggml_build_forward_expand(gf, embeddings); + + return gf; +} diff --git a/llama.cpp/tools/mtmd/models/minicpmv.cpp b/llama.cpp/tools/mtmd/models/minicpmv.cpp new file mode 100644 index 0000000..3594ea2 --- /dev/null +++ b/llama.cpp/tools/mtmd/models/minicpmv.cpp @@ -0,0 +1,114 @@ +#include "models.h" + +ggml_cgraph * clip_graph_minicpmv::build() { + GGML_ASSERT(model.class_embedding == nullptr); + const int n_pos = n_patches; + const int n_embd_proj = n_mmproj_embd; + + // position embeddings for the projector (not for ViT) + // see: https://huggingface.co/openbmb/MiniCPM-o-2_6/blob/main/resampler.py#L70 + // base frequency omega + ggml_tensor * omega = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, n_embd_proj / 4); + ggml_set_name(omega, "omega"); + ggml_set_input(omega); + + // 2D input positions (using float for sinusoidal embeddings) + ggml_tensor * pos_h = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, 1, n_pos); + ggml_set_name(pos_h, "pos_h"); + ggml_set_input(pos_h); + ggml_tensor * pos_w = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, 1, n_pos); + ggml_set_name(pos_w, "pos_w"); + ggml_set_input(pos_w); + + // for selecting learned pos embd, used by ViT + struct ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, 
n_pos); + ggml_set_name(positions, "positions"); + ggml_set_input(positions); + + ggml_tensor * learned_pos_embd = ggml_get_rows(ctx0, model.position_embeddings, positions); + + ggml_tensor * inp = build_inp(); + ggml_tensor * embeddings = build_vit( + inp, n_pos, + NORM_TYPE_NORMAL, + hparams.ffn_op, + learned_pos_embd, + nullptr); + + // resampler projector (it is just another transformer) + + ggml_tensor * q = model.mm_model_query; + ggml_tensor * v = ggml_mul_mat(ctx0, model.mm_model_kv_proj, embeddings); + + // norm + q = build_norm(q, model.mm_model_ln_q_w, model.mm_model_ln_q_b, NORM_TYPE_NORMAL, eps, -1); + v = build_norm(v, model.mm_model_ln_kv_w, model.mm_model_ln_kv_b, NORM_TYPE_NORMAL, eps, -1); + + // calculate sinusoidal pos embd + ggml_tensor * pos_embed = nullptr; + { + // outer product + ggml_tensor * omega_b = ggml_repeat_4d(ctx0, omega, omega->ne[0], n_pos, 1, 1); // n_pos rows + ggml_tensor * theta_x = ggml_mul(ctx0, omega_b, pos_w); + ggml_tensor * theta_y = ggml_mul(ctx0, omega_b, pos_h); + // sin and cos + ggml_tensor * pos_embd_x = ggml_concat( + ctx0, + ggml_sin(ctx0, theta_x), + ggml_cos(ctx0, theta_x), + 0 // concat on first dim + ); + ggml_tensor * pos_embd_y = ggml_concat( + ctx0, + ggml_sin(ctx0, theta_y), + ggml_cos(ctx0, theta_y), + 0 // concat on first dim + ); + pos_embed = ggml_concat(ctx0, pos_embd_x, pos_embd_y, 0); + } + + // k = v + pos_embed + ggml_tensor * k = ggml_add(ctx0, v, pos_embed); + + // attention + { + const int d_head = 128; + int n_head = n_embd_proj/d_head; + // Use actual config value if available, otherwise fall back to hardcoded values + int num_query = hparams.minicpmv_query_num; + ggml_tensor * Q = ggml_add(ctx0, + ggml_mul_mat(ctx0, model.mm_model_attn_q_w, q), + model.mm_model_attn_q_b); + ggml_tensor * K = ggml_add(ctx0, + ggml_mul_mat(ctx0, model.mm_model_attn_k_w, k), + model.mm_model_attn_k_b); + ggml_tensor * V = ggml_add(ctx0, + ggml_mul_mat(ctx0, model.mm_model_attn_v_w, v), + 
model.mm_model_attn_v_b); + + Q = ggml_reshape_3d(ctx0, Q, d_head, n_head, num_query); + K = ggml_reshape_3d(ctx0, K, d_head, n_head, n_pos); + V = ggml_reshape_3d(ctx0, V, d_head, n_head, n_pos); + + cb(Q, "resampler_Q", -1); + cb(K, "resampler_K", -1); + cb(V, "resampler_V", -1); + + float resampler_kq_scale = 1.0f/ sqrtf(float(d_head)); + embeddings = build_attn( + model.mm_model_attn_o_w, + model.mm_model_attn_o_b, + Q, K, V, nullptr, resampler_kq_scale, -1); + cb(embeddings, "resampler_attn_out", -1); + } + // layernorm + embeddings = build_norm(embeddings, model.mm_model_ln_post_w, model.mm_model_ln_post_b, NORM_TYPE_NORMAL, eps, -1); + + // projection + embeddings = ggml_mul_mat(ctx0, model.mm_model_proj, embeddings); + + // build the graph + ggml_build_forward_expand(gf, embeddings); + + return gf; +} diff --git a/llama.cpp/tools/mtmd/models/mobilenetv5.cpp b/llama.cpp/tools/mtmd/models/mobilenetv5.cpp new file mode 100644 index 0000000..593afa1 --- /dev/null +++ b/llama.cpp/tools/mtmd/models/mobilenetv5.cpp @@ -0,0 +1,451 @@ +#include "models.h" + +// Helpers for MobileNetV5 Blocks +// RMS Norm 2D - normalizes over channels for each spatial position +ggml_tensor * clip_graph_mobilenetv5::rms_norm_2d(ggml_tensor * inp, ggml_tensor * weight, float eps) { + // inp: [W, H, C, B] + + ggml_tensor * cur = ggml_permute(ctx0, inp, 2, 1, 0, 3); + cur = ggml_cont(ctx0, cur); + cur = ggml_rms_norm(ctx0, cur, eps); + + if (weight) { + cur = ggml_mul(ctx0, cur, weight); + } + + cur = ggml_permute(ctx0, cur, 2, 1, 0, 3); + cur = ggml_cont(ctx0, cur); + + return cur; +} + +// Conv2dSame padding - asymmetric SAME padding like PyTorch/TF +ggml_tensor* clip_graph_mobilenetv5::pad_same_2d(ggml_tensor* inp, int kernel_h, int kernel_w, int stride_h, int stride_w, int dilation_h, int dilation_w) { + const int64_t ih = inp->ne[1]; // height + const int64_t iw = inp->ne[0]; // width + + // Calculate output size (ceil division) + const int64_t oh = (ih + stride_h - 1) / stride_h; + 
const int64_t ow = (iw + stride_w - 1) / stride_w; + + // Calculate padding needed + const int64_t pad_h = std::max((int64_t)0, (oh - 1) * stride_h + (kernel_h - 1) * dilation_h + 1 - ih); + const int64_t pad_w = std::max((int64_t)0, (ow - 1) * stride_w + (kernel_w - 1) * dilation_w + 1 - iw); + + // Split padding asymmetrically + const int pad_h_top = pad_h / 2; + const int pad_h_bottom = pad_h - pad_h_top; + const int pad_w_left = pad_w / 2; + const int pad_w_right = pad_w - pad_w_left; + + // Apply padding if needed + // ggml_pad_ext: (ctx, tensor, lp0, rp0, lp1, rp1, lp2, rp2, lp3, rp3) + // For [W, H, C, B]: p0=width, p1=height, p2=channels, p3=batch + if (pad_h > 0 || pad_w > 0) { + inp = ggml_pad_ext(ctx0, inp, + pad_w_left, pad_w_right, // width padding (dim 0) + pad_h_top, pad_h_bottom, // height padding (dim 1) + 0, 0, // no channel padding (dim 2) + 0, 0); // no batch padding (dim 3) + } + + return inp; +} + + +// Edge Residual Block (Stage 0) +ggml_tensor * clip_graph_mobilenetv5::build_edge_residual(ggml_tensor * inp, const mobilenetv5_block & block, int stride) { + ggml_tensor * cur = inp; + + // 1. Expansion Conv (3x3) + if (stride == 2) { + // Case: Downsampling (Block 0) + // Replicates Conv2dSame(kernel=3, stride=2) + cur = pad_same_2d(cur, 3, 3, stride, stride); + cur = ggml_conv_2d_direct(ctx0, block.s0_conv_exp_w, cur, stride, stride, 0, 0, 1, 1); + } else { + // Case: Normal 3x3 Block (Block 1, 2) + // Replicates Conv2d(kernel=3, stride=1, padding=1) + cur = ggml_conv_2d_direct(ctx0, block.s0_conv_exp_w, cur, stride, stride, 1, 1, 1, 1); + } + + // BN + Activation + if (block.s0_bn1_w) cur = rms_norm_2d(cur, block.s0_bn1_w); + cur = ggml_gelu(ctx0, cur); + + // 2. Pointwise Linear Conv (1x1) + // 1x1 Convs usually have padding=0 and stride=1 + cur = ggml_conv_2d_direct(ctx0, block.s0_conv_pwl_w, cur, 1, 1, 0, 0, 1, 1); + if (block.s0_bn2_w) cur = rms_norm_2d(cur, block.s0_bn2_w); + + // 3. 
Residual Connection + // Only apply residual if spatial dimensions and channels match (stride 1) + if (stride == 1 && inp->ne[2] == cur->ne[2] && inp->ne[0] == cur->ne[0]) { + cur = ggml_add(ctx0, cur, inp); + } + + return cur; +} + +// Universal Inverted Residual Block (Stage 1+) +ggml_tensor * clip_graph_mobilenetv5::build_inverted_residual(ggml_tensor * inp, const mobilenetv5_block & block, int stride) { + ggml_tensor * cur = inp; + + // 1. Depthwise Start (Optional) + // NOTE: dw_start always has stride=1 (no downsampling here) + if (block.dw_start_w) { + int k = block.dw_start_w->ne[0]; // 3 or 5 + int p = k / 2; + cur = ggml_conv_2d_dw(ctx0, block.dw_start_w, cur, 1, 1, p, p, 1, 1); + if (block.dw_start_bn_w) cur = rms_norm_2d(cur, block.dw_start_bn_w); + } + + // 2. Pointwise Expansion (1x1) + if (block.pw_exp_w) { + // Standard 1x1 conv, pad=0, stride=1 + cur = ggml_conv_2d_direct(ctx0, block.pw_exp_w, cur, 1, 1, 0, 0, 1, 1); + if (block.pw_exp_bn_w) cur = rms_norm_2d(cur, block.pw_exp_bn_w); + cur = ggml_gelu(ctx0, cur); + } + + // 3. Depthwise Mid (Optional) + // NOTE: dw_mid is where downsampling happens (stride=2 for first block of stage) + if (block.dw_mid_w) { + int k = block.dw_mid_w->ne[0]; // 3 or 5 + + if (stride > 1) { + // Case: Stride 2 (Downsample) -> Use Asymmetric "Same" Padding + cur = pad_same_2d(cur, k, k, stride, stride); + cur = ggml_conv_2d_dw(ctx0, block.dw_mid_w, cur, stride, stride, 0, 0, 1, 1); // pad=0 + } else { + // Case: Stride 1 -> Use Standard Symmetric Padding + int p = k / 2; + cur = ggml_conv_2d_dw(ctx0, block.dw_mid_w, cur, stride, stride, p, p, 1, 1); + } + + if (block.dw_mid_bn_w) cur = rms_norm_2d(cur, block.dw_mid_bn_w); + cur = ggml_gelu(ctx0, cur); + } + + // 4. 
Pointwise Projection (1x1) + if (block.pw_proj_w) { + cur = ggml_conv_2d_direct(ctx0, block.pw_proj_w, cur, 1, 1, 0, 0, 1, 1); + if (block.pw_proj_bn_w) cur = rms_norm_2d(cur, block.pw_proj_bn_w); + } + + // Apply Layer Scaling if present + if (block.layer_scale_w) { + cur = ggml_mul(ctx0, cur, block.layer_scale_w); + } + + // 5. Residual Connection + bool same_spatial = (inp->ne[0] == cur->ne[0]) && (inp->ne[1] == cur->ne[1]); + bool same_channel = (inp->ne[2] == cur->ne[2]); + if (same_spatial && same_channel) { + cur = ggml_add(ctx0, cur, inp); + } + + return cur; +} + +// Attention Block (MQA) +ggml_tensor * clip_graph_mobilenetv5::build_mobilenet_attn(ggml_tensor * inp, const mobilenetv5_block & block) { + ggml_tensor * cur = inp; + + // Norm + if (block.attn_norm_w) { + cur = rms_norm_2d(cur, block.attn_norm_w, 1e-6f); + } + + // 1. Q Calculation + ggml_tensor * q = ggml_conv_2d_direct(ctx0, block.attn_q_w, cur, 1, 1, 0, 0, 1, 1); + + // 2. K Calculation (Downsampled) + // Uses Conv2dSame(640, 640, kernel_size=(3, 3), stride=(2, 2), groups=640) + ggml_tensor * k_inp = cur; + if (block.attn_k_dw_w) { + int k_size = block.attn_k_dw_w->ne[0]; // Usually 3 + k_inp = pad_same_2d(cur, k_size, k_size, 2, 2); // Apply SAME padding + k_inp = ggml_conv_2d_dw(ctx0, block.attn_k_dw_w, k_inp, 2, 2, 0, 0, 1, 1); // padding=0 + if (block.attn_k_norm_w) { + k_inp = rms_norm_2d(k_inp, block.attn_k_norm_w, 1e-6f); + } + } + ggml_tensor * k = ggml_conv_2d_direct(ctx0, block.attn_k_w, k_inp, 1, 1, 0, 0, 1, 1); + + // 3. 
V Calculation (Downsampled) + // Uses Conv2dSame(640, 640, kernel_size=(3, 3), stride=(2, 2), groups=640) + ggml_tensor * v_inp = cur; + if (block.attn_v_dw_w) { + int v_size = block.attn_v_dw_w->ne[0]; // Usually 3 + v_inp = pad_same_2d(cur, v_size, v_size, 2, 2); // Apply SAME padding + v_inp = ggml_conv_2d_dw(ctx0, block.attn_v_dw_w, v_inp, 2, 2, 0, 0, 1, 1); // padding=0 + if (block.attn_v_norm_w) { + v_inp = rms_norm_2d(v_inp, block.attn_v_norm_w, 1e-6f); + } + } + ggml_tensor * v = ggml_conv_2d_direct(ctx0, block.attn_v_w, v_inp, 1, 1, 0, 0, 1, 1); + + const int W = cur->ne[0]; const int H = cur->ne[1]; const int B = cur->ne[3]; + const int D = k->ne[2]; // Head dimension + const int n_head = q->ne[2] / D; + const int N = W * H; + + // Process Q: [W, H, D*n_head, B] -> [D, N, n_head, B] + q = ggml_reshape_3d(ctx0, q, N, D*n_head, B); + q = ggml_reshape_4d(ctx0, q, N, D, n_head, B); + q = ggml_permute(ctx0, q, 1, 0, 2, 3); // [D, N, n_head, B] + q = ggml_cont(ctx0, q); + + const int Wk = k->ne[0]; const int Hk = k->ne[1]; + const int M = Wk * Hk; + + // Process K: [Wk, Hk, D, B] -> [D, M, 1, B] + k = ggml_reshape_3d(ctx0, k, M, D, B); + k = ggml_reshape_4d(ctx0, k, M, D, 1, B); + k = ggml_permute(ctx0, k, 1, 0, 2, 3); // [D, M, 1, B] + k = ggml_cont(ctx0, k); + + // Process V: [Wk, Hk, D, B] -> [M, D, 1, B] + v = ggml_reshape_3d(ctx0, v, M, D, B); + v = ggml_reshape_4d(ctx0, v, M, D, 1, B); + v = ggml_cont(ctx0, v); // [M, D, 1, B] + + // Multi-Query Attention + float scale = 1.0f / sqrtf((float)D); + + // Step 1: Compute Q @ K.T + ggml_tensor * scores = ggml_mul_mat(ctx0, k, q); + + scores = ggml_scale(ctx0, scores, scale); + + scores = ggml_soft_max(ctx0, scores); + + ggml_tensor * kqv = ggml_mul_mat(ctx0, v, scores); + + kqv = ggml_permute(ctx0, kqv, 1, 0, 2, 3); + kqv = ggml_cont(ctx0, kqv); + + + kqv = ggml_reshape_3d(ctx0, kqv, N, D * n_head, B); + kqv = ggml_reshape_4d(ctx0, kqv, W, H, D * n_head, B); + kqv = ggml_cont(ctx0, kqv); + + // Output 
    // output projection (1x1 conv); kqv holds the attention output in [W, H, C, B] layout
    cur = ggml_conv_2d_direct(ctx0, block.attn_o_w, kqv, 1, 1, 0, 0, 1, 1);

    // Residual & Layer Scale: only add the skip connection when spatial size and
    // channel count are unchanged by this block (stride-1, same-width blocks).
    if (inp->ne[0] == cur->ne[0] && inp->ne[2] == cur->ne[2]) {
        if (block.layer_scale_w) {
            cur = ggml_mul(ctx0, cur, block.layer_scale_w);
        }
        cur = ggml_add(ctx0, cur, inp);
    }

    return cur;
}

// Build the full MobileNetV5 vision tower graph (Gemma 3n style):
//   stem conv -> MobileNet block stack -> Multi-Scale Fusion Adapter (MSFA)
//   -> Gemma 3n multimodal embedder (scale, soft-emb norm, projection, post norm).
// Returns the forward graph with the final [n_embd_text, n_tokens, batch] embeddings.
ggml_cgraph * clip_graph_mobilenetv5::build() {
    ggml_tensor * inp = build_inp_raw();

    // 1. Stem - Conv2dSame(3, 64, kernel_size=(3, 3), stride=(2, 2))
    ggml_tensor * cur = pad_same_2d(inp, 3, 3, 2, 2); // apply SAME padding manually ...

    cur = ggml_conv_2d_direct(ctx0, model.mobilenet_stem_conv_w, cur, 2, 2, 0, 0, 1, 1); // ... so the conv itself uses padding=0
    if (model.mobilenet_stem_conv_b) {
        cur = ggml_add(ctx0, cur, model.mobilenet_stem_conv_b);
    }
    if (model.mobilenet_stem_norm_w) cur = rms_norm_2d(cur, model.mobilenet_stem_norm_w);
    cur = ggml_gelu(ctx0, cur);

    // 2. Blocks
    std::vector<ggml_tensor*> intermediate_features;
    const int total_blocks = model.mobilenet_blocks.size();

    // a stage starts at block 0 and right after each recorded stage end;
    // stage starts downsample with stride 2, all other blocks use stride 1
    auto is_stage_start = [&](int i) {
        if (i == 0) return true;
        for (int end_idx : model.mobilenet_stage_ends) {
            if (i == end_idx + 1) return true;
        }
        return false;
    };

    // fusion points are where feature maps are captured for the MSFA below
    auto is_fusion_point = [&](int i) {
        if (model.mobilenet_stage_ends.size() >= 4) {
            if (i == model.mobilenet_stage_ends[2]) return true; // End of Stage 2
            if (i == model.mobilenet_stage_ends[3]) return true; // End of Stage 3
        } else {
            // fallback when stage metadata is short: capture only the last block
            if (i == total_blocks - 1) return true;
        }
        return false;
    };

    for (int i = 0; i < total_blocks; i++) {
        const auto & block = model.mobilenet_blocks[i];
        int stride = is_stage_start(i) ? 2 : 1;

        // dispatch on which weights are present: edge-residual, attention, or
        // universal inverted-residual block
        if (block.s0_conv_exp_w) cur = build_edge_residual(cur, block, stride);
        else if (block.attn_q_w) cur = build_mobilenet_attn(cur, block);
        else cur = build_inverted_residual(cur, block, stride);

        if (is_fusion_point(i)) {
            intermediate_features.push_back(cur);
        }
    }

    // 3. Multi-Scale Fusion Adapter (MSFA)
    if (!intermediate_features.empty()) {

        // A. Reference Resolution: PyTorch implementation uses inputs[0]
        // We assume intermediate_features[0] is the "High Resolution" target.
        // In MobileNet designs, this is typically the feature map with the smallest stride (e.g. 32x32).
        ggml_tensor* target_feat = intermediate_features[0];
        int high_res_w = target_feat->ne[0];
        int high_res_h = target_feat->ne[1];

        std::vector<ggml_tensor*> resized_feats;

        // B. Resize inputs to match inputs[0] (High Resolution)
        for (auto feat : intermediate_features) {
            int feat_w = feat->ne[0];
            int feat_h = feat->ne[1];

            // PyTorch: if feat_size < high_resolution: interpolate
            if (feat_w < high_res_w || feat_h < high_res_h) {
                // Calculate scale factor.
                // Note: PyTorch 'nearest' works on arbitrary float scales.
                // ggml_upscale generally takes integer factors or target sizes depending on helper.
                // Assuming standard power-of-2 scaling (e.g. 16 -> 32 means scale=2).
                int scale_w = high_res_w / feat_w;
                // int scale_h = high_res_h / feat_h;

                // Safety check for non-integer scaling if strictly replicating
                GGML_ASSERT(high_res_w % feat_w == 0);

                // Upsample (Nearest Neighbor) by the integer width scale factor
                // NOTE(review): only scale_w is used; assumes square scaling (scale_h == scale_w) — confirm
                feat = ggml_upscale(ctx0, feat, scale_w, ggml_scale_mode::GGML_SCALE_MODE_NEAREST);
            }
            resized_feats.push_back(feat);
        }

        // C. Concatenate at High Resolution (Channel Dim = 2 in ggml)
        cur = resized_feats[0];
        for (size_t k = 1; k < resized_feats.size(); ++k) {
            cur = ggml_concat(ctx0, cur, resized_feats[k], 2);
        }

        // D. FFN (UniversalInvertedResidual)
        // Structure: Expand Conv -> Norm -> GELU -> Project Conv -> Norm

        // 1. Expansion (1x1 conv)
        if (model.msfa_ffn_expand_w) {
            cur = ggml_conv_2d_direct(ctx0, model.msfa_ffn_expand_w, cur, 1, 1, 0, 0, 1, 1);

            if (model.msfa_ffn_expand_bn) {
                cur = rms_norm_2d(cur, model.msfa_ffn_expand_bn);
            }

            cur = ggml_gelu(ctx0, cur);
        }

        // 2. Projection (No DW because kernel_size=0)
        if (model.msfa_ffn_project_w) {
            // 1x1 Conv
            cur = ggml_conv_2d_direct(ctx0, model.msfa_ffn_project_w, cur, 1, 1, 0, 0, 1, 1);

            // UniversalInvertedResidual typically has a norm after projection
            if (model.msfa_ffn_project_bn) {
                cur = rms_norm_2d(cur, model.msfa_ffn_project_bn);
            }
        }

        // E. Final Downsample to Target Resolution (Output Resolution)
        // PyTorch: matches self.output_resolution (e.g. 16x16)
        const int target_out_res = 16;
        int current_w = cur->ne[0];

        if (current_w > target_out_res) {
            int s = current_w / target_out_res;

            GGML_ASSERT(current_w % target_out_res == 0);

            // Avg Pool: Kernel=s, Stride=s
            cur = ggml_pool_2d(ctx0, cur, GGML_OP_POOL_AVG, s, s, s, s, 0, 0);
        }

        // F. Final Norm
        if (model.msfa_concat_norm_w) {
            cur = rms_norm_2d(cur, model.msfa_concat_norm_w);
        }
    }

    // 4. Gemma 3n Multimodal Projection (Embedder)
    // Input: 'cur' is [Width, Height, Channels, Batch]
    int W = cur->ne[0];
    int H = cur->ne[1];
    int C = cur->ne[2];
    int B = cur->ne[3];

    GGML_ASSERT(C == hparams.n_embd);

    // 1. Permute and Flatten to [Channels, Tokens, Batch]
    // PyTorch expects (Batch, Seq, Hidden), GGML usually processes (Hidden, Seq, Batch)
    cur = ggml_permute(ctx0, cur, 2, 1, 0, 3); // -> [C, H, W, B]
    cur = ggml_permute(ctx0, cur, 0, 2, 1, 3); // -> [C, W, H, B]
    cur = ggml_cont(ctx0, cur);
    cur = ggml_reshape_3d(ctx0, cur, C, W*H, B);
    cur = ggml_cont(ctx0, cur);

    // 2. FEATURE SCALING
    // PyTorch: vision_outputs *= self.config.vision_config.hidden_size**0.5
    const float scale_factor = sqrtf((float)C);
    cur = ggml_scale(ctx0, cur, scale_factor);

    // 3. SOFT EMBEDDING NORM
    // PyTorch: self._norm(x) * self.weight
    // We must normalize regardless, then multiply if weight exists.
    {
        const float eps = 1e-6f; // Gemma3n uses 1e-6
        cur = ggml_rms_norm(ctx0, cur, eps);

        if (model.mm_soft_emb_norm_w) {
            // Weight shape is (2048,) -> Element-wise broadcast multiply
            cur = ggml_mul(ctx0, cur, model.mm_soft_emb_norm_w);
        }
    }

    // 4. PROJECTION
    // PyTorch: embedding_projection = nn.Linear(vision_hidden, text_hidden, bias=False)
    // Weight stored as [out_features, in_features] = [text_hidden_size, vision_hidden_size]
    if (model.mm_input_proj_w) {
        cur = ggml_mul_mat(ctx0, model.mm_input_proj_w, cur);
    }

    // 5. POST PROJECTION NORM
    // PyTorch: embedding_post_projection_norm = Gemma3nRMSNorm(..., with_scale=False)
    // with_scale=False means weight is registered as buffer with value 1.0
    // So output = rms_norm(x) * 1.0 = rms_norm(x), magnitude ~1
    {
        const float eps = 1e-6f;
        cur = ggml_rms_norm(ctx0, cur, eps);

        if (model.mm_post_proj_norm_w) {
            // If weight is loaded, multiply (should be ~1.0 anyway)
            cur = ggml_mul(ctx0, cur, model.mm_post_proj_norm_w);
        }
    }

    ggml_build_forward_expand(gf, cur);
    return gf;
}
#pragma once

#include "../clip-graph.h"

/*
 * IMPORTANT: The mtmd module does NOT accept pull requests that are fully or predominantly AI-generated.
 * We encourage human contributors to ensure the quality and reliability of the codebase.
 */

// One clip_graph subclass per supported encoder architecture. Each subclass
// implements build(), which constructs the ggml compute graph for a single
// pre-processed image (or audio spectrogram) `img`. Implementations live in
// the sibling .cpp files named after each architecture.

struct clip_graph_siglip : clip_graph {
    clip_graph_siglip(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
    ggml_cgraph * build() override;
};

struct clip_graph_pixtral : clip_graph {
    clip_graph_pixtral(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
    ggml_cgraph * build() override;
};

struct clip_graph_qwen2vl : clip_graph {
    clip_graph_qwen2vl(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
    ggml_cgraph * build() override;
};

struct clip_graph_qwen3vl : clip_graph {
    clip_graph_qwen3vl(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
    ggml_cgraph * build() override;
};

struct clip_graph_youtuvl : clip_graph {
    clip_graph_youtuvl(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
    ggml_cgraph * build() override;
};

struct clip_graph_minicpmv : clip_graph {
    clip_graph_minicpmv(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
    ggml_cgraph * build() override;
};

struct clip_graph_internvl : clip_graph {
    clip_graph_internvl(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
    ggml_cgraph * build() override;
};

struct clip_graph_llama4 : clip_graph {
    clip_graph_llama4(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
    ggml_cgraph * build() override;
};

struct clip_graph_kimivl : clip_graph {
    clip_graph_kimivl(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
    ggml_cgraph * build() override;
};

struct clip_graph_cogvlm : clip_graph {
    clip_graph_cogvlm(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
    ggml_cgraph * build() override;
};

struct clip_graph_llava : clip_graph {
    clip_graph_llava(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
    ggml_cgraph * build() override;
};

// audio encoders: for these, `img` carries mel-spectrogram frames rather than pixels
struct clip_graph_whisper_enc : clip_graph {
    clip_graph_whisper_enc(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
    ggml_cgraph * build() override;
};

struct clip_graph_conformer : clip_graph {
    clip_graph_conformer(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
    ggml_cgraph * build() override;
};

struct clip_graph_glm4v : clip_graph {
    clip_graph_glm4v(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
    ggml_cgraph * build() override;
};

// MobileNetV5 (Gemma 3n vision tower); exposes conv/attention helpers shared
// by the per-block builders in mobilenetv5.cpp
struct clip_graph_mobilenetv5 : clip_graph {
    clip_graph_mobilenetv5(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
    ggml_cgraph * build() override;

    // RMS norm applied over the channel dimension of a [W, H, C, B] feature map
    ggml_tensor * rms_norm_2d(
        ggml_tensor * inp,
        ggml_tensor * weight,
        float eps = 1e-6f);

    // replicate PyTorch Conv2dSame "SAME" padding ahead of a padding-0 conv
    ggml_tensor* pad_same_2d(
        ggml_tensor* inp,
        int kernel_h,
        int kernel_w,
        int stride_h,
        int stride_w,
        int dilation_h = 1,
        int dilation_w = 1);

    // the three MobileNetV5 block variants, selected per-block in build()
    ggml_tensor * build_edge_residual(
        ggml_tensor * inp,
        const mobilenetv5_block & block,
        int stride);

    ggml_tensor * build_inverted_residual(
        ggml_tensor * inp,
        const mobilenetv5_block & block,
        int stride);

    ggml_tensor * build_mobilenet_attn(
        ggml_tensor * inp,
        const mobilenetv5_block & block);
};

struct clip_graph_kimik25 : clip_graph {
    clip_graph_kimik25(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
    ggml_cgraph * build() override;

    // interpolate learned position embeddings to the current 3D patch grid
    ggml_tensor * resize_position_embeddings_3d(uint32_t interpolation_mode);
};
// Build the Pixtral / Mistral-Small vision graph: ViT with 2D RoPE (no learned
// position embeddings), optional patch merger, GELU projector, and optional
// [IMG_BREAK] token insertion at the end of each patch row.
ggml_cgraph * clip_graph_pixtral::build() {
    const int n_merge = hparams.n_merge;

    // 2D input positions (row/column index per patch), filled in at eval time
    ggml_tensor * pos_h = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_patches);
    ggml_set_name(pos_h, "pos_h");
    ggml_set_input(pos_h);

    ggml_tensor * pos_w = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_patches);
    ggml_set_name(pos_w, "pos_w");
    ggml_set_input(pos_w);

    // per-layer position-encoding hook: apply 2D RoPE to Q/K inside build_vit
    auto add_pos = [&](ggml_tensor * cur, const clip_layer &) {
        return build_rope_2d(ctx0, cur, pos_h, pos_w, hparams.rope_theta, true);
    };

    ggml_tensor * inp = build_inp();
    ggml_tensor * cur = build_vit(
        inp, n_patches,
        NORM_TYPE_RMS,
        hparams.ffn_op,
        nullptr, // no learned pos embd
        add_pos);

    // mistral small 3.1 patch merger
    // ref: https://github.com/huggingface/transformers/blob/7a3e208892c06a5e278144eaf38c8599a42f53e7/src/transformers/models/mistral3/modeling_mistral3.py#L67
    if (model.mm_patch_merger_w) {
        GGML_ASSERT(hparams.n_merge > 0);

        cur = ggml_mul(ctx0, ggml_rms_norm(ctx0, cur, eps), model.mm_input_norm_w);

        // reshape image tokens to 2D grid
        cur = ggml_reshape_3d(ctx0, cur, n_embd, n_patches_x, n_patches_y);
        cur = ggml_permute(ctx0, cur, 2, 0, 1, 3); // [x, y, n_embd]
        cur = ggml_cont(ctx0, cur);

        // torch.nn.functional.unfold is just an im2col under the hood
        // we just need a dummy kernel to make it work
        ggml_tensor * kernel = ggml_view_3d(ctx0, cur, n_merge, n_merge, cur->ne[2], 0, 0, 0);
        cur = ggml_im2col(ctx0, kernel, cur, n_merge, n_merge, 0, 0, 1, 1, true, inp->type);

        // project to n_embd
        cur = ggml_reshape_2d(ctx0, cur, cur->ne[0], cur->ne[1] * cur->ne[2]);
        cur = ggml_mul_mat(ctx0, model.mm_patch_merger_w, cur);
    }

    // LlavaMultiModalProjector (always using GELU activation)
    {
        cur = build_ffn(cur,
            model.mm_1_w, model.mm_1_b,
            nullptr, nullptr,
            model.mm_2_w, model.mm_2_b,
            FFN_GELU,
            -1);
    }

    // arrangement of the [IMG_BREAK] token
    if (model.token_embd_img_break) {
        // not efficient, but works
        // the trick is to view the embeddings as a 3D tensor with shape [n_embd, n_patches_per_row, n_rows]
        // and then concatenate the [IMG_BREAK] token to the end of each row, aka n_patches_per_row dimension
        // after the concatenation, we have a tensor with shape [n_embd, n_patches_per_row + 1, n_rows]

        const int p_y = n_merge > 0 ? n_patches_y / n_merge : n_patches_y;
        const int p_x = n_merge > 0 ? n_patches_x / n_merge : n_patches_x;
        const int p_total = p_x * p_y;
        const int n_embd_text = cur->ne[0];
        const int n_tokens_output = p_total + p_y - 1; // one [IMG_BREAK] per row, except the last row

        ggml_tensor * tmp = ggml_reshape_3d(ctx0, cur, n_embd_text, p_x, p_y);
        ggml_tensor * tok = ggml_new_tensor_3d(ctx0, tmp->type, n_embd_text, 1, p_y);
        tok = ggml_scale(ctx0, tok, 0.0); // clear the tensor (uninitialized memory otherwise)
        tok = ggml_add(ctx0, tok, model.token_embd_img_break); // broadcast the token embedding into each row slot
        tmp = ggml_concat(ctx0, tmp, tok, 1);
        // flatten back to 2D; the view drops the trailing [IMG_BREAK] after the last row
        cur = ggml_view_2d(ctx0, tmp,
            n_embd_text, n_tokens_output,
            ggml_row_size(tmp->type, n_embd_text), 0);
    }

    // build the graph
    ggml_build_forward_expand(gf, cur);

    return gf;
}
// Build the Qwen2-VL / Qwen2.5-VL vision graph: dual patch conv, M-RoPE,
// optional windowed attention (Qwen2.5-VL), and a GELU merger that folds each
// 2x2 patch group into one output token.
ggml_cgraph * clip_graph_qwen2vl::build() {
    GGML_ASSERT(model.patch_bias == nullptr);
    GGML_ASSERT(model.class_embedding == nullptr);

    const int batch_size = 1;
    const bool use_window_attn = hparams.n_wa_pattern > 0;
    const int n_wa_pattern = hparams.n_wa_pattern;
    const int n_pos = n_patches;
    const int num_position_ids = n_pos * 4; // m-rope requires 4 dim per position

    norm_type norm_t = proj_type == PROJECTOR_TYPE_QWEN25VL
        ? NORM_TYPE_RMS    // qwen 2.5 vl
        : NORM_TYPE_NORMAL; // qwen 2 vl

    // equal split of head dim across the 4 M-RoPE sections
    int mrope_sections[4] = {d_head/4, d_head/4, d_head/4, d_head/4};

    ggml_tensor * inp_raw = build_inp_raw();
    ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_raw, patch_size, patch_size, 0, 0, 1, 1);

    // spatial merge works on 2x2 patch groups, so both dims must be even in patches
    GGML_ASSERT(img.nx % (patch_size * 2) == 0);
    GGML_ASSERT(img.ny % (patch_size * 2) == 0);

    // second conv dimension (temporal conv collapsed to a second spatial conv),
    // then interleave patches so 2x2 neighbours are adjacent in the token dim
    {
        auto inp_1 = ggml_conv_2d(ctx0, model.patch_embeddings_1, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
        inp = ggml_add(ctx0, inp, inp_1);

        inp = ggml_permute(ctx0, inp, 1, 2, 0, 3); // [w, h, c, b] -> [c, w, h, b]
        inp = ggml_cont_4d(
            ctx0, inp,
            n_embd * 2, n_patches_x / 2, n_patches_y, batch_size);
        inp = ggml_reshape_4d(
            ctx0, inp,
            n_embd * 2, n_patches_x / 2, 2, batch_size * (n_patches_y / 2));
        inp = ggml_permute(ctx0, inp, 0, 2, 1, 3);
        inp = ggml_cont_3d(
            ctx0, inp,
            n_embd, n_patches_x * n_patches_y, batch_size);
    }

    ggml_tensor * inpL = inp;
    ggml_tensor * window_mask = nullptr;
    ggml_tensor * window_idx = nullptr;
    ggml_tensor * inv_window_idx = nullptr;

    ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_position_ids);
    ggml_set_name(positions, "positions");
    ggml_set_input(positions);

    // pre-layernorm
    if (model.pre_ln_w) {
        inpL = build_norm(inpL, model.pre_ln_w, model.pre_ln_b, norm_t, eps, -1);
    }

    if (use_window_attn) {
        // handle window attention inputs: permutation that groups tokens by window
        inv_window_idx = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos / 4);
        ggml_set_name(inv_window_idx, "inv_window_idx");
        ggml_set_input(inv_window_idx);
        // mask for window attention
        window_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_pos, n_pos);
        ggml_set_name(window_mask, "window_mask");
        ggml_set_input(window_mask);

        // if flash attn is used, we need to pad the mask and cast to f16
        if (flash_attn_type == CLIP_FLASH_ATTN_TYPE_ENABLED) {
            window_mask = ggml_cast(ctx0, window_mask, GGML_TYPE_F16);
        }

        // reorder tokens into window order (4 tokens per merge unit per row)
        // inpL shape: [n_embd, n_patches_x * n_patches_y, batch_size]
        GGML_ASSERT(batch_size == 1);
        inpL = ggml_reshape_2d(ctx0, inpL, n_embd * 4, n_patches_x * n_patches_y * batch_size / 4);
        inpL = ggml_get_rows(ctx0, inpL, inv_window_idx);
        inpL = ggml_reshape_3d(ctx0, inpL, n_embd, n_patches_x * n_patches_y, batch_size);
    }

    // loop over layers
    for (int il = 0; il < n_layer; il++) {
        const auto & layer = model.layers[il];
        // every n_wa_pattern-th layer uses full attention; the rest use windowed attention
        const bool full_attn = use_window_attn ? (il + 1) % n_wa_pattern == 0 : true;

        ggml_tensor * cur = inpL; // inpL = residual, cur = hidden_states

        // layernorm1
        cur = build_norm(cur, layer.ln_1_w, layer.ln_1_b, norm_t, eps, il);
        cb(cur, "ln1", il);

        // self-attention
        {
            ggml_tensor * Qcur = ggml_add(ctx0,
                ggml_mul_mat(ctx0, layer.q_w, cur), layer.q_b);
            ggml_tensor * Kcur = ggml_add(ctx0,
                ggml_mul_mat(ctx0, layer.k_w, cur), layer.k_b);
            ggml_tensor * Vcur = ggml_add(ctx0,
                ggml_mul_mat(ctx0, layer.v_w, cur), layer.v_b);

            Qcur = ggml_reshape_3d(ctx0, Qcur, d_head, n_head, n_patches);
            Kcur = ggml_reshape_3d(ctx0, Kcur, d_head, n_head, n_patches);
            Vcur = ggml_reshape_3d(ctx0, Vcur, d_head, n_head, n_patches);

            cb(Qcur, "Qcur", il);
            cb(Kcur, "Kcur", il);
            cb(Vcur, "Vcur", il);

            // apply M-RoPE
            Qcur = ggml_rope_multi(
                ctx0, Qcur, positions, nullptr,
                d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, 10000, 1, 0, 1, 32, 1);
            Kcur = ggml_rope_multi(
                ctx0, Kcur, positions, nullptr,
                d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, 10000, 1, 0, 1, 32, 1);

            cb(Qcur, "Qcur_rope", il);
            cb(Kcur, "Kcur_rope", il);

            // full-attention layers get no mask; windowed layers use the block-diagonal mask
            ggml_tensor * attn_mask = full_attn ? nullptr : window_mask;

            cur = build_attn(layer.o_w, layer.o_b,
                Qcur, Kcur, Vcur, attn_mask, kq_scale, il);
            cb(cur, "attn_out", il);
        }

        // re-add the layer input, e.g., residual
        cur = ggml_add(ctx0, cur, inpL);

        inpL = cur; // inpL = residual, cur = hidden_states

        cb(cur, "ffn_inp", il);

        // layernorm2
        cur = build_norm(cur, layer.ln_2_w, layer.ln_2_b, norm_t, eps, il);
        cb(cur, "ffn_inp_normed", il);

        // ffn
        cur = build_ffn(cur,
            layer.ff_up_w, layer.ff_up_b,
            layer.ff_gate_w, layer.ff_gate_b,
            layer.ff_down_w, layer.ff_down_b,
            hparams.ffn_op, il);

        cb(cur, "ffn_out", il);

        // residual 2
        cur = ggml_add(ctx0, inpL, cur);
        cb(cur, "layer_out", il);

        inpL = cur;
    }

    // post-layernorm
    if (model.post_ln_w) {
        inpL = build_norm(inpL, model.post_ln_w, model.post_ln_b, norm_t, eps, n_layer);
    }

    // multimodal projection: merge each group of 4 tokens, then 2-layer GELU MLP
    ggml_tensor * embeddings = inpL;
    embeddings = ggml_reshape_3d(ctx0, embeddings, n_embd * 4, n_pos / 4, batch_size);
    embeddings = build_ffn(embeddings,
        model.mm_0_w, model.mm_0_b,
        nullptr, nullptr,
        model.mm_1_w, model.mm_1_b,
        FFN_GELU,
        -1);

    if (use_window_attn) {
        // undo the window-order permutation applied before the layer loop
        window_idx = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos / 4);
        ggml_set_name(window_idx, "window_idx");
        ggml_set_input(window_idx);

        // embeddings shape: [n_embd, n_patches_x * n_patches_y, batch_size]
        GGML_ASSERT(batch_size == 1);
        embeddings = ggml_reshape_2d(ctx0, embeddings, hparams.projection_dim, n_patches_x * n_patches_y / 4);
        embeddings = ggml_get_rows(ctx0, embeddings, window_idx);
        embeddings = ggml_reshape_3d(ctx0, embeddings, hparams.projection_dim, n_patches_x * n_patches_y / 4, batch_size);
    }

    // build the graph
    ggml_build_forward_expand(gf, embeddings);

    return gf;
}
#include "models.h"

// Build the Qwen3-VL vision graph: dual patch conv with bias + learned position
// embeddings, fused QKV attention with M-RoPE, and "deepstack" features taken
// from intermediate layers and concatenated onto the final projection.
ggml_cgraph * clip_graph_qwen3vl::build() {
    GGML_ASSERT(model.patch_bias != nullptr);
    GGML_ASSERT(model.position_embeddings != nullptr);
    GGML_ASSERT(model.class_embedding == nullptr);

    const int batch_size = 1;
    const int n_pos = n_patches;
    const int num_position_ids = n_pos * 4; // m-rope requires 4 dim per position

    norm_type norm_t = NORM_TYPE_NORMAL;

    // equal split of head dim across the 4 M-RoPE sections
    int mrope_sections[4] = {d_head/4, d_head/4, d_head/4, d_head/4};

    ggml_tensor * inp_raw = build_inp_raw();
    ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_raw, patch_size, patch_size, 0, 0, 1, 1);

    // spatial merge works on 2x2 patch groups, so both dims must be even in patches
    GGML_ASSERT(img.nx % (patch_size * 2) == 0);
    GGML_ASSERT(img.ny % (patch_size * 2) == 0);

    // second conv dimension, then interleave patches so 2x2 neighbours are
    // adjacent in the token dimension
    {
        auto inp_1 = ggml_conv_2d(ctx0, model.patch_embeddings_1, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
        inp = ggml_add(ctx0, inp, inp_1);

        inp = ggml_permute(ctx0, inp, 1, 2, 0, 3); // [w, h, c, b] -> [c, w, h, b]
        inp = ggml_cont_4d(
            ctx0, inp,
            n_embd * 2, n_patches_x / 2, n_patches_y, batch_size);
        inp = ggml_reshape_4d(
            ctx0, inp,
            n_embd * 2, n_patches_x / 2, 2, batch_size * (n_patches_y / 2));
        inp = ggml_permute(ctx0, inp, 0, 2, 1, 3);
        inp = ggml_cont_3d(
            ctx0, inp,
            n_embd, n_patches_x * n_patches_y, batch_size);
    }

    // add patch bias
    if (model.patch_bias != nullptr) {
        inp = ggml_add(ctx0, inp, model.patch_bias);
        cb(inp, "patch_bias", -1);
    }

    // calculate absolute position embedding and apply
    // (same interleaving as the patches above so positions line up token-wise)
    ggml_tensor * learned_pos_embd = resize_position_embeddings();
    learned_pos_embd = ggml_cont_4d(
        ctx0, learned_pos_embd,
        n_embd * 2, n_patches_x / 2, n_patches_y, batch_size);
    learned_pos_embd = ggml_reshape_4d(
        ctx0, learned_pos_embd,
        n_embd * 2, n_patches_x / 2, 2, batch_size * (n_patches_y / 2));
    learned_pos_embd = ggml_permute(ctx0, learned_pos_embd, 0, 2, 1, 3);
    learned_pos_embd = ggml_cont_3d(
        ctx0, learned_pos_embd,
        n_embd, n_patches_x * n_patches_y, batch_size);
    inp = ggml_add(ctx0, inp, learned_pos_embd);
    cb(inp, "inp_pos_emb", -1);

    ggml_tensor * inpL = inp;

    ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_position_ids);
    ggml_set_name(positions, "positions");
    ggml_set_input(positions);

    // pre-layernorm
    if (model.pre_ln_w) {
        inpL = build_norm(inpL, model.pre_ln_w, model.pre_ln_b, norm_t, eps, -1);
    }

    // deepstack features (stack along the feature dimension), [n_embd * len(deepstack_layers), n_patches_x * n_patches_y, batch_size]
    ggml_tensor * deepstack_features = nullptr;
    const int merge_factor = hparams.n_merge > 0 ? hparams.n_merge * hparams.n_merge : 4; // default 2x2=4 for qwen3vl

    // loop over layers
    for (int il = 0; il < n_layer; il++) {
        auto & layer = model.layers[il];

        ggml_tensor * cur = inpL; // inpL = residual, cur = hidden_states

        // layernorm1
        cur = build_norm(cur, layer.ln_1_w, layer.ln_1_b, norm_t, eps, il);
        cb(cur, "ln1", il);

        // self-attention: fused QKV projection, then Q/K/V as byte-offset views
        {
            cur = ggml_mul_mat(ctx0, layer.qkv_w, cur);
            cur = ggml_add(ctx0, cur, layer.qkv_b);

            ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, d_head, n_head, n_pos,
                /* nb1    */ ggml_row_size(cur->type, d_head),
                /* nb2    */ cur->nb[1],
                /* offset */ 0);

            ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, d_head, n_head, n_pos,
                /* nb1    */ ggml_row_size(cur->type, d_head),
                /* nb2    */ cur->nb[1],
                /* offset */ ggml_row_size(cur->type, n_embd));

            ggml_tensor * Vcur = ggml_view_3d(ctx0, cur, d_head, n_head, n_pos,
                /* nb1    */ ggml_row_size(cur->type, d_head),
                /* nb2    */ cur->nb[1],
                /* offset */ ggml_row_size(cur->type, 2 * n_embd));

            cb(Qcur, "Qcur", il);
            cb(Kcur, "Kcur", il);
            cb(Vcur, "Vcur", il);

            // apply M-RoPE
            Qcur = ggml_rope_multi(
                ctx0, Qcur, positions, nullptr,
                d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, 10000, 1, 0, 1, 32, 1);
            Kcur = ggml_rope_multi(
                ctx0, Kcur, positions, nullptr,
                d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, 10000, 1, 0, 1, 32, 1);

            cb(Qcur, "Qcur_rope", il);
            cb(Kcur, "Kcur_rope", il);

            cur = build_attn(layer.o_w, layer.o_b,
                Qcur, Kcur, Vcur, nullptr, kq_scale, il);
            cb(cur, "attn_out", il);
        }

        // re-add the layer input, e.g., residual
        cur = ggml_add(ctx0, cur, inpL);

        inpL = cur; // inpL = residual, cur = hidden_states

        cb(cur, "ffn_inp", il);

        // layernorm2
        cur = build_norm(cur, layer.ln_2_w, layer.ln_2_b, norm_t, eps, il);
        cb(cur, "ffn_inp_normed", il);

        // ffn
        cur = build_ffn(cur,
            layer.ff_up_w, layer.ff_up_b,
            layer.ff_gate_w, layer.ff_gate_b,
            layer.ff_down_w, layer.ff_down_b,
            hparams.ffn_op, il);

        cb(cur, "ffn_out", il);

        // residual 2
        cur = ggml_add(ctx0, inpL, cur);
        cb(cur, "layer_out", il);

        // deepstack: selected layers contribute a merged+projected copy of
        // their output, concatenated along the feature dimension
        if (layer.has_deepstack()) {
            ggml_tensor * feat = ggml_reshape_3d(ctx0, cur, n_embd * merge_factor, n_pos / merge_factor, batch_size);
            feat = build_norm(feat, layer.deepstack_norm_w, layer.deepstack_norm_b, norm_t, eps, il);
            feat = build_ffn(feat,
                layer.deepstack_fc1_w, layer.deepstack_fc1_b,
                nullptr, nullptr,
                layer.deepstack_fc2_w, layer.deepstack_fc2_b,
                ffn_op_type::FFN_GELU, il);

            if (!deepstack_features) {
                deepstack_features = feat;
            } else {
                // concat along the feature dimension
                deepstack_features = ggml_concat(ctx0, deepstack_features, feat, 0);
            }
        }

        inpL = cur;
    }

    // post-layernorm
    if (model.post_ln_w) {
        inpL = build_norm(inpL, model.post_ln_w, model.post_ln_b, norm_t, eps, n_layer);
    }

    // multimodal projection: merge each group of 4 tokens, then 2-layer GELU MLP
    ggml_tensor * embeddings = inpL;
    embeddings = ggml_reshape_3d(ctx0, embeddings, n_embd * 4, n_pos / 4, batch_size);

    embeddings = build_ffn(embeddings,
        model.mm_0_w, model.mm_0_b,
        nullptr, nullptr,
        model.mm_1_w, model.mm_1_b,
        ffn_op_type::FFN_GELU, -1);

    // append the deepstack features along the feature dimension
    if (deepstack_features) {
        embeddings = ggml_concat(ctx0, embeddings, deepstack_features, 0);
    }

    // build the graph
    ggml_build_forward_expand(gf, embeddings);

    return gf;
}
#include "models.h"

// Build the SigLIP vision graph, then dispatch on projector type for the
// model-specific multimodal projection (Gemma3, Idefics3, LFM2, Janus-Pro).
ggml_cgraph * clip_graph_siglip::build() {
    ggml_tensor * inp = build_inp();

    ggml_tensor * learned_pos_embd = model.position_embeddings;
    if (proj_type == PROJECTOR_TYPE_LFM2) {
        // LFM2 supports variable input resolution: interpolate pos embeddings
        learned_pos_embd = resize_position_embeddings();
    }

    ggml_tensor * cur = build_vit(
        inp, n_patches,
        NORM_TYPE_NORMAL,
        hparams.ffn_op,
        learned_pos_embd,
        nullptr);

    if (proj_type == PROJECTOR_TYPE_GEMMA3) {
        const int batch_size = 1;
        GGML_ASSERT(n_patches_x == n_patches_y);
        const int patches_per_image = n_patches_x;
        const int kernel_size = hparams.n_merge;

        cur = ggml_transpose(ctx0, cur);
        cur = ggml_cont_4d(ctx0, cur, patches_per_image, patches_per_image, n_embd, batch_size);

        // doing a pool2d to reduce the number of output tokens
        cur = ggml_pool_2d(ctx0, cur, GGML_OP_POOL_AVG, kernel_size, kernel_size, kernel_size, kernel_size, 0, 0);
        cur = ggml_reshape_3d(ctx0, cur, cur->ne[0] * cur->ne[0], n_embd, batch_size);
        cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur));

        // apply norm before projection
        cur = ggml_rms_norm(ctx0, cur, eps);
        cur = ggml_mul(ctx0, cur, model.mm_soft_emb_norm_w);

        // apply projection
        cur = ggml_mul_mat(ctx0,
            ggml_cont(ctx0, ggml_transpose(ctx0, model.mm_input_proj_w)),
            cur);

    } else if (proj_type == PROJECTOR_TYPE_IDEFICS3) {
        // pixel_shuffle
        // https://github.com/huggingface/transformers/blob/0a950e0bbe1ed58d5401a6b547af19f15f0c195e/src/transformers/models/idefics3/modeling_idefics3.py#L578
        const int scale_factor = model.hparams.n_merge;
        cur = build_patch_merge_permute(cur, scale_factor);
        cur = ggml_mul_mat(ctx0, model.projection, cur);

    } else if (proj_type == PROJECTOR_TYPE_LFM2) {
        // pixel unshuffle block
        const int scale_factor = model.hparams.n_merge;
        cur = build_patch_merge_permute(cur, scale_factor);

        // projection, in LFM2-VL input norm is optional
        if (model.mm_input_norm_w) {
            cur = ggml_norm(ctx0, cur, 1e-5); // default nn.LayerNorm
            cur = ggml_mul(ctx0, cur, model.mm_input_norm_w);
        }

        if (model.mm_input_norm_b) {
            cur = ggml_add(ctx0, cur, model.mm_input_norm_b);
        }

        cur = build_ffn(cur,
            model.mm_1_w, model.mm_1_b,
            nullptr, nullptr,
            model.mm_2_w, model.mm_2_b,
            FFN_GELU,
            -1);

    } else if (proj_type == PROJECTOR_TYPE_JANUS_PRO) {
        cur = build_ffn(cur,
            model.mm_0_w, model.mm_0_b,
            nullptr, nullptr,
            model.mm_1_w, model.mm_1_b,
            hparams.ffn_op,
            -1);

    } else {
        GGML_ABORT("SigLIP: Unsupported projector type");
    }

    // build the graph
    ggml_build_forward_expand(gf, cur);

    return gf;
}
#include "models.h"

// Build the Whisper-style audio encoder graph: two conv1d layers (the second
// with stride 2, halving the frame count), a standard transformer via
// build_vit, then a projector selected by proj_type (Ultravox, Qwen2-Audio,
// Voxtral, Music-Flamingo, GLM-A).
ggml_cgraph * clip_graph_whisper_enc::build() {
    const int n_frames = img.nx;       // input mel frames are carried in img.nx
    const int n_pos = n_frames / 2;    // stride-2 conv halves the sequence length
    GGML_ASSERT(model.position_embeddings->ne[1] >= n_pos);

    ggml_tensor * inp = build_inp_raw(1);

    // conv1d block
    {
        // convolution + gelu
        ggml_tensor * cur = ggml_conv_1d_ph(ctx0, model.conv1d_1_w, inp, 1, 1);
        cur = ggml_add(ctx0, cur, model.conv1d_1_b);

        cur = ggml_gelu_erf(ctx0, cur);

        cur = ggml_conv_1d_ph(ctx0, model.conv1d_2_w, cur, 2, 1);
        cur = ggml_add(ctx0, cur, model.conv1d_2_b);

        cur = ggml_gelu_erf(ctx0, cur);
        // transpose to [n_embd, n_pos] for the transformer
        inp = ggml_cont(ctx0, ggml_transpose(ctx0, cur));
        cb(inp, "after_conv1d", -1);
    }

    // sanity check (only check one layer, but it should be the same for all)
    GGML_ASSERT(model.layers[0].ln_1_w && model.layers[0].ln_1_b);
    GGML_ASSERT(model.layers[0].ln_2_w && model.layers[0].ln_2_b);
    GGML_ASSERT(model.layers[0].q_b);
    GGML_ASSERT(model.layers[0].v_b);
    GGML_ASSERT(!model.layers[0].k_b); // no bias for k

    // take only the first n_pos rows of the learned position embeddings
    ggml_tensor * pos_embd_selected = ggml_view_2d(
        ctx0, model.position_embeddings,
        model.position_embeddings->ne[0], n_pos,
        model.position_embeddings->nb[1], 0
    );
    ggml_tensor * cur = build_vit(
        inp, n_pos,
        NORM_TYPE_NORMAL,
        hparams.ffn_op,
        pos_embd_selected,
        nullptr);

    cb(cur, "after_transformer", -1);

    if (model.audio_has_stack_frames()) {
        // StackAudioFrames
        // https://huggingface.co/fixie-ai/ultravox-v0_5-llama-3_2-1b/blob/main/ultravox_model.py
        cur = build_stack(cur, hparams.proj_stack_factor, n_embd);
        cb(cur, "after_stacked", -1);
    }

    if (proj_type == PROJECTOR_TYPE_ULTRAVOX) {
        // UltravoxProjector
        // pre-norm
        cur = ggml_rms_norm(ctx0, cur, 1e-6);
        cur = ggml_mul(ctx0, cur, model.mm_norm_pre_w);

        // ffn in
        cur = ggml_mul_mat(ctx0, model.mm_1_w, cur);

        // swiglu
        // see SwiGLU in ultravox_model.py, the second half passed through is silu, not the first half
        cur = ggml_swiglu_swapped(ctx0, cur);

        // mid-norm
        cur = ggml_rms_norm(ctx0, cur, 1e-6);
        cur = ggml_mul(ctx0, cur, model.mm_norm_mid_w);

        // ffn out
        cur = ggml_mul_mat(ctx0, model.mm_2_w, cur);

    } else if (proj_type == PROJECTOR_TYPE_QWEN2A) {
        // projector: single linear layer
        cur = ggml_mul_mat(ctx0, model.mm_fc_w, cur);
        cur = ggml_add(ctx0, cur, model.mm_fc_b);

    } else if (proj_type == PROJECTOR_TYPE_VOXTRAL) {
        // projector: 2-layer MLP with GELU (erf)
        cur = build_ffn(cur,
            model.mm_1_w, model.mm_1_b,
            nullptr, nullptr,
            model.mm_2_w, model.mm_2_b,
            FFN_GELU_ERF,
            -1);

    } else if (proj_type == PROJECTOR_TYPE_MUSIC_FLAMINGO) {
        // projector: same shape as Voxtral
        cur = build_ffn(cur,
            model.mm_1_w, model.mm_1_b,
            nullptr, nullptr,
            model.mm_2_w, model.mm_2_b,
            FFN_GELU_ERF,
            -1);

    } else if (proj_type == PROJECTOR_TYPE_GLMA) {
        // layernorm, frame stacking, MLP, then wrap with begin/end-of-input embeddings
        cur = ggml_norm(ctx0, cur, hparams.eps);
        cur = ggml_mul(ctx0, cur, model.mm_norm_pre_w);
        cur = ggml_add(ctx0, cur, model.mm_norm_pre_b);
        cur = build_stack(cur, hparams.proj_stack_factor, n_embd);
        cur = build_ffn(cur, model.mm_1_w, model.mm_1_b, nullptr, nullptr, model.mm_2_w, model.mm_2_b, hparams.ffn_op, 0);
        cur = ggml_concat(ctx0, model.mm_boi, cur, 1);
        cur = ggml_concat(ctx0, cur, model.mm_eoi, 1);
    } else {
        GGML_ABORT("%s: unknown projector type", __func__);
    }

    cb(cur, "projected", -1);

    ggml_build_forward_expand(gf, cur);

    return gf;
}
ggml_mul_mat(ctx0, model.patch_embeddings_0, inp); + + if (model.patch_bias) { + inp = ggml_add(ctx0, inp, model.patch_bias); + } + + inp = ggml_reshape_2d(ctx0, inp, n_embd, n_patches); + + ggml_tensor * inpL = inp; + ggml_tensor * window_mask = nullptr; + ggml_tensor * window_idx = nullptr; + ggml_tensor * inv_window_idx = nullptr; + + ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_position_ids); + ggml_set_name(positions, "positions"); + ggml_set_input(positions); + + // pre-layernorm + if (model.pre_ln_w) { + inpL = build_norm(inpL, model.pre_ln_w, model.pre_ln_b, norm_t, eps, -1); + } + if (use_window_attn) { + inv_window_idx = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos / 4); + ggml_set_name(inv_window_idx, "inv_window_idx"); + ggml_set_input(inv_window_idx); + // mask for window attention + window_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_pos, n_pos); + ggml_set_name(window_mask, "window_mask"); + ggml_set_input(window_mask); + + // if flash attn is used, we need to pad the mask and cast to f16 + if (flash_attn_type == CLIP_FLASH_ATTN_TYPE_ENABLED) { + window_mask = ggml_cast(ctx0, window_mask, GGML_TYPE_F16); + } + + // inpL shape: [n_embd, n_patches_x * n_patches_y, batch_size] + GGML_ASSERT(batch_size == 1); + inpL = ggml_reshape_2d(ctx0, inpL, n_embd * 4, n_patches_x * n_patches_y * batch_size / 4); + inpL = ggml_get_rows(ctx0, inpL, inv_window_idx); + inpL = ggml_reshape_3d(ctx0, inpL, n_embd, n_patches_x * n_patches_y, batch_size); + } + + // loop over layers + for (int il = 0; il < n_layer; il++) { + const auto & layer = model.layers[il]; + const bool full_attn = use_window_attn ? 
hparams.wa_layer_indexes.count(il) > 0 : true; + + ggml_tensor * cur = inpL; // inpL = residual, cur = hidden_states + + // layernorm1 + cur = build_norm(cur, layer.ln_1_w, layer.ln_1_b, norm_t, eps, il); + // self-attention + { + ggml_tensor * Qcur = ggml_add(ctx0, + ggml_mul_mat(ctx0, layer.q_w, cur), layer.q_b); + ggml_tensor * Kcur = ggml_add(ctx0, + ggml_mul_mat(ctx0, layer.k_w, cur), layer.k_b); + ggml_tensor * Vcur = ggml_add(ctx0, + ggml_mul_mat(ctx0, layer.v_w, cur), layer.v_b); + + Qcur = ggml_reshape_3d(ctx0, Qcur, d_head, n_head, n_patches); + Kcur = ggml_reshape_3d(ctx0, Kcur, d_head, n_head, n_patches); + Vcur = ggml_reshape_3d(ctx0, Vcur, d_head, n_head, n_patches); + + Qcur = ggml_rope_multi( + ctx0, Qcur, positions, nullptr, + d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, 10000, 1, 0, 1, 32, 1); + Kcur = ggml_rope_multi( + ctx0, Kcur, positions, nullptr, + d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, 10000, 1, 0, 1, 32, 1); + + ggml_tensor * attn_mask = full_attn ? 
nullptr : window_mask; + + cur = build_attn(layer.o_w, layer.o_b, + Qcur, Kcur, Vcur, attn_mask, kq_scale, il); + } + // re-add the layer input, e.g., residual + cur = ggml_add(ctx0, cur, inpL); + + inpL = cur; // inpL = residual, cur = hidden_states + + // layernorm2 + cur = build_norm(cur, layer.ln_2_w, layer.ln_2_b, norm_t, eps, il); + + // ffn + cur = build_ffn(cur, + layer.ff_up_w, layer.ff_up_b, + nullptr, nullptr, + layer.ff_down_w, layer.ff_down_b, + hparams.ffn_op, il); + + // residual 2 + cur = ggml_add(ctx0, inpL, cur); + + inpL = cur; + } + + ggml_tensor * embeddings = inpL; + if (use_window_attn) { + const int spatial_merge_unit = 4; + window_idx = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos / spatial_merge_unit); + ggml_set_name(window_idx, "window_idx"); + ggml_set_input(window_idx); + GGML_ASSERT(batch_size == 1); + embeddings = ggml_reshape_2d(ctx0, embeddings, n_embd * spatial_merge_unit, n_patches / spatial_merge_unit); + embeddings = ggml_get_rows(ctx0, embeddings, window_idx); + embeddings = ggml_reshape_3d(ctx0, embeddings, n_embd, n_patches, batch_size); + cb(embeddings, "window_order_restored", -1); + } + + // post-layernorm (part of Siglip2VisionTransformer, applied after encoder) + if (model.post_ln_w) { + embeddings = build_norm(embeddings, model.post_ln_w, model.post_ln_b, norm_t, eps, n_layer); + } + + // Now apply merger (VLPatchMerger): + // 1. Apply RMS norm (ln_q in VLPatchMerger) + embeddings = build_norm(embeddings, model.mm_input_norm_w, nullptr, NORM_TYPE_RMS, 1e-6, -1); + cb(embeddings, "merger_normed", -1); + + // 2. 
First reshape for spatial merge (merge 2x2 patches) + embeddings = ggml_reshape_3d(ctx0, embeddings, n_embd * 4, n_pos / 4, batch_size); + cb(embeddings, "merger_reshaped", -1); + + embeddings = build_ffn(embeddings, + model.mm_0_w, model.mm_0_b, + nullptr, nullptr, + model.mm_1_w, model.mm_1_b, + FFN_GELU, + -1); + ggml_build_forward_expand(gf, embeddings); + + return gf; +} diff --git a/llama.cpp/tools/mtmd/mtmd-audio.cpp b/llama.cpp/tools/mtmd/mtmd-audio.cpp new file mode 100644 index 0000000..e8eef03 --- /dev/null +++ b/llama.cpp/tools/mtmd/mtmd-audio.cpp @@ -0,0 +1,730 @@ +#include "mtmd-audio.h" + +#define _USE_MATH_DEFINES // for M_PI +#include <cmath> +#include <cstdint> +#include <cstring> +#include <thread> +#include <vector> +#include <fstream> +#include <algorithm> + +// some of the code here is copied from whisper.cpp + +constexpr bool DEBUG = false; + +void mtmd_audio_cache::fill_sin_cos_table(int n) { + sin_vals.resize(n); + cos_vals.resize(n); + for (int i = 0; i < n; i++) { + double theta = (2 * M_PI * i) / n; + sin_vals[i] = sinf(theta); + cos_vals[i] = cosf(theta); + } +} + +void mtmd_audio_cache::fill_hann_window(int length, bool periodic) { + hann_window.resize(length); + int offset = -1; + if (periodic) { + offset = 0; + } + for (int i = 0; i < length; i++) { + hann_window[i] = 0.5 * (1.0 - cosf((2.0 * M_PI * i) / (length + offset))); + } +} + +void mtmd_audio_cache::fill_mel_filterbank_matrix(int n_mel, + int n_fft, + int sample_rate, + float fmin, + float fmax, + bool slaney_area_norm, + float scale) { + GGML_ASSERT(n_mel > 0 && n_fft > 1); + if (fmax <= 0.0f) { + fmax = 0.5f * sample_rate; + } + + // Slaney scale (matches librosa default) + const double min_log_hz = 1000.0; + const double lin_slope = 3 / 200.; + const double min_log_mel = min_log_hz * lin_slope; + const double log_step = log(6.4) / 27.0; + auto hz_to_mel = [min_log_hz, lin_slope, log_step, min_log_mel](const double f_hz) -> double { + return (f_hz < min_log_hz) ? 
f_hz * lin_slope : min_log_mel + log(f_hz / min_log_hz) / log_step; + }; + auto mel_to_hz = [min_log_hz, lin_slope, log_step, min_log_mel](const double m) -> double { + return (m < min_log_mel) ? m / lin_slope : min_log_hz * exp((m - min_log_mel) * log_step); + }; + + // infer N_fft from n_fft_bins + const double bin_hz_step = double(sample_rate) / double(n_fft); + + // mel grid: n_mel + 2 edges + const double m_lo = hz_to_mel(fmin); + const double m_hi = hz_to_mel(fmax); + std::vector<double> mel_pts(n_mel + 2); + for (int i = 0; i < n_mel + 2; ++i) { + mel_pts[i] = m_lo + (m_hi - m_lo) * (double(i) / (n_mel + 1)); + } + + // convert to Hz + std::vector<double> hz_pts(n_mel + 2); + for (int i = 0; i < n_mel + 2; ++i) { + hz_pts[i] = mel_to_hz(mel_pts[i]); + } + + const int n_fft_bins = n_fft / 2 + 1; + + // filterbank + std::vector<float> out(n_mel * n_fft_bins, 0); + for (int m = 0; m < n_mel; ++m) { + const double f_left = hz_pts[m]; + const double f_center = hz_pts[m + 1]; + const double f_right = hz_pts[m + 2]; + + const double denom_l = std::max(1e-30, f_center - f_left); + const double denom_r = std::max(1e-30, f_right - f_center); + const double enorm = slaney_area_norm ? 
(2.0 / std::max(1e-30, f_right - f_left)) : 1.0; + + for (int k = 0; k < n_fft_bins; ++k) { + const double f = k * bin_hz_step; + double w = 0.0; + if (f >= f_left && f <= f_center) { + w = (f - f_left) / denom_l; + } else if (f > f_center && f <= f_right) { + w = (f_right - f) / denom_r; + } + out[size_t(m) * size_t(n_fft_bins) + size_t(k)] = float(w * enorm * scale); + } + } + + filters.n_mel = n_mel; + filters.n_fft = n_fft; + filters.data = std::move(out); + + if (DEBUG) { // debug + for (size_t i = 0; i < filters.data.size(); ++i) { + if (filters.data[i] != 0.0f) { + printf("filters[%zu] = %f\n", i, filters.data[i] * 1000.0f); + } + } + } +} + +// Unified DFT implementation for both forward and inverse transforms +// Template parameters: +// Inverse: false = DFT with exp(-2πi·k·n/N), no scaling +// true = IDFT with exp(+2πi·k·n/N), scales by 1/N +// RealInput: true = input is real-valued (stride 1), avoids imaginary computations +// false = input is complex-valued (interleaved real/imag, stride 2) +template <bool Inverse, bool RealInput> +static void dft_impl(const mtmd_audio_cache & cache, const float * in, int N, float * out) { + const int n_sin_cos_vals = cache.sin_vals.size(); + const int sin_cos_step = n_sin_cos_vals / N; + + constexpr float sign = Inverse ? 1.0f : -1.0f; + const float scale = Inverse ? 
(1.0f / N) : 1.0f; + + for (int k = 0; k < N; k++) { + float re = 0; + float im = 0; + + for (int n = 0; n < N; n++) { + int idx = (k * n * sin_cos_step) % n_sin_cos_vals; + float cos_val = cache.cos_vals[idx]; + float sin_val = cache.sin_vals[idx]; + + if constexpr (RealInput) { + // Real input: in_im = 0, simplifies to: + // re += in_re * cos_val + // im += sign * in_re * sin_val + float in_re = in[n]; + re += in_re * cos_val; + im += sign * in_re * sin_val; + } else { + float in_re = in[n * 2 + 0]; + float in_im = in[n * 2 + 1]; + // (a + bi) * (cos + sign*i*sin) = (a*cos - sign*b*sin) + (sign*a*sin + b*cos)i + re += in_re * cos_val - sign * in_im * sin_val; + im += sign * in_re * sin_val + in_im * cos_val; + } + } + + out[k * 2 + 0] = re * scale; + out[k * 2 + 1] = im * scale; + } +} + +// Cooley-Tukey FFT/IFFT unified implementation +// Template parameters: +// Inverse: false = FFT with exp(-2πi·k/N), no scaling +// true = IFFT with exp(+2πi·k/N), scales by 0.5 at each level +// RealInput: true = input is real-valued (stride 1) +// false = input is complex-valued (interleaved real/imag, stride 2) +template <bool Inverse, bool RealInput> +static void fft_impl(const mtmd_audio_cache & cache, float * in, int N, float * out) { + const int n_sin_cos_vals = cache.sin_vals.size(); + + if (N == 1) { + out[0] = in[0]; + if constexpr (RealInput) { + out[1] = 0.0f; + } else { + out[1] = in[1]; + } + return; + } + + const int half_N = N / 2; + if (N - half_N * 2 == 1) { + // Odd N: fall back to DFT + dft_impl<Inverse, RealInput>(cache, in, N, out); + return; + } + + // Split into even and odd + if constexpr (RealInput) { + // Real input: stride is 1, copy only real values + float * even = in + N; + for (int i = 0; i < half_N; ++i) { + even[i] = in[2 * i]; + } + float * even_fft = out + 2 * N; + fft_impl<Inverse, true>(cache, even, half_N, even_fft); + + float * odd = even; + for (int i = 0; i < half_N; ++i) { + odd[i] = in[2 * i + 1]; + } + float * odd_fft = even_fft + N; 
+ fft_impl<Inverse, true>(cache, odd, half_N, odd_fft); + } else { + // Complex input: stride is 2, copy complex pairs + float * even = in + N * 2; + for (int i = 0; i < half_N; ++i) { + even[i * 2 + 0] = in[2 * i * 2 + 0]; + even[i * 2 + 1] = in[2 * i * 2 + 1]; + } + float * even_fft = out + 2 * N; + fft_impl<Inverse, false>(cache, even, half_N, even_fft); + + float * odd = even; + for (int i = 0; i < half_N; ++i) { + odd[i * 2 + 0] = in[(2 * i + 1) * 2 + 0]; + odd[i * 2 + 1] = in[(2 * i + 1) * 2 + 1]; + } + float * odd_fft = even_fft + N; + fft_impl<Inverse, false>(cache, odd, half_N, odd_fft); + } + + float * even_fft = out + 2 * N; + float * odd_fft = even_fft + N; + + const int sin_cos_step = n_sin_cos_vals / N; + + constexpr float sign = Inverse ? 1.0f : -1.0f; + constexpr float scale = Inverse ? 0.5f : 1.0f; + + for (int k = 0; k < half_N; k++) { + int idx = k * sin_cos_step; // t = 2*M_PI*k/N + float re = cache.cos_vals[idx]; + float im = sign * cache.sin_vals[idx]; + + float re_odd = odd_fft[2 * k + 0]; + float im_odd = odd_fft[2 * k + 1]; + + out[2 * k + 0] = scale * (even_fft[2 * k + 0] + re * re_odd - im * im_odd); + out[2 * k + 1] = scale * (even_fft[2 * k + 1] + re * im_odd + im * re_odd); + + out[2 * (k + half_N) + 0] = scale * (even_fft[2 * k + 0] - re * re_odd + im * im_odd); + out[2 * (k + half_N) + 1] = scale * (even_fft[2 * k + 1] - re * im_odd - im * re_odd); + } +} + +// Forward FFT for real input (used by mel spectrogram) +static void fft(const mtmd_audio_cache & cache, float * in, int N, float * out) { + fft_impl<false, true>(cache, in, N, out); +} + +// Inverse FFT for complex input +static void ifft(const mtmd_audio_cache & cache, float * in, int N, float * out) { + fft_impl<true, false>(cache, in, N, out); +} + +struct filter_params { + int32_t n_mel; + int32_t n_fft_bins; + int32_t hann_window_size; + int32_t hop_length; + int32_t sample_rate; + bool center_padding = false; + float preemph = 0.f; + bool use_natural_log = false; + bool 
norm_per_feature = false; +}; + +static void log_mel_spectrogram_worker_thread(int ith, + const float * hann, + const std::vector<float> & samples, + int n_samples, + int frame_size, + int frame_step, + int n_threads, + const filter_params & params, + const mtmd_audio_cache & cache, + mtmd_audio_mel & out) { + std::vector<float> fft_in(frame_size * 2, 0.0); + std::vector<float> fft_out(frame_size * 2 * 2 * 2); + + int n_fft_bins = params.n_fft_bins; + int i = ith; + + const auto & filters = cache.filters; + + // make sure n_fft == 1 + (WHISPER_N_FFT / 2), bin_0 to bin_nyquist + GGML_ASSERT(n_fft_bins == 1 + (frame_size / 2)); + GGML_ASSERT(cache.sin_vals.size() == cache.cos_vals.size()); + // calculate FFT only when fft_in are not all zero + for (; i < std::min(n_samples / frame_step + 1, out.n_len); i += n_threads) { + const int offset = i * frame_step; + + // apply Hann window (~10% faster) + for (int j = 0; j < std::min(frame_size, n_samples - offset); j++) { + fft_in[j] = hann[j] * samples[offset + j]; + } + + // fill the rest with zeros + if (n_samples - offset < frame_size) { + std::fill(fft_in.begin() + (n_samples - offset), fft_in.end(), 0.0); + } + + // FFT + fft(cache, fft_in.data(), frame_size, fft_out.data()); + + // Calculate modulus^2 of complex numbers + // Use pow(fft_out[2 * j + 0], 2) + pow(fft_out[2 * j + 1], 2) causes inference quality problem? Interesting. 
+ for (int j = 0; j < n_fft_bins; j++) { + fft_out[j] = (fft_out[2 * j + 0] * fft_out[2 * j + 0] + fft_out[2 * j + 1] * fft_out[2 * j + 1]); + } + + // mel spectrogram + for (int j = 0; j < out.n_mel; j++) { + double sum = 0.0; + // unroll loop (suggested by GH user @lunixbochs) + int k = 0; + for (k = 0; k < n_fft_bins - 3; k += 4) { + size_t idx = size_t(j) * size_t(n_fft_bins) + size_t(k); + sum += + fft_out[k + 0] * filters.data[idx + 0] + + fft_out[k + 1] * filters.data[idx + 1] + + fft_out[k + 2] * filters.data[idx + 2] + + fft_out[k + 3] * filters.data[idx + 3]; + } + // handle n_fft remainder + for (; k < n_fft_bins; k++) { + sum += fft_out[k] * filters.data[j * n_fft_bins + k]; + } + sum = params.use_natural_log + ? log(sum + 5.960464477539063e-08) + : log10(std::max(sum, 1e-10)); + out.data[j * out.n_len + i] = sum; + } + } + + // Otherwise fft_out are all zero + double sum = params.use_natural_log ? log(1e-10) : log10(1e-10); + for (; i < out.n_len; i += n_threads) { + for (int j = 0; j < out.n_mel; j++) { + out.data[j * out.n_len + i] = sum; + } + } +} + +// ref: https://github.com/openai/whisper/blob/main/whisper/audio.py#L110-L157 +static bool log_mel_spectrogram( + const float * samples, + const int n_samples_in, + const int n_threads, + const filter_params & params, + const mtmd_audio_cache & cache, + mtmd_audio_mel & out) { + //const int64_t t_start_us = ggml_time_us(); + + out.n_len_org = n_samples_in; + int n_samples = n_samples_in; + + // Hann window + const float * hann = cache.hann_window.data(); + const int frame_size = (params.n_fft_bins - 1) * 2; + const int frame_step = params.hop_length; + + // Padding + std::vector<float> samples_padded; + if (params.center_padding) { + const auto pad_amount = frame_size / 2; + samples_padded = std::vector<float>(n_samples + 2 * pad_amount, 0); + std::copy(samples, samples + n_samples, samples_padded.data() + pad_amount); + samples = samples_padded.data(); + n_samples = samples_padded.size(); + } else { 
+ // existing padding logic + int64_t stage_1_pad = params.sample_rate * 30; + int64_t stage_2_pad = frame_size / 2; + samples_padded.resize(n_samples + stage_1_pad + stage_2_pad * 2); + std::copy(samples, samples + n_samples, samples_padded.begin() + stage_2_pad); + // pad 30 seconds of zeros at the end of audio (480,000 samples) + reflective pad 200 samples at the end of audio + std::fill(samples_padded.begin() + n_samples + stage_2_pad, samples_padded.begin() + n_samples + stage_1_pad + 2 * stage_2_pad, 0); + // reflective pad 200 samples at the beginning of audio + if (n_samples < stage_2_pad + 1) { + // TODO: Handle short audio differently or return error + return false; + } + std::reverse_copy(samples + 1, samples + 1 + stage_2_pad, samples_padded.begin()); + } + + // preemphasis + if (params.preemph) { + const int pad_amount = frame_size / 2; + const float preemph = 0.97f; + float prev = samples_padded[pad_amount]; + for (int i = pad_amount + 1; i + pad_amount < n_samples; ++i) { + float cur = samples_padded[i]; + samples_padded[i] = cur - preemph * prev; + prev = cur; + } + } + + // pad hann window if it's smaller than frame_size + // TODO: probably unnecessary here? (or better doing it in g_cache?) 
+ std::vector<float> hann_window_padded; + if (params.hann_window_size < frame_size) { + hann_window_padded.resize(frame_size); + const int padding = (frame_size - params.hann_window_size) / 2; + std::copy(hann, hann + params.hann_window_size, &hann_window_padded[padding]); + hann = hann_window_padded.data(); + } + + + out.n_mel = params.n_mel; + out.n_len = (n_samples - frame_size) / frame_step + 1; + // TODO: handle these checks better + if (out.n_mel > 0 && (unsigned long)out.n_len > SIZE_MAX / out.n_mel) { + LOG_ERR("%s: size overflow\n", __func__); + return false; + } + if (n_samples < frame_size) { + LOG_ERR("%s: not enough samples after padding\n", __func__); + return false; + } + out.data.resize(out.n_mel * out.n_len); + + { + std::vector<std::thread> workers(n_threads - 1); + for (int iw = 0; iw < n_threads - 1; ++iw) { + workers[iw] = + std::thread(log_mel_spectrogram_worker_thread, iw + 1, hann, std::cref(samples_padded), n_samples, + frame_size, frame_step, n_threads, std::cref(params), std::cref(cache), std::ref(out)); + } + + // main thread + log_mel_spectrogram_worker_thread(0, hann, samples_padded, n_samples, frame_size, frame_step, n_threads, params, + cache, out); + for (int iw = 0; iw < n_threads - 1; ++iw) { + workers[iw].join(); + } + } + + const int effective_n_len = n_samples_in / frame_step; + if (params.norm_per_feature) { + for (int i = 0; i < out.n_mel; i++) { + double mean = 0; + for (int j = 0; j < effective_n_len; ++j) { + mean += out.data[i * out.n_len + j]; + } + mean /= effective_n_len; + + double var = 0.0; + for (int j = 0; j < effective_n_len; ++j) { + const double value = out.data[i * out.n_len + j] - mean; + var += value * value; + } + var /= effective_n_len - 1; // unbiased + const double mstd = std::sqrt(var + 1e-5); + + for (int j = 0; j < effective_n_len; ++j) { + auto &value = out.data[i * out.n_len + j]; + value = (value - mean) / mstd; + } + + // pad the rest with zeros + for (int j = effective_n_len; j < out.n_len; ++j) 
{ + out.data[i * out.n_len + j] = 0.0; + } + } + } else { + // clamping and normalization + double mmax = -1e20; + for (int i = 0; i < out.n_mel*out.n_len; i++) { + if (out.data[i] > mmax) { + mmax = out.data[i]; + } + } + + mmax -= 8.0; + + for (int i = 0; i < out.n_mel*out.n_len; i++) { + if (out.data[i] < mmax) { + out.data[i] = mmax; + } + out.data[i] = (out.data[i] + 4.0)/4.0; + } + } + + // Dump log_mel_spectrogram + if (DEBUG) { + std::ofstream outFile("log_mel_spectrogram.json"); + outFile << "["; + for (uint64_t i = 0; i < out.data.size() - 1; i++) { + outFile << out.data[i] << ", "; + } + outFile << out.data[out.data.size() - 1] << "]"; + outFile.close(); + } + + return true; +} + +// +// mtmd_audio_preprocessor_whisper +// + +void mtmd_audio_preprocessor_whisper::initialize() { + cache.fill_sin_cos_table(hparams.audio_n_fft); + cache.fill_hann_window(hparams.audio_window_len, true); + cache.fill_mel_filterbank_matrix(hparams.n_mel_bins, hparams.audio_n_fft, hparams.audio_sample_rate); +} + +bool mtmd_audio_preprocessor_whisper::preprocess(const float * samples, + size_t n_samples, + std::vector<mtmd_audio_mel> & output) { + if (n_samples == 0) { + // empty audio + return false; + } + + std::vector<float> smpl; + // if input is too short, pad with zeros + // this is to avoid potential issues with stage1/2 padding in log_mel_spectrogram + // TODO: maybe handle this better + size_t min_samples = (size_t) hparams.audio_sample_rate * (hparams.audio_chunk_len + 1); // +1 second margin + if (n_samples < min_samples) { + smpl.resize(min_samples, 0.0f); + std::memcpy(smpl.data(), samples, n_samples * sizeof(float)); + samples = smpl.data(); + n_samples = smpl.size(); + } + + filter_params params; + params.n_mel = hparams.n_mel_bins; + params.n_fft_bins = 1 + (hparams.audio_n_fft / 2); + params.hann_window_size = hparams.audio_window_len; + params.hop_length = hparams.audio_hop_len; + params.sample_rate = hparams.audio_sample_rate; + params.center_padding = false; 
+ params.preemph = 0.0f; // disabled + params.use_natural_log = false; + params.norm_per_feature = false; + + // make sure the cache is initialized + GGML_ASSERT(!cache.sin_vals.empty()); + GGML_ASSERT(!cache.cos_vals.empty()); + GGML_ASSERT(!cache.filters.data.empty()); + + mtmd_audio_mel out_full; + bool ok = log_mel_spectrogram(samples, n_samples, + 4, // n_threads + params, cache, out_full); + if (!ok) { + return false; + } + + // because the cgraph in clip.cpp only accepts 3000 frames each, we need to split the mel + // we always expect the mel to have 3000 silent frames at the end + if (DEBUG) { + printf("output: n_mel = %d, n_len = %d\n", out_full.n_mel, out_full.n_len); + } + const size_t frames_per_chunk = 3000; + GGML_ASSERT((size_t) out_full.n_len > frames_per_chunk); + for (size_t off = 0; off < (size_t) out_full.n_len; off += frames_per_chunk) { + int n_len = std::min(frames_per_chunk, (size_t) out_full.n_len - off); + if ((size_t) n_len < frames_per_chunk) { + break; // last uncomplete chunk will always be a padded chunk, safe to ignore + } + + mtmd_audio_mel out_chunk; + out_chunk.n_len = n_len; + out_chunk.n_mel = out_full.n_mel; + out_chunk.n_len_org = out_full.n_mel; // unused + out_chunk.data.reserve(out_chunk.n_mel * out_chunk.n_len); + + for (int i = 0; i < out_full.n_mel; i++) { + auto src = out_full.data.begin() + i * out_full.n_len + off; + out_chunk.data.insert(out_chunk.data.end(), src, src + frames_per_chunk); + } + + output.push_back(std::move(out_chunk)); + } + + return true; +} + +// +// mtmd_audio_preprocessor_conformer +// + +void mtmd_audio_preprocessor_conformer::initialize() { + cache.fill_sin_cos_table(hparams.audio_n_fft); + cache.fill_hann_window(hparams.audio_window_len, true); + cache.fill_mel_filterbank_matrix(hparams.n_mel_bins, hparams.audio_n_fft, hparams.audio_sample_rate); +} + +bool mtmd_audio_preprocessor_conformer::preprocess(const float * samples, + size_t n_samples, + std::vector<mtmd_audio_mel> & output) { + // 
empty audio + if (n_samples == 0) { + return false; + } + + filter_params params; + params.n_mel = hparams.n_mel_bins; + params.n_fft_bins = 1 + (hparams.audio_n_fft / 2); + params.hann_window_size = hparams.audio_window_len; + params.hop_length = hparams.audio_hop_len; + params.sample_rate = hparams.audio_sample_rate; + params.center_padding = true; + params.preemph = 0.97f; + params.use_natural_log = true; + params.norm_per_feature = true; + + // make sure the cache is initialized + GGML_ASSERT(!cache.sin_vals.empty()); + GGML_ASSERT(!cache.cos_vals.empty()); + GGML_ASSERT(!cache.filters.data.empty()); + + mtmd_audio_mel out_full; + bool ok = log_mel_spectrogram(samples, n_samples, + 4, // n_threads + params, cache, out_full); + if (!ok) { + return false; + } + + output.push_back(std::move(out_full)); + return true; +} + +// +// mtmd_audio_streaming_istft implementation +// + +mtmd_audio_streaming_istft::mtmd_audio_streaming_istft(int n_fft, int hop_length) : + n_fft(n_fft), + hop_length(hop_length), + n_fft_bins(n_fft / 2 + 1), + overlap_buffer(n_fft, 0.0f), + window_sum_buffer(n_fft, 0.0f), + padding_to_remove((n_fft - hop_length) / 2), + ifft_in(n_fft * 2 * 4, 0.0f), // extra space for recursive IFFT + ifft_out(n_fft * 2 * 4, 0.0f) { + cache.fill_sin_cos_table(n_fft); + cache.fill_hann_window(n_fft, true); +} + +void mtmd_audio_streaming_istft::reset() { + std::fill(overlap_buffer.begin(), overlap_buffer.end(), 0.0f); + std::fill(window_sum_buffer.begin(), window_sum_buffer.end(), 0.0f); + padding_to_remove = (n_fft - hop_length) / 2; +} + +std::vector<float> mtmd_audio_streaming_istft::process_frame(const float * frame_spectrum) { + std::vector<float> output(hop_length); + + // copy frequencies + for (int j = 0; j < n_fft_bins; j++) { + ifft_in[j * 2 + 0] = frame_spectrum[j * 2 + 0]; + ifft_in[j * 2 + 1] = frame_spectrum[j * 2 + 1]; + } + + // mirror negative frequencies + for (int j = 1; j < n_fft_bins - 1; j++) { + int mirror_idx = n_fft - j; + 
ifft_in[mirror_idx * 2 + 0] = ifft_in[j * 2 + 0]; + ifft_in[mirror_idx * 2 + 1] = -ifft_in[j * 2 + 1]; // conjugate + } + + ifft(cache, ifft_in.data(), n_fft, ifft_out.data()); + + // update window sum and overlap buffer + for (int j = 0; j < n_fft; j++) { + window_sum_buffer[j] += cache.hann_window[j] * cache.hann_window[j]; + overlap_buffer[j] += ifft_out[j * 2] * cache.hann_window[j]; + } + + // extract hop_length samples with normalization + for (int i = 0; i < hop_length; i++) { + if (window_sum_buffer[i] > 1e-8f) { + output[i] = overlap_buffer[i] / window_sum_buffer[i]; + } else { + output[i] = overlap_buffer[i]; + } + } + + // shift buffers left by hop_length + std::copy(overlap_buffer.begin() + hop_length, overlap_buffer.end(), overlap_buffer.begin()); + std::fill(overlap_buffer.end() - hop_length, overlap_buffer.end(), 0.0f); + + std::copy(window_sum_buffer.begin() + hop_length, window_sum_buffer.end(), window_sum_buffer.begin()); + std::fill(window_sum_buffer.end() - hop_length, window_sum_buffer.end(), 0.0f); + + // Remove padding if needed + int to_remove = std::min(padding_to_remove, (int) output.size()); + padding_to_remove -= to_remove; + output.erase(output.begin(), output.begin() + to_remove); + + return output; +} + +std::vector<float> mtmd_audio_streaming_istft::flush() { + std::vector<float> output; + + // Extract remaining samples from overlap buffer + // Continue until we've extracted all meaningful samples + int remaining = n_fft - hop_length; + while (remaining > 0) { + int chunk_size = std::min(remaining, hop_length); + + for (int i = 0; i < chunk_size; i++) { + float sample; + if (window_sum_buffer[i] > 1e-8f) { + sample = overlap_buffer[i] / window_sum_buffer[i]; + } else { + sample = overlap_buffer[i]; + } + output.push_back(sample); + } + + // Shift buffers + std::copy(overlap_buffer.begin() + chunk_size, overlap_buffer.end(), overlap_buffer.begin()); + std::fill(overlap_buffer.end() - chunk_size, overlap_buffer.end(), 0.0f); + + 
std::copy(window_sum_buffer.begin() + chunk_size, window_sum_buffer.end(), window_sum_buffer.begin()); + std::fill(window_sum_buffer.end() - chunk_size, window_sum_buffer.end(), 0.0f); + + remaining -= chunk_size; + } + + return output; +} diff --git a/llama.cpp/tools/mtmd/mtmd-audio.h b/llama.cpp/tools/mtmd/mtmd-audio.h new file mode 100644 index 0000000..016c739 --- /dev/null +++ b/llama.cpp/tools/mtmd/mtmd-audio.h @@ -0,0 +1,113 @@ +#pragma once + +#include "ggml.h" +#include "clip-model.h" + +#include <cstdint> +#include <vector> +#include <string> + +#define MTMD_INTERNAL_HEADER + +struct mtmd_audio_mel { + int n_len; + int n_len_org; + int n_mel; + + std::vector<float> data; +}; + +struct mtmd_audio_mel_filters { + int32_t n_mel; + int32_t n_fft; + + std::vector<float> data; +}; + +// cache for audio processing, each processor instance owns its own cache +struct mtmd_audio_cache { + std::vector<float> sin_vals; + std::vector<float> cos_vals; + + std::vector<float> hann_window; + + mtmd_audio_mel_filters filters; + + void fill_sin_cos_table(int n); + + void fill_hann_window(int length, bool periodic); + + // Build mel filterbank matrix [n_mel × n_fft_bins] at runtime. + // n_fft_bins must be (N_fft / 2 + 1). Example: if N_fft=512 -> n_fft_bins=257. + void fill_mel_filterbank_matrix(int n_mel, + int n_fft, + int sample_rate, // e.g. 16000 + float fmin = 0.0f, // e.g. 0.0 + float fmax = -1.0f, // e.g. 
sr/2; pass -1 for auto + bool slaney_area_norm = true, + float scale = 1.0f // optional extra scaling + ); +}; + +struct mtmd_audio_preprocessor { + const clip_hparams & hparams; + + mtmd_audio_preprocessor(const clip_ctx * ctx): hparams(*clip_get_hparams(ctx)) {} + + virtual ~mtmd_audio_preprocessor() = default; + virtual void initialize() = 0; // NOT thread-safe + virtual bool preprocess(const float * samples, size_t n_samples, std::vector<mtmd_audio_mel> & output) = 0; +}; + +struct mtmd_audio_preprocessor_whisper : mtmd_audio_preprocessor { + mtmd_audio_preprocessor_whisper(const clip_ctx * ctx) : mtmd_audio_preprocessor(ctx) {} + void initialize() override; + bool preprocess(const float * samples, size_t n_samples, std::vector<mtmd_audio_mel> & output) override; + + private: + mtmd_audio_cache cache; +}; + +struct mtmd_audio_preprocessor_conformer : mtmd_audio_preprocessor { + mtmd_audio_preprocessor_conformer(const clip_ctx * ctx) : mtmd_audio_preprocessor(ctx) {} + void initialize() override; + bool preprocess(const float * samples, size_t n_samples, std::vector<mtmd_audio_mel> & output) override; + + private: + mtmd_audio_cache cache; +}; + +// +// streaming ISTFT - converts spectrogram frames back to audio one frame at a time +// +struct mtmd_audio_streaming_istft { + mtmd_audio_streaming_istft(int n_fft, int hop_length); + + // reset streaming state + void reset(); + + // process a single STFT frame (streaming) + // frame_spectrum: [n_fft_bins x 2] interleaved real/imag + // returns: up to hop_length samples + std::vector<float> process_frame(const float * frame_spectrum); + + // flush remaining samples at end of stream + std::vector<float> flush(); + + private: + int n_fft; + int hop_length; + int n_fft_bins; + + // Own cache for output processing + mtmd_audio_cache cache; + + // Streaming state + std::vector<float> overlap_buffer; + std::vector<float> window_sum_buffer; + int padding_to_remove; + + // Working buffers for IFFT + std::vector<float> 
ifft_in; + std::vector<float> ifft_out; +}; diff --git a/llama.cpp/tools/mtmd/mtmd-cli.cpp b/llama.cpp/tools/mtmd/mtmd-cli.cpp new file mode 100644 index 0000000..054c7fa --- /dev/null +++ b/llama.cpp/tools/mtmd/mtmd-cli.cpp @@ -0,0 +1,437 @@ +#include "arg.h" +#include "debug.h" +#include "log.h" +#include "common.h" +#include "sampling.h" +#include "llama.h" +#include "ggml.h" +#include "console.h" +#include "chat.h" +#include "mtmd.h" +#include "mtmd-helper.h" + +#include <vector> +#include <limits.h> +#include <cinttypes> + +#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) +#include <signal.h> +#include <unistd.h> +#elif defined (_WIN32) +#define WIN32_LEAN_AND_MEAN +#ifndef NOMINMAX +#define NOMINMAX +#endif +#include <windows.h> +#include <signal.h> +#endif + +// volatile, because of signal being an interrupt +static volatile bool g_is_generating = false; +static volatile bool g_is_interrupted = false; + +/** + * Please note that this is NOT a production-ready stuff. + * It is a playground for trying multimodal support in llama.cpp. + * For contributors: please keep this code simple and easy to understand. 
+ */ + +static void show_additional_info(int /*argc*/, char ** argv) { + LOG( + "Experimental CLI for multimodal\n\n" + "Usage: %s [options] -m <model> --mmproj <mmproj> --image <image> --audio <audio> -p <prompt>\n\n" + " -m and --mmproj are required\n" + " -hf user/repo can replace both -m and --mmproj in most cases\n" + " --image, --audio and -p are optional, if NOT provided, the CLI will run in chat mode\n" + " to disable using GPU for mmproj model, add --no-mmproj-offload\n", + argv[0] + ); +} + +#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32) +static void sigint_handler(int signo) { + if (signo == SIGINT) { + if (g_is_generating) { + g_is_generating = false; + } else { + console::cleanup(); + if (g_is_interrupted) { + _exit(1); + } + g_is_interrupted = true; + } + } +} +#endif + +struct mtmd_cli_context { + mtmd::context_ptr ctx_vision; + common_init_result_ptr llama_init; + + llama_model * model; + llama_context * lctx; + const llama_vocab * vocab; + common_sampler * smpl; + llama_batch batch; + int n_batch; + + mtmd::bitmaps bitmaps; + + // chat template + common_chat_templates_ptr tmpls; + std::vector<common_chat_msg> chat_history; + bool use_jinja = false; + // TODO: support for --system-prompt with /clear command + + // support for legacy templates (models not having EOT token) + llama_tokens antiprompt_tokens; + + int n_threads = 1; + llama_pos n_past = 0; + + base_callback_data cb_data; + + mtmd_cli_context(common_params & params) : llama_init(common_init_from_params(params)) { + model = llama_init->model(); + lctx = llama_init->context(); + vocab = llama_model_get_vocab(model); + smpl = common_sampler_init(model, params.sampling); + n_threads = params.cpuparams.n_threads; + batch = llama_batch_init(1, 0, 1); // batch for next token generation + n_batch = params.n_batch; + + if (!model || !lctx) { + exit(1); + } + + if (!llama_model_chat_template(model, nullptr) && params.chat_template.empty()) { + 
LOG_ERR("Model does not have chat template.\n"); + LOG_ERR(" For old llava models, you may need to use '--chat-template vicuna'\n"); + LOG_ERR(" For MobileVLM models, use '--chat-template deepseek'\n"); + LOG_ERR(" For Mistral Small 3.1, use '--chat-template mistral-v7'\n"); + exit(1); + } + + tmpls = common_chat_templates_init(model, params.chat_template); + use_jinja = params.use_jinja; + chat_history.clear(); + LOG_INF("%s: chat template example:\n%s\n", __func__, common_chat_format_example(tmpls.get(), params.use_jinja, params.default_template_kwargs).c_str()); + + init_vision_context(params); + + // load antiprompt tokens for legacy templates + if (params.chat_template == "vicuna") { + antiprompt_tokens = common_tokenize(lctx, "ASSISTANT:", false, true); + } else if (params.chat_template == "deepseek") { + antiprompt_tokens = common_tokenize(lctx, "###", false, true); + } + } + + ~mtmd_cli_context() { + llama_batch_free(batch); + common_sampler_free(smpl); + } + + void init_vision_context(common_params & params) { + const char * clip_path = params.mmproj.path.c_str(); + mtmd_context_params mparams = mtmd_context_params_default(); + mparams.use_gpu = params.mmproj_use_gpu; + mparams.print_timings = true; + mparams.n_threads = params.cpuparams.n_threads; + mparams.flash_attn_type = params.flash_attn_type; + mparams.warmup = params.warmup; + mparams.image_min_tokens = params.image_min_tokens; + mparams.image_max_tokens = params.image_max_tokens; + if (std::getenv("MTMD_DEBUG_GRAPH") != nullptr) { + mparams.cb_eval_user_data = &cb_data; + mparams.cb_eval = common_debug_cb_eval<false>; + } + ctx_vision.reset(mtmd_init_from_file(clip_path, model, mparams)); + if (!ctx_vision.get()) { + LOG_ERR("Failed to load vision model from %s\n", clip_path); + exit(1); + } + } + + bool check_antiprompt(const llama_tokens & generated_tokens) { + if (antiprompt_tokens.empty() || generated_tokens.size() < antiprompt_tokens.size()) { + return false; + } + return std::equal( + 
generated_tokens.end() - antiprompt_tokens.size(), + generated_tokens.end(), + antiprompt_tokens.begin() + ); + } + + bool load_media(const std::string & fname) { + mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_file(ctx_vision.get(), fname.c_str())); + if (!bmp.ptr) { + return false; + } + bitmaps.entries.push_back(std::move(bmp)); + return true; + } +}; + +static int generate_response(mtmd_cli_context & ctx, int n_predict) { + llama_tokens generated_tokens; + for (int i = 0; i < n_predict; i++) { + if (i > n_predict || !g_is_generating || g_is_interrupted) { + LOG("\n"); + break; + } + + llama_token token_id = common_sampler_sample(ctx.smpl, ctx.lctx, -1); + generated_tokens.push_back(token_id); + common_sampler_accept(ctx.smpl, token_id, true); + + if (llama_vocab_is_eog(ctx.vocab, token_id) || ctx.check_antiprompt(generated_tokens)) { + LOG("\n"); + break; // end of generation + } + + LOG("%s", common_token_to_piece(ctx.lctx, token_id).c_str()); + fflush(stdout); + + if (g_is_interrupted) { + LOG("\n"); + break; + } + + // eval the token + common_batch_clear(ctx.batch); + common_batch_add(ctx.batch, token_id, ctx.n_past++, {0}, true); + if (llama_decode(ctx.lctx, ctx.batch)) { + LOG_ERR("failed to decode token\n"); + return 1; + } + } + + std::string generated_text = common_detokenize(ctx.lctx, generated_tokens); + common_chat_msg msg; + msg.role = "assistant"; + msg.content = generated_text; + ctx.chat_history.push_back(std::move(msg)); + + return 0; +} + +static std::string chat_add_and_format(mtmd_cli_context & ctx, common_chat_msg & new_msg) { + LOG_DBG("chat_add_and_format: new_msg.role='%s', new_msg.content='%s'\n", + new_msg.role.c_str(), new_msg.content.c_str()); + auto formatted = common_chat_format_single(ctx.tmpls.get(), ctx.chat_history, + new_msg, new_msg.role == "user", + ctx.use_jinja); + ctx.chat_history.push_back(new_msg); + return formatted; +} + +static int eval_message(mtmd_cli_context & ctx, common_chat_msg & msg) { + bool add_bos = 
ctx.chat_history.empty(); + auto formatted_chat = chat_add_and_format(ctx, msg); + LOG_DBG("formatted_chat.prompt: %s\n", formatted_chat.c_str()); + + mtmd_input_text text; + text.text = formatted_chat.c_str(); + text.add_special = add_bos; + text.parse_special = true; + + if (g_is_interrupted) return 0; + + mtmd::input_chunks chunks(mtmd_input_chunks_init()); + auto bitmaps_c_ptr = ctx.bitmaps.c_ptr(); + int32_t res = mtmd_tokenize(ctx.ctx_vision.get(), + chunks.ptr.get(), // output + &text, // text + bitmaps_c_ptr.data(), + bitmaps_c_ptr.size()); + if (res != 0) { + LOG_ERR("Unable to tokenize prompt, res = %d\n", res); + return 1; + } + + ctx.bitmaps.entries.clear(); + + llama_pos new_n_past; + if (mtmd_helper_eval_chunks(ctx.ctx_vision.get(), + ctx.lctx, // lctx + chunks.ptr.get(), // chunks + ctx.n_past, // n_past + 0, // seq_id + ctx.n_batch, // n_batch + true, // logits_last + &new_n_past)) { + LOG_ERR("Unable to eval prompt\n"); + return 1; + } + + ctx.n_past = new_n_past; + + LOG("\n"); + + return 0; +} + +int main(int argc, char ** argv) { + ggml_time_init(); + + common_params params; + + if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_MTMD, show_additional_info)) { + return 1; + } + + common_init(); + mtmd_helper_log_set(common_log_default_callback, nullptr); + + if (params.mmproj.path.empty()) { + show_additional_info(argc, argv); + LOG_ERR("ERR: Missing --mmproj argument\n"); + return 1; + } + + mtmd_cli_context ctx(params); + LOG_INF("%s: loading model: %s\n", __func__, params.model.path.c_str()); + + bool is_single_turn = !params.prompt.empty() && !params.image.empty(); + + int n_predict = params.n_predict < 0 ? 
INT_MAX : params.n_predict; + + // Ctrl+C handling + { +#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) + struct sigaction sigint_action; + sigint_action.sa_handler = sigint_handler; + sigemptyset (&sigint_action.sa_mask); + sigint_action.sa_flags = 0; + sigaction(SIGINT, &sigint_action, NULL); +#elif defined (_WIN32) + auto console_ctrl_handler = +[](DWORD ctrl_type) -> BOOL { + return (ctrl_type == CTRL_C_EVENT) ? (sigint_handler(SIGINT), true) : false; + }; + SetConsoleCtrlHandler(reinterpret_cast<PHANDLER_ROUTINE>(console_ctrl_handler), true); +#endif + } + + if (g_is_interrupted) return 130; + + auto eval_system_prompt_if_present = [&] { + if (params.system_prompt.empty()) { + return 0; + } + + common_chat_msg msg; + msg.role = "system"; + msg.content = params.system_prompt; + return eval_message(ctx, msg); + }; + + LOG_WRN("WARN: This is an experimental CLI for testing multimodal capability.\n"); + LOG_WRN(" For normal use cases, please use the standard llama-cli\n"); + + if (eval_system_prompt_if_present()) { + return 1; + } + + if (is_single_turn) { + g_is_generating = true; + if (params.prompt.find(mtmd_default_marker()) == std::string::npos) { + for (size_t i = 0; i < params.image.size(); i++) { + // most models require the marker before each image + // ref: https://github.com/ggml-org/llama.cpp/pull/17616 + params.prompt = mtmd_default_marker() + params.prompt; + } + } + + common_chat_msg msg; + msg.role = "user"; + msg.content = params.prompt; + for (const auto & image : params.image) { + if (!ctx.load_media(image)) { + return 1; // error is already printed by libmtmd + } + } + if (eval_message(ctx, msg)) { + return 1; + } + if (!g_is_interrupted && generate_response(ctx, n_predict)) { + return 1; + } + + } else { + LOG("\n Running in chat mode, available commands:"); + if (mtmd_support_vision(ctx.ctx_vision.get())) { + LOG("\n /image <path> load an image"); + } + if (mtmd_support_audio(ctx.ctx_vision.get())) { + LOG("\n /audio 
<path> load an audio"); + } + LOG("\n /clear clear the chat history"); + LOG("\n /quit or /exit exit the program"); + LOG("\n"); + + std::string content; + + while (!g_is_interrupted) { + g_is_generating = false; + LOG("\n> "); + console::set_display(DISPLAY_TYPE_USER_INPUT); + std::string line; + console::readline(line, false); + if (g_is_interrupted) break; + console::set_display(DISPLAY_TYPE_RESET); + line = string_strip(line); + if (line.empty()) { + continue; + } + if (line == "/quit" || line == "/exit") { + break; + } + if (line == "/clear") { + ctx.n_past = 0; + ctx.chat_history.clear(); + llama_memory_clear(llama_get_memory(ctx.lctx), true); + if (eval_system_prompt_if_present()) { + return 1; + } + LOG("Chat history cleared\n\n"); + continue; + } + g_is_generating = true; + bool is_image = line == "/image" || line.find("/image ") == 0; + bool is_audio = line == "/audio" || line.find("/audio ") == 0; + if (is_image || is_audio) { + if (line.size() < 8) { + LOG_ERR("ERR: Missing media filename\n"); + continue; + } + std::string media_path = line.substr(7); + if (ctx.load_media(media_path)) { + LOG("%s %s loaded\n", media_path.c_str(), is_image ? "image" : "audio"); + content += mtmd_default_marker(); + } + // else, error is already printed by libmtmd + continue; + } else { + content += line; + } + common_chat_msg msg; + msg.role = "user"; + msg.content = content; + int ret = eval_message(ctx, msg); + if (ret) { + return 1; + } + if (g_is_interrupted) break; + if (generate_response(ctx, n_predict)) { + return 1; + } + content.clear(); + } + } + if (g_is_interrupted) LOG("\nInterrupted by user\n"); + LOG("\n\n"); + llama_perf_context_print(ctx.lctx); + return g_is_interrupted ? 
130 : 0; +} diff --git a/llama.cpp/tools/mtmd/mtmd-helper.cpp b/llama.cpp/tools/mtmd/mtmd-helper.cpp new file mode 100644 index 0000000..902a4b4 --- /dev/null +++ b/llama.cpp/tools/mtmd/mtmd-helper.cpp @@ -0,0 +1,521 @@ +// fix problem with std::min and std::max +#if defined(_WIN32) +#define WIN32_LEAN_AND_MEAN +#ifndef NOMINMAX +# define NOMINMAX +#endif +#include <windows.h> +#endif + +#include "mtmd.h" +#include "mtmd-helper.h" +#include "llama.h" + +#include <algorithm> +#include <cinttypes> +#include <vector> + +//#define MTMD_AUDIO_DEBUG + +#define MINIAUDIO_IMPLEMENTATION +#ifndef MTMD_AUDIO_DEBUG +# define MA_NO_ENCODING +#endif +#define MA_NO_DEVICE_IO +#define MA_NO_RESOURCE_MANAGER +#define MA_NO_NODE_GRAPH +#define MA_NO_ENGINE +#define MA_NO_GENERATION +#define MA_API static +#include "miniaudio/miniaudio.h" + +#define STB_IMAGE_IMPLEMENTATION +#include "stb/stb_image.h" + +#ifdef MTMD_INTERNAL_HEADER +#error "mtmd-helper is a public library outside of mtmd. it must not include internal headers" +#endif + +// +// internal logging functions +// + +struct mtmd_helper_logger { + ggml_log_callback default_callback = [](ggml_log_level level, const char * text, void * user_data) { + (void) level; + (void) user_data; + fputs(text, stderr); + fflush(stderr); + }; + + ggml_log_callback log_callback = default_callback; + void * log_callback_user_data; + + void log_v(enum ggml_log_level level, const char * format, va_list args) { + if (format == NULL) { + return; + } + va_list args_copy; + va_copy(args_copy, args); + char buffer[128]; + int len = vsnprintf(buffer, 128, format, args); + if (len < 128) { + log_callback(level, buffer, log_callback_user_data); + } else { + char * buffer2 = (char *) calloc(len + 1, sizeof(char)); + vsnprintf(buffer2, len + 1, format, args_copy); + buffer2[len] = 0; + log_callback(level, buffer2, log_callback_user_data); + free(buffer2); + } + va_end(args_copy); + } + + void log(enum ggml_log_level level, const char * format, ...) 
{ + va_list args; + va_start(args, format); + log_v(level, format, args); + va_end(args); + } +} g_logger; + +#define LOG_INF(...) g_logger.log(GGML_LOG_LEVEL_INFO, __VA_ARGS__) +#define LOG_WRN(...) g_logger.log(GGML_LOG_LEVEL_WARN, __VA_ARGS__) +#define LOG_ERR(...) g_logger.log(GGML_LOG_LEVEL_ERROR, __VA_ARGS__) + +void mtmd_helper_log_set(ggml_log_callback log_callback, void * user_data) { + if (log_callback == nullptr) { + log_callback = g_logger.default_callback; + } + g_logger.log_callback = log_callback; + g_logger.log_callback_user_data = user_data; + mtmd_log_set(log_callback, user_data); +} + +// +// helper functions +// + +size_t mtmd_helper_get_n_tokens(const mtmd_input_chunks * chunks) { + size_t n_tokens = 0; + for (size_t i = 0; i < mtmd_input_chunks_size(chunks); i++) { + auto chunk = mtmd_input_chunks_get(chunks, i); + n_tokens += mtmd_input_chunk_get_n_tokens(chunk); + } + return n_tokens; +} + +llama_pos mtmd_helper_get_n_pos(const mtmd_input_chunks * chunks) { + llama_pos n_pos = 0; + for (size_t i = 0; i < mtmd_input_chunks_size(chunks); i++) { + auto chunk = mtmd_input_chunks_get(chunks, i); + n_pos += mtmd_input_chunk_get_n_pos(chunk); + } + return n_pos; +} + +// helper struct to make working with embd batch easier +// note: this will be removed after llama_batch_ext refactoring +struct decode_embd_batch { + int n_pos_per_embd; + int n_mmproj_embd; + std::vector<llama_pos> pos; + std::vector<llama_pos> pos_view; // used by mrope + std::vector<int32_t> n_seq_id; + std::vector<llama_seq_id> seq_id_0; + std::vector<llama_seq_id *> seq_ids; + std::vector<int8_t> logits; + llama_batch batch; + decode_embd_batch(float * embd, int32_t n_tokens, int n_pos_per_embd, int n_mmproj_embd) : n_pos_per_embd(n_pos_per_embd), n_mmproj_embd(n_mmproj_embd) { + pos .resize(n_tokens * n_pos_per_embd); + n_seq_id.resize(n_tokens); + seq_ids .resize(n_tokens + 1); + logits .resize(n_tokens); + seq_id_0.resize(1); + seq_ids [n_tokens] = nullptr; + batch = { + 
/*n_tokens =*/ n_tokens, + /*tokens =*/ nullptr, + /*embd =*/ embd, + /*pos =*/ pos.data(), + /*n_seq_id =*/ n_seq_id.data(), + /*seq_id =*/ seq_ids.data(), + /*logits =*/ logits.data(), + }; + } + + void set_position_normal(llama_pos pos_0, llama_seq_id seq_id) { + seq_id_0[0] = seq_id; + for (int i = 0; i < batch.n_tokens; i++) { + batch.pos [i] = pos_0 + i; + batch.n_seq_id[i] = 1; + batch.seq_id [i] = seq_id_0.data(); + batch.logits [i] = false; + } + } + + // M-RoPE for image + void set_position_mrope_2d(llama_pos pos_0, int nx, int ny, llama_seq_id seq_id) { + GGML_ASSERT(n_pos_per_embd == 4); + seq_id_0[0] = seq_id; + for (int y = 0; y < ny; y++) { + for (int x = 0; x < nx; x++) { + int i = y * nx + x; + pos[i ] = pos_0; + pos[i + batch.n_tokens ] = pos_0 + y; + pos[i + batch.n_tokens * 2] = pos_0 + x; + pos[i + batch.n_tokens * 3] = 0; // last pos dim is unused + } + } + for (int i = 0; i < batch.n_tokens; i++) { + batch.n_seq_id[i] = 1; + batch.seq_id [i] = seq_id_0.data(); + batch.logits [i] = false; + } + } + + // M-RoPE for audio + void set_position_mrope_1d(llama_pos pos_0, llama_seq_id seq_id) { + GGML_ASSERT(n_pos_per_embd == 4); + seq_id_0[0] = seq_id; + for (int i = 0; i < batch.n_tokens; i++) { + pos[i ] = pos_0 + i; + pos[i + batch.n_tokens ] = pos_0 + i; + pos[i + batch.n_tokens * 2] = pos_0 + i; + pos[i + batch.n_tokens * 3] = 0; // last pos dim is unused + } + for (int i = 0; i < batch.n_tokens; i++) { + batch.n_seq_id[i] = 1; + batch.seq_id [i] = seq_id_0.data(); + batch.logits [i] = false; + } + } + + llama_batch get_view(int offset, int n_tokens) { + llama_pos * pos_ptr; + pos_view.clear(); + pos_view.reserve(n_tokens * n_pos_per_embd); + if (n_pos_per_embd > 1) { + // mrope + // for example, with layout of src: 1234...1234...1234...1234... + // offset 2 will give us dst: 34...34...34...34... 
+ for (int i = 0; i < n_pos_per_embd; i++) { + // assume n_tokens is less than or equal to batch.n_tokens + // batch.n_tokens is number of **total** tokens + // n_tokens is number of viewed token + size_t src_idx = i * batch.n_tokens + offset; + pos_view.insert(pos_view.end(), + pos.data() + src_idx, + pos.data() + src_idx + n_tokens); + } + pos_ptr = pos_view.data(); + } else { + // normal + pos_ptr = pos.data() + offset; + } + return { + /*n_tokens =*/ n_tokens, + /*tokens =*/ nullptr, + /*embd =*/ batch.embd + offset * n_mmproj_embd, + /*pos =*/ pos_ptr, + /*n_seq_id =*/ batch.n_seq_id + offset, + /*seq_id =*/ batch.seq_id + offset, + /*logits =*/ batch.logits + offset, + }; + } +}; + +// Helper function for decoding an image whose embeddings have already been calculated +int32_t mtmd_helper_decode_image_chunk( + mtmd_context * ctx, + struct llama_context * lctx, + const mtmd_input_chunk * chunk, + float * encoded_embd, + llama_pos n_past, + llama_seq_id seq_id, + int32_t n_batch, + llama_pos * new_n_past) { + auto chunk_type = mtmd_input_chunk_get_type(chunk); + const char * name = chunk_type == MTMD_INPUT_CHUNK_TYPE_IMAGE ? "image" : "audio"; + if (chunk_type == MTMD_INPUT_CHUNK_TYPE_TEXT) { + LOG_ERR("failed to decode chunk: input chunk not of image/audio type\n"); + return -1; + } + + const llama_model * model = llama_get_model(lctx); + int n_mmproj_embd = llama_model_n_embd_inp(model); + int n_pos_per_embd = mtmd_decode_use_mrope(ctx) ? 
4 : 1; + + int32_t n_tokens = mtmd_input_chunk_get_n_tokens(chunk); + int32_t i_batch = 0; + int32_t n_img_batches = GGML_PAD(n_tokens, n_batch) / n_batch; + decode_embd_batch batch_embd(encoded_embd, n_tokens, n_pos_per_embd, n_mmproj_embd); + + if (mtmd_decode_use_mrope(ctx)) { + if (chunk_type == MTMD_INPUT_CHUNK_TYPE_IMAGE) { + const auto image_tokens = mtmd_input_chunk_get_tokens_image(chunk); + if (!image_tokens) { + LOG_ERR("failed to decode chunk: image tokens are null\n"); + return -1; + } + const int nx = mtmd_image_tokens_get_nx(image_tokens); + const int ny = mtmd_image_tokens_get_ny(image_tokens); + batch_embd.set_position_mrope_2d(n_past, nx, ny, seq_id); + } else if (chunk_type == MTMD_INPUT_CHUNK_TYPE_AUDIO) { + batch_embd.set_position_mrope_1d(n_past, seq_id); + } else { + GGML_ABORT("invalid chunk type for M-RoPE"); + } + } else { + batch_embd.set_position_normal(n_past, seq_id); + } + + if (mtmd_decode_use_non_causal(ctx)) { + llama_set_causal_attn(lctx, false); + // TODO @ngxson : need to make sure only one image is processed at a time, and n_ubatch must be enough to hold the image + } + + while (i_batch < n_img_batches) { // split into batches + int pos_offset = i_batch*n_batch; + int n_tokens_batch = std::min(n_batch, n_tokens - pos_offset); + llama_batch batch_embd_view = batch_embd.get_view(pos_offset, n_tokens_batch); + + LOG_INF("decoding %s batch %d/%d, n_tokens_batch = %d\n", name, i_batch+1, n_img_batches, n_tokens_batch); + + int64_t t1 = ggml_time_ms(); + int32_t ret = llama_decode(lctx, batch_embd_view); + if (ret != 0) { + LOG_ERR("failed to decode %s\n", name); + llama_set_causal_attn(lctx, true); // restore causal attn + return ret; + } + + LOG_INF("%s decoded (batch %d/%d) in %" PRId64 " ms\n", name, i_batch+1, n_img_batches, ggml_time_ms() - t1); + + i_batch++; + } + + n_past += mtmd_input_chunk_get_n_pos(chunk); + *new_n_past = n_past; + + if (mtmd_decode_use_non_causal(ctx)) { + llama_set_causal_attn(lctx, true); + } + return 
0; +} + +int32_t mtmd_helper_eval_chunk_single(mtmd_context * ctx, + struct llama_context * lctx, + const mtmd_input_chunk * chunk, + llama_pos n_past, + llama_seq_id seq_id, + int32_t n_batch, + bool logits_last, + llama_pos * new_n_past) { + int32_t ret; + llama_batch text_batch = llama_batch_init(n_batch, 0, 1); + auto chunk_type = mtmd_input_chunk_get_type(chunk); + + if (chunk_type == MTMD_INPUT_CHUNK_TYPE_TEXT) { + size_t n_tokens; + const auto tokens = mtmd_input_chunk_get_tokens_text(chunk, &n_tokens); + // LOG_INF("decoding text chunk, n_tokens = %zu\n", n_tokens); + size_t i = 0; + while (i < n_tokens) { // split into batches + text_batch.n_tokens = 0; // clear the batch + for (; i < n_tokens && text_batch.n_tokens < n_batch; i++) { + int32_t j = text_batch.n_tokens; + text_batch.token [j] = tokens[i]; + text_batch.pos [j] = n_past++; + text_batch.n_seq_id[j] = 1; + text_batch.seq_id [j][0] = seq_id; + text_batch.logits [j] = false; + + text_batch.n_tokens++; + } + bool is_last_token = (i == n_tokens); + if (logits_last && is_last_token) { + text_batch.logits[text_batch.n_tokens - 1] = true; + } + ret = llama_decode(lctx, text_batch); + if (ret != 0) { + LOG_ERR("failed to decode text\n"); + llama_batch_free(text_batch); + return ret; + } + *new_n_past += text_batch.n_tokens; + } + + } else if (chunk_type == MTMD_INPUT_CHUNK_TYPE_IMAGE || chunk_type == MTMD_INPUT_CHUNK_TYPE_AUDIO) { + const char * name = chunk_type == MTMD_INPUT_CHUNK_TYPE_IMAGE ? 
"image" : "audio"; + int64_t t0 = ggml_time_ms(); + + LOG_INF("encoding %s slice...\n", name); + + ret = mtmd_encode_chunk(ctx, chunk); + if (ret != 0) { + LOG_ERR("failed to encode %s slice\n", name); + llama_batch_free(text_batch); + return ret; + } + + LOG_INF("%s slice encoded in %" PRId64 " ms\n", name, ggml_time_ms() - t0); + + float * embd = mtmd_get_output_embd(ctx); + ret = mtmd_helper_decode_image_chunk(ctx, lctx, chunk, embd, n_past, seq_id, n_batch, new_n_past); + if (ret != 0) { + LOG_ERR("failed to decode %s\n", name); + llama_batch_free(text_batch); + return ret; + } + } else { + GGML_ABORT("chunk type not supported"); + } + + llama_batch_free(text_batch); + return 0; +} + +int32_t mtmd_helper_eval_chunks(mtmd_context * ctx, + struct llama_context * lctx, + const mtmd_input_chunks * chunks, + llama_pos n_past, + llama_seq_id seq_id, + int32_t n_batch, + bool logits_last, + llama_pos * new_n_past) { + size_t n_chunks = mtmd_input_chunks_size(chunks); + if (n_chunks == 0) { + LOG_WRN("no chunks to eval\n"); + return 0; + } + + for (size_t i = 0; i < n_chunks; i++) { + bool chunk_logits_last = (i == n_chunks - 1) && logits_last; + auto chunk = mtmd_input_chunks_get(chunks, i); + + int32_t res = mtmd_helper_eval_chunk_single(ctx, lctx, chunk, n_past, seq_id, n_batch, chunk_logits_last, &n_past); + if (res != 0) { + LOG_ERR("failed to eval chunk %zu\n", i); + return res; + } + *new_n_past = n_past; + } + + return 0; +} + +namespace audio_helpers { + +static bool is_audio_file(const char * buf, size_t len) { + if (len < 12) { + return false; + } + + // RIFF ref: https://en.wikipedia.org/wiki/Resource_Interchange_File_Format + // WAV ref: https://www.mmsp.ece.mcgill.ca/Documents/AudioFormats/WAVE/WAVE.html + bool is_wav = memcmp(buf, "RIFF", 4) == 0 && memcmp(buf + 8, "WAVE", 4) == 0; + bool is_mp3 = len >= 3 && ( + memcmp(buf, "ID3", 3) == 0 || + // Check for MPEG sync word (simplified check) + ((unsigned char)buf[0] == 0xFF && ((unsigned char)buf[1] & 
0xE0) == 0xE0) + ); + bool is_flac = memcmp(buf, "fLaC", 4) == 0; + + return is_wav || is_mp3 || is_flac; +} + +// returns true if the buffer is a valid audio file +static bool decode_audio_from_buf(const unsigned char * buf_in, size_t len, int target_sampler_rate, std::vector<float> & pcmf32_mono) { + ma_result result; + const int channels = 1; + ma_decoder_config decoder_config = ma_decoder_config_init(ma_format_f32, channels, target_sampler_rate); + ma_decoder decoder; + + result = ma_decoder_init_memory(buf_in, len, &decoder_config, &decoder); + if (result != MA_SUCCESS) { + return false; + } + + ma_uint64 frame_count; + ma_uint64 frames_read; + result = ma_decoder_get_length_in_pcm_frames(&decoder, &frame_count); + if (result != MA_SUCCESS) { + ma_decoder_uninit(&decoder); + return false; + } + + pcmf32_mono.resize(frame_count); + result = ma_decoder_read_pcm_frames(&decoder, pcmf32_mono.data(), frame_count, &frames_read); + if (result != MA_SUCCESS) { + ma_decoder_uninit(&decoder); + return false; + } + +#ifdef MTMD_AUDIO_DEBUG + // save audio to wav file + ma_encoder_config config = ma_encoder_config_init(ma_encoding_format_wav, ma_format_f32, 1, target_sampler_rate); + ma_encoder encoder; + ma_encoder_init_file("output.wav", &config, &encoder); + ma_encoder_write_pcm_frames(&encoder, pcmf32_mono.data(), pcmf32_mono.size(), &frames_read); + ma_encoder_uninit(&encoder); +#endif + + ma_decoder_uninit(&decoder); + return true; +} + +} // namespace audio_helpers + +mtmd_bitmap * mtmd_helper_bitmap_init_from_buf(mtmd_context * ctx, const unsigned char * buf, size_t len) { + if (audio_helpers::is_audio_file((const char *)buf, len)) { + std::vector<float> pcmf32; + int bitrate = mtmd_get_audio_bitrate(ctx); + if (bitrate < 0) { + LOG_ERR("This model does not support audio input\n"); + return nullptr; + } + if (!audio_helpers::decode_audio_from_buf(buf, len, bitrate, pcmf32)) { + LOG_ERR("Unable to read WAV audio file from buffer\n"); + return nullptr; + } + return 
mtmd_bitmap_init_from_audio(pcmf32.size(), pcmf32.data()); + } + + // otherwise, we assume it's an image + mtmd_bitmap * result = nullptr; + { + int nx, ny, nc; + auto * data = stbi_load_from_memory(buf, len, &nx, &ny, &nc, 3); + if (!data) { + LOG_ERR("%s: failed to decode image bytes\n", __func__); + return nullptr; + } + result = mtmd_bitmap_init(nx, ny, data); + stbi_image_free(data); + } + return result; +} + +mtmd_bitmap * mtmd_helper_bitmap_init_from_file(mtmd_context * ctx, const char * fname) { + std::vector<unsigned char> buf; + FILE * f = fopen(fname, "rb"); + if (!f) { + LOG_ERR("Unable to open file %s: %s\n", fname, strerror(errno)); + return nullptr; + } + + fseek(f, 0, SEEK_END); + long file_size = ftell(f); + fseek(f, 0, SEEK_SET); + buf.resize(file_size); + + size_t n_read = fread(buf.data(), 1, file_size, f); + fclose(f); + if (n_read != (size_t)file_size) { + LOG_ERR("Failed to read entire file %s", fname); + return nullptr; + } + + return mtmd_helper_bitmap_init_from_buf(ctx, buf.data(), buf.size()); +} diff --git a/llama.cpp/tools/mtmd/mtmd-helper.h b/llama.cpp/tools/mtmd/mtmd-helper.h new file mode 100644 index 0000000..5036b92 --- /dev/null +++ b/llama.cpp/tools/mtmd/mtmd-helper.h @@ -0,0 +1,96 @@ +#ifndef MTMD_HELPER_H +#define MTMD_HELPER_H + +#include "ggml.h" +#include "llama.h" +#include "mtmd.h" + +#include <stddef.h> +#include <stdint.h> +#include <stdbool.h> + +#ifdef __cplusplus +extern "C" { +#endif + +// +// libmtmd helper functions +// +// Please note that these helpers are not guaranteed to be stable. +// BREAKING CHANGES are expected. +// + +// Set callback for all future logging events. +// If this is not called, or NULL is supplied, everything is output on stderr. 
+// Note: this also calls mtmd_log_set() internally +MTMD_API void mtmd_helper_log_set(ggml_log_callback log_callback, void * user_data); + +// helper function to construct a mtmd_bitmap from a file +// it calls mtmd_helper_bitmap_init_from_buf() internally +// returns nullptr on failure +// this function is thread-safe +MTMD_API mtmd_bitmap * mtmd_helper_bitmap_init_from_file(mtmd_context * ctx, const char * fname); + +// helper function to construct a mtmd_bitmap from a buffer containing a file +// supported formats: +// image: formats supported by stb_image: jpg, png, bmp, gif, etc. +// audio: formats supported by miniaudio: wav, mp3, flac +// note: audio files will be auto-detected based on magic bytes +// returns nullptr on failure +// this function is thread-safe +MTMD_API mtmd_bitmap * mtmd_helper_bitmap_init_from_buf(mtmd_context * ctx, const unsigned char * buf, size_t len); + +// helper to count the total number of tokens from a list of chunks, useful to keep track of KV cache +MTMD_API size_t mtmd_helper_get_n_tokens(const mtmd_input_chunks * chunks); + +// helper to count the total position of tokens from a list of chunks, useful to keep track of n_past +// normally, n_pos is equal to n_tokens, but for M-RoPE it is different +MTMD_API llama_pos mtmd_helper_get_n_pos(const mtmd_input_chunks * chunks); + +// helper function that automatically: +// 1. run llama_decode() on text chunks +// 2. 
run mtmd_encode() on image chunks, then mtmd_get_output_embd() and then llama_decode() +// if any of the mtmd_encode() or llama_decode() calls return non-zero, stop and forward the error +// otherwise, returns 0 on success +// this function is NOT thread-safe +MTMD_API int32_t mtmd_helper_eval_chunks(mtmd_context * ctx, + struct llama_context * lctx, + const mtmd_input_chunks * chunks, + llama_pos n_past, + llama_seq_id seq_id, + int32_t n_batch, + bool logits_last, + llama_pos * new_n_past); + +// works like mtmd_helper_eval_chunks(), but only for a single chunk +// this function is NOT thread-safe +MTMD_API int32_t mtmd_helper_eval_chunk_single(mtmd_context * ctx, + struct llama_context * lctx, + const mtmd_input_chunk * chunk, + llama_pos n_past, + llama_seq_id seq_id, + int32_t n_batch, + bool logits_last, + llama_pos * new_n_past); + +// helper function to decode an image whose embeddings have already been calculated +// this helper will handle batching and pre/post decoding setup (for ex. 
gemma 3 requires non-causal attention) +// returns 0 on success, -1 if the chunk is not a valid image chunk, 1 on decode failure +MTMD_API int32_t mtmd_helper_decode_image_chunk(mtmd_context * ctx, + struct llama_context * lctx, + const mtmd_input_chunk * chunk, + float * encoded_embd, + llama_pos n_past, + llama_seq_id seq_id, + int32_t n_batch, + llama_pos * new_n_past); + +#ifdef __cplusplus +} // extern "C" +#endif + +// +// C++ wrappers +// + +#endif diff --git a/llama.cpp/tools/mtmd/mtmd.cpp b/llama.cpp/tools/mtmd/mtmd.cpp new file mode 100644 index 0000000..b763627 --- /dev/null +++ b/llama.cpp/tools/mtmd/mtmd.cpp @@ -0,0 +1,1151 @@ +#include "clip.h" +#include "clip-impl.h" +#include "mtmd.h" +#include "mtmd-audio.h" + +#include "llama.h" + +// fix problem with std::min and std::max +#if defined(_WIN32) +#define WIN32_LEAN_AND_MEAN +#ifndef NOMINMAX +# define NOMINMAX +#endif +#include <windows.h> +#endif + +#include <algorithm> +#include <cerrno> +#include <cstdio> +#include <cstdlib> +#include <cstring> +#include <vector> + +// represents raw image data, layout is RGBRGBRGB... 
+// length of data must be nx * ny * 3 +struct mtmd_bitmap { + uint32_t nx; + uint32_t ny; + std::vector<unsigned char> data; + std::string id; // optional user-defined id, for ex: can be set to image hash, useful for KV cache tracking + bool is_audio = false; // true if the bitmap is audio +}; + +struct mtmd_image_tokens { + uint32_t nx; // number of tokens in x direction + uint32_t ny; // number of tokens in y direction + bool use_mrope_pos = false; // use M-RoPE position counting (the whole image is 1 temporal position) + uint32_t n_tokens() const { return nx * ny; } + clip_image_f32_batch batch_f32; // preprocessed image patches + std::string id; // optional user-defined ID, useful for KV cache tracking + + mtmd_image_tokens clone() { + return mtmd_image_tokens{ + nx, + ny, + use_mrope_pos, + batch_f32.clone(), + id + }; + } +}; +using mtmd_image_tokens_ptr = std::unique_ptr<mtmd_image_tokens>; + +struct mtmd_audio_tokens { + uint32_t n_tokens; // number of tokens + clip_image_f32_batch batch_f32; // preprocessed image patches + std::string id; // optional user-defined ID, useful for KV cache tracking + + mtmd_audio_tokens clone() { + return mtmd_audio_tokens{ + n_tokens, + batch_f32.clone(), + id + }; + } +}; +using mtmd_audio_tokens_ptr = std::unique_ptr<mtmd_audio_tokens>; + +struct mtmd_input_chunk { + mtmd_input_chunk_type type; + std::vector<llama_token> tokens_text; + mtmd_image_tokens_ptr tokens_image; + mtmd_audio_tokens_ptr tokens_audio; +}; + +struct mtmd_input_chunks { + std::vector<mtmd_input_chunk> entries; +}; + +// slice template, used by some llava-uhd models to correctly place the special tokens around image embeddings +// models not having it (llava-1.6) will process embeddings without any special tokens in-between +enum mtmd_slice_tmpl { + MTMD_SLICE_TMPL_NONE, + MTMD_SLICE_TMPL_MINICPMV_2_5, + MTMD_SLICE_TMPL_MINICPMV_2_6, + MTMD_SLICE_TMPL_LLAMA4, + MTMD_SLICE_TMPL_IDEFICS3, + MTMD_SLICE_TMPL_LFM2, +}; + +const char * mtmd_default_marker() 
{ + return "<__media__>"; +} + +static clip_flash_attn_type mtmd_get_clip_flash_attn_type(enum llama_flash_attn_type flash_attn_type) { + switch (flash_attn_type) { + case LLAMA_FLASH_ATTN_TYPE_AUTO: return CLIP_FLASH_ATTN_TYPE_AUTO; + case LLAMA_FLASH_ATTN_TYPE_DISABLED: return CLIP_FLASH_ATTN_TYPE_DISABLED; + case LLAMA_FLASH_ATTN_TYPE_ENABLED: return CLIP_FLASH_ATTN_TYPE_ENABLED; + } + return CLIP_FLASH_ATTN_TYPE_AUTO; +} + +mtmd_context_params mtmd_context_params_default() { + mtmd_context_params params { + /* use_gpu */ true, + /* print_timings */ true, + /* n_threads */ 4, + /* image_marker */ MTMD_DEFAULT_IMAGE_MARKER, + /* media_marker */ mtmd_default_marker(), + /* flash_attn_type */ LLAMA_FLASH_ATTN_TYPE_AUTO, + /* warmup */ true, + /* image_min_tokens */ -1, + /* image_max_tokens */ -1, + /* cb_eval */ nullptr, + /* cb_eval_user_data */ nullptr, + }; + return params; +} + +struct mtmd_context { + struct clip_ctx * ctx_v; // vision + struct clip_ctx * ctx_a; // audio + const struct llama_model * text_model; + std::vector<float> image_embd_v; // image embedding vector + + bool print_timings; + int n_threads; + std::string media_marker; + const int n_embd_text; + + // these are not token, but strings used to mark the beginning and end of image/audio embeddings + std::string img_beg; + std::string img_end; + std::string aud_beg; + std::string aud_end; + + // for llava-uhd style models, we need special tokens in-between slices + // minicpmv calls them "slices", llama 4 calls them "tiles" + mtmd_slice_tmpl slice_tmpl = MTMD_SLICE_TMPL_NONE; + std::vector<llama_token> tok_ov_img_start; // overview image + std::vector<llama_token> tok_ov_img_end; // overview image + std::vector<llama_token> tok_slices_start; // start of all slices + std::vector<llama_token> tok_slices_end; // end of all slices + std::vector<llama_token> tok_sli_img_start; // single slice start + std::vector<llama_token> tok_sli_img_end; // single slice end + std::vector<llama_token> 
tok_sli_img_mid; // between 2 slices + std::vector<llama_token> tok_row_end; // end of row + bool tok_row_end_trail = false; + bool ov_img_first = false; + + // string template for slice image delimiters with row/col (idefics3) + std::string sli_img_start_tmpl; + + std::unique_ptr<mtmd_audio_preprocessor> audio_preproc; + + // TODO @ngxson : add timings + + mtmd_context(const char * mmproj_fname, + const llama_model * text_model, + const mtmd_context_params & ctx_params) : + text_model (text_model), + print_timings(ctx_params.print_timings), + n_threads (ctx_params.n_threads), + media_marker (ctx_params.media_marker), + n_embd_text (llama_model_n_embd_inp(text_model)) + { + if (std::string(ctx_params.image_marker) != MTMD_DEFAULT_IMAGE_MARKER) { + throw std::runtime_error("custom image_marker is not supported anymore, use media_marker instead"); + } + + if (media_marker.empty()) { + throw std::runtime_error("media_marker must not be empty"); + } + + clip_context_params ctx_clip_params { + /* use_gpu */ ctx_params.use_gpu, + /* flash_attn_type */ CLIP_FLASH_ATTN_TYPE_AUTO, + /* image_min_tokens */ ctx_params.image_min_tokens, + /* image_max_tokens */ ctx_params.image_max_tokens, + /* warmup */ ctx_params.warmup, + /* cb_eval */ ctx_params.cb_eval, + /* cb_eval_user_data */ ctx_params.cb_eval_user_data, + }; + + auto res = clip_init(mmproj_fname, ctx_clip_params); + ctx_v = res.ctx_v; + ctx_a = res.ctx_a; + if (!ctx_v && !ctx_a) { + throw std::runtime_error(string_format("Failed to load CLIP model from %s\n", mmproj_fname)); + } + + // if both vision and audio mmproj are present, we need to validate their n_embd + if (ctx_v && ctx_a) { + int n_embd_v = clip_n_mmproj_embd(ctx_v); + int n_embd_a = clip_n_mmproj_embd(ctx_a); + if (n_embd_v != n_embd_a) { + throw std::runtime_error(string_format( + "mismatch between vision and audio mmproj (n_embd_v = %d, n_embd_a = %d)\n", + n_embd_v, n_embd_a)); + } + } + + // since we already validate n_embd of vision and audio 
mmproj,
        // we can safely assume that they are the same
        int n_embd_clip = clip_n_mmproj_embd(ctx_v ? ctx_v : ctx_a);
        if (n_embd_text != n_embd_clip) {
            throw std::runtime_error(string_format(
                "mismatch between text model (n_embd = %d) and mmproj (n_embd = %d)\n"
                "hint: you may be using wrong mmproj\n",
                n_embd_text, n_embd_clip));
        }
        if (ctx_v) {
            init_vision();
        }
        if (ctx_a) {
            init_audio();
        }
    }

    // configure per-model slice template and begin/end marker strings for vision;
    // each branch encodes one model family's expected token layout around images
    void init_vision() {
        GGML_ASSERT(ctx_v != nullptr);

        projector_type proj = clip_get_projector_type(ctx_v);
        int minicpmv_version = clip_is_minicpmv(ctx_v);
        if (minicpmv_version == 2) {
            // minicpmv 2.5 format:
            // <image> (overview) </image><slice><image> (slice) </image><image> (slice) </image>\n ... </slice>
            slice_tmpl        = MTMD_SLICE_TMPL_MINICPMV_2_5;
            tok_ov_img_start  = {lookup_token("<image>")};
            tok_ov_img_end    = {lookup_token("</image>")};
            tok_slices_start  = {lookup_token("<slice>")};
            tok_slices_end    = {lookup_token("</slice>")};
            tok_sli_img_start = tok_ov_img_start; // slices reuse the overview delimiters
            tok_sli_img_end   = tok_ov_img_end;
            tok_row_end       = {lookup_token("\n")};
            tok_row_end_trail = false; // no trailing end-of-row token
            ov_img_first      = true;

        } else if (minicpmv_version == 3 || minicpmv_version == 4 || minicpmv_version == 5 || minicpmv_version == 6 || minicpmv_version == 100045) {
            // minicpmv 2.6 format:
            // <image> (overview) </image><slice> (slice) </slice><slice> (slice) </slice>\n ...
            slice_tmpl        = MTMD_SLICE_TMPL_MINICPMV_2_6;
            tok_ov_img_start  = {lookup_token("<image>")};
            tok_ov_img_end    = {lookup_token("</image>")};
            tok_sli_img_start = {lookup_token("<slice>")};
            tok_sli_img_end   = {lookup_token("</slice>")};
            tok_row_end       = {lookup_token("\n")};
            tok_row_end_trail = false; // no trailing end-of-row token
            ov_img_first      = true;

        } else if (minicpmv_version != 0) {
            GGML_ASSERT(false && "unsupported minicpmv version");
        } else if (proj == PROJECTOR_TYPE_LLAMA4) {
            // llama 4 format:
            // <|image_start|>
            // (slice) <|tile_x_separator|> (slice) <|tile_x_separator|> ... <|tile_y_separator|>
            // (slice) <|tile_x_separator|> (slice) <|tile_x_separator|> ... <|tile_y_separator|>
            // ... <|tile_y_separator|>   <-- trailing end-of-row token
            // <|image|> (overview)       <-- overview image is last
            // <|image_end|>
            slice_tmpl        = MTMD_SLICE_TMPL_LLAMA4;
            tok_ov_img_start  = {lookup_token("<|image|>")};
            tok_sli_img_mid   = {lookup_token("<|tile_x_separator|>")};
            tok_row_end       = {lookup_token("<|tile_y_separator|>")};
            tok_row_end_trail = true;  // add trailing end-of-row token
            ov_img_first      = false; // overview image is last
        }

        // set boi/eoi
        if (proj == PROJECTOR_TYPE_GEMMA3 || proj == PROJECTOR_TYPE_GEMMA3NV) {
            // <start_of_image> ... (image embeddings) ... <end_of_image>
            img_beg = "<start_of_image>";
            img_end = "<end_of_image>";

        } else if (proj == PROJECTOR_TYPE_IDEFICS3) {
            // https://github.com/huggingface/transformers/blob/a42ba80fa520c784c8f11a973ca9034e5f859b79/src/transformers/models/idefics3/processing_idefics3.py#L192-L215
            slice_tmpl = MTMD_SLICE_TMPL_IDEFICS3;
            tok_ov_img_start = {lookup_token("\n\n"), lookup_token("<fake_token_around_image>"), lookup_token("<global-img>")};
            tok_ov_img_end   = {lookup_token("<fake_token_around_image>")};
            tok_row_end      = {lookup_token("\n")};
            sli_img_start_tmpl = "<fake_token_around_image><row_%d_col_%d>";

        } else if (proj == PROJECTOR_TYPE_PIXTRAL) {
            // https://github.com/huggingface/transformers/blob/1cd110c6cb6a6237614130c470e9a902dbc1a4bd/docs/source/en/model_doc/pixtral.md
            img_end = "[IMG_END]";

        } else if (proj == PROJECTOR_TYPE_QWEN2VL || proj == PROJECTOR_TYPE_QWEN25VL || proj == PROJECTOR_TYPE_QWEN3VL || proj == PROJECTOR_TYPE_YOUTUVL) {
            // <|vision_start|> ... (image embeddings) ... <|vision_end|>
            img_beg = "<|vision_start|>";
            img_end = "<|vision_end|>";

        } else if (proj == PROJECTOR_TYPE_LLAMA4) {
            // (more details in mtmd_context constructor)
            img_beg = "<|image_start|>";
            img_end = "<|image_end|>";
            LOG_WRN("%s: llama 4 vision is known to have degraded quality:\n"
                    "    https://github.com/ggml-org/llama.cpp/pull/13282\n", __func__);

        } else if (proj == PROJECTOR_TYPE_INTERNVL) {
            // <img> ... (image embeddings) ... </img>
            img_beg = "<img>";
            img_end = "</img>";

        } else if (proj == PROJECTOR_TYPE_LIGHTONOCR) {
            // <|im_start|> ... (image embeddings) ... <|im_end|>
            img_beg = "<|im_start|>";
            img_end = "<|im_end|>";

        } else if (proj == PROJECTOR_TYPE_LFM2) {
            // multi-tile:
            // <|image_start|>
            // <|img_row_1_col_1|> (tile) <|img_row_1_col_2|> (tile) ...
            // <|img_thumbnail|> (thumbnail)
            // <|image_end|>
            // single-tile:
            // <|image_start|> (image) <|image_end|>
            img_beg = "<|image_start|>";
            img_end = "<|image_end|>";
            slice_tmpl = MTMD_SLICE_TMPL_LFM2;
            sli_img_start_tmpl = "<|img_row_%d_col_%d|>";
            tok_ov_img_start = {lookup_token("<|img_thumbnail|>")};
            ov_img_first = false;
        } else if (proj == PROJECTOR_TYPE_GLM4V) {
            img_beg = "<|begin_of_image|>";
            img_end = "<|end_of_image|>";

        }
    }

    // configure the audio preprocessor and per-model begin/end marker strings
    void init_audio() {
        GGML_ASSERT(ctx_a != nullptr);
        projector_type proj = clip_get_projector_type(ctx_a);

        LOG_WRN("%s: audio input is in experimental stage and may have reduced quality:\n"
                "    https://github.com/ggml-org/llama.cpp/discussions/13759\n", __func__);

        // set preprocessor
        switch (proj) {
            case PROJECTOR_TYPE_QWEN2A:
            case PROJECTOR_TYPE_QWEN25O:
            case PROJECTOR_TYPE_ULTRAVOX:
            case PROJECTOR_TYPE_VOXTRAL:
            case PROJECTOR_TYPE_GLMA:
            case PROJECTOR_TYPE_MUSIC_FLAMINGO:
                audio_preproc = std::make_unique<mtmd_audio_preprocessor_whisper>(ctx_a);
                break;
            case PROJECTOR_TYPE_LFM2A:
                audio_preproc = std::make_unique<mtmd_audio_preprocessor_conformer>(ctx_a);
                break;
            default:
                GGML_ABORT("unsupported audio projector type");
        }

        // initialize audio preprocessor
        audio_preproc->initialize();

        // set special tokens
        if (proj == PROJECTOR_TYPE_QWEN2A) {
            // <|audio_bos|> ... (embeddings) ... <|audio_eos|>
            aud_beg = "<|audio_bos|>";
            aud_end = "<|audio_eos|>";

        } else if (proj == PROJECTOR_TYPE_ULTRAVOX) {
            // [BEGIN_AUDIO] ... (embeddings) ...
            aud_beg = "[BEGIN_AUDIO]";

        } else if (proj == PROJECTOR_TYPE_MUSIC_FLAMINGO) {
            // <sound> ... (embeddings) ...
            aud_beg = "<sound>";
        }
    }

    // get clip ctx based on chunk type
    clip_ctx * get_clip_ctx(const mtmd_input_chunk * chunk) const {
        if (chunk->type == MTMD_INPUT_CHUNK_TYPE_IMAGE) {
            return ctx_v;
        } else if (chunk->type == MTMD_INPUT_CHUNK_TYPE_AUDIO) {
            return ctx_a;
        }
        GGML_ABORT("unknown chunk type");
    }

    // projector type of the vision context, or UNKNOWN when vision is absent
    projector_type proj_type_v() const {
        return ctx_v ? clip_get_projector_type(ctx_v) : PROJECTOR_TYPE_UNKNOWN;
    }

    // projector type of the audio context, or UNKNOWN when audio is absent
    projector_type proj_type_a() const {
        return ctx_a ? clip_get_projector_type(ctx_a) : PROJECTOR_TYPE_UNKNOWN;
    }

    ~mtmd_context() {
        clip_free(ctx_a);
        clip_free(ctx_v);
    }

private:
    // linear scan of the vocab for a token whose piece matches token_text exactly;
    // returns LLAMA_TOKEN_NULL when not found (callers store the result as-is)
    llama_token lookup_token(const std::string & token_text) {
        const llama_vocab * vocab = llama_model_get_vocab(text_model);
        const int n_vocab = llama_vocab_n_tokens(vocab);
        for (int i = 0; i < n_vocab; i++) {
            if (token_to_piece(vocab, i, true) == token_text) {
                return i;
            }
        }
        return LLAMA_TOKEN_NULL;
    }

    // detokenize a single token; retries with the exact size when the first
    // (SSO-capacity-sized) buffer is too small
    std::string token_to_piece(const llama_vocab * vocab, llama_token token, bool special) {
        std::string piece;
        piece.resize(piece.capacity()); // using string internal cache, 15 bytes + '\n'
        const int n_chars = llama_token_to_piece(vocab, token, &piece[0], piece.size(), 0, special);
        if (n_chars < 0) {
            // negative return is the required buffer size
            piece.resize(-n_chars);
            int check = llama_token_to_piece(vocab, token, &piece[0], piece.size(), 0, special);
            GGML_ASSERT(check == -n_chars);
        } else {
            piece.resize(n_chars);
        }
        return piece;
    }
};

// public constructor wrapper: converts constructor exceptions into nullptr
mtmd_context * mtmd_init_from_file(const char * mmproj_fname,
        const struct llama_model * text_model,
        const struct mtmd_context_params ctx_params) {
    try {
        return new mtmd_context(mmproj_fname, text_model, ctx_params);
    } catch (const std::exception & e) {
        LOG_ERR("%s: error: %s\n", __func__, e.what());
        return nullptr;
    }
}

void mtmd_free(mtmd_context * ctx) {
    delete ctx;
}

// stateful helper that splits input text on the media marker and interleaves
// text chunks with preprocessed image/audio chunks
struct mtmd_tokenizer {
    mtmd_context * ctx;
    std::vector<const mtmd_bitmap *> bitmaps;

    std::string
input_text;
    bool add_special;
    bool parse_special;
    const llama_vocab * vocab;

    mtmd_input_chunks cur; // chunks accumulated so far

    mtmd_tokenizer(mtmd_context * ctx,
            const mtmd_input_text * text,
            const mtmd_bitmap ** bitmaps,
            size_t n_bitmaps) : ctx(ctx), bitmaps(bitmaps, bitmaps + n_bitmaps) {
        add_special   = text->add_special;
        parse_special = text->parse_special;
        input_text    = text->text;
        vocab = llama_model_get_vocab(ctx->text_model);

        // for compatibility, we convert image marker to media marker
        string_replace_all(input_text, MTMD_DEFAULT_IMAGE_MARKER, ctx->media_marker);
    }

    // split input_text on the media marker, consume one bitmap per marker, and
    // produce the interleaved chunk list; returns 0 on success, non-zero on error
    int32_t tokenize(mtmd_input_chunks * output) {
        cur.entries.clear();
        std::vector<std::string> parts = split_text(input_text, ctx->media_marker);
        size_t i_bm = 0; // index of the current bitmap
        for (auto & part : parts) {
            if (part == ctx->media_marker) {
                // this is a marker, we should add the next bitmap
                if (i_bm >= bitmaps.size()) {
                    LOG_ERR("%s: error: number of bitmaps (%zu) does not match number of markers (%zu)\n",
                            __func__, bitmaps.size(), parts.size() - 1);
                    return 1;
                }
                const mtmd_bitmap * bitmap = bitmaps[i_bm++];
                int32_t res = add_media(bitmap);
                if (res != 0) {
                    return res;
                }
            } else {
                // this is a text part, we should add it as text
                add_text(part, parse_special);
            }
        }

        if (add_special && llama_vocab_get_add_bos(vocab)) {
            // if first chunk is text, we add BOS token to first text chunk
            // otherwise, create a new text chunk with BOS token
            if (!cur.entries.empty() && cur.entries[0].type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
                // add BOS token to the beginning of first text chunk
                cur.entries[0].tokens_text.insert(cur.entries[0].tokens_text.begin(), llama_vocab_bos(vocab));
            } else {
                // create a new text chunk with BOS token at the beginning
                mtmd_input_chunk bos_chunk{
                    MTMD_INPUT_CHUNK_TYPE_TEXT,
                    {llama_vocab_bos(vocab)},
                    nullptr, // image tokens
                    nullptr, // audio tokens
                };
                cur.entries.insert(cur.entries.begin(), std::move(bos_chunk));
            }
        }

        if (add_special && llama_vocab_get_add_eos(vocab)) {
            // if last chunk is text, we add EOS token to it
            add_text({llama_vocab_eos(vocab)});
        }

        if (i_bm != bitmaps.size()) {
            // leftover bitmaps mean the text had fewer markers than media supplied
            LOG_ERR("%s: error: number of bitmaps (%zu) does not match number of markers (%zu)\n",
                    __func__, bitmaps.size(), parts.size() - 1);
            return 1;
        }

        *output = std::move(cur);

        return 0;
    }

    // tokenize txt and append the tokens to the current chunk list
    void add_text(const std::string & txt, bool parse_special) {
        LOG_DBG("%s: %s\n", __func__, txt.c_str());
        auto tokens = mtmd_tokenize_text_internal(vocab, txt, /* add_special */ false, parse_special);
        add_text(tokens);
    }

    // append raw tokens, merging into the trailing text chunk when possible
    void add_text(const std::vector<llama_token> & tokens) {
        if (tokens.empty()) {
            return;
        }
        // if last entry is also a text chunk, add tokens to it instead of creating new chunk
        if (!cur.entries.empty() && cur.entries.back().type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
            cur.entries.back().tokens_text.insert(
                cur.entries.back().tokens_text.end(),
                tokens.begin(),
                tokens.end());
        } else {
            mtmd_input_chunk chunk{
                MTMD_INPUT_CHUNK_TYPE_TEXT,
                tokens,
                nullptr, // image tokens
                nullptr, // audio tokens
            };
            cur.entries.emplace_back(std::move(chunk));
        }
    }

    // preprocess one bitmap (image or audio) into chunk(s), surrounded by the
    // model's begin/end markers; returns 0 on success, 2 on preprocessing errors
    int32_t add_media(const mtmd_bitmap * bitmap) {
        if (!bitmap->is_audio) {
            // handle image

            if (!ctx->ctx_v) {
                LOG_ERR("%s: error: model does not support vision input\n", __func__);
                return 2;
            }

            if (!ctx->img_beg.empty()) {
                add_text(ctx->img_beg, true); // add image begin token
            }

            // convert mtmd_bitmap to clip_image_u8
            clip_image_u8_ptr img_u8(clip_image_u8_init());
            img_u8->nx = bitmap->nx;
            img_u8->ny = bitmap->ny;
            img_u8->buf.resize(bitmap->data.size());
            std::memcpy(img_u8->buf.data(), bitmap->data.data(), img_u8->nx * img_u8->ny * 3);

            // preprocess image
            clip_image_f32_batch batch_f32;
            bool ok = clip_image_preprocess(ctx->ctx_v, img_u8.get(), &batch_f32);
            if (!ok) {
                LOG_ERR("Unable to preprocess image\n");
                return 2;
            }

            // handle llava-uhd style preprocessing
            const bool has_tiling_grid = batch_f32.grid_x > 0 && batch_f32.grid_y > 0;
            if (
                ctx->slice_tmpl == MTMD_SLICE_TMPL_MINICPMV_2_5
                || ctx->slice_tmpl == MTMD_SLICE_TMPL_MINICPMV_2_6
                || ctx->slice_tmpl == MTMD_SLICE_TMPL_LLAMA4
                || ctx->slice_tmpl == MTMD_SLICE_TMPL_IDEFICS3
                || (ctx->slice_tmpl == MTMD_SLICE_TMPL_LFM2 && has_tiling_grid)
            ) {
                const int n_col = batch_f32.grid_x;
                const int n_row = batch_f32.grid_y;
                // split batch into chunks of single images
                // NOTE: batch_f32 will be invalidated after this call
                auto chunks = split_batch_to_chunk(std::move(batch_f32), bitmap->id);
                GGML_ASSERT(chunks.size() > 0);

                // first entry is always the overview image
                auto ov_chunk = std::move(chunks.front());
                chunks.erase(chunks.begin());

                // add overview image (first)
                if (ctx->ov_img_first) {
                    add_text(ctx->tok_ov_img_start);
                    cur.entries.emplace_back(std::move(ov_chunk));
                    add_text(ctx->tok_ov_img_end);
                }

                // add slices (or tiles)
                if (!chunks.empty()) {
                    GGML_ASSERT((int)chunks.size() == n_row * n_col);
                    add_text(ctx->tok_slices_start);
                    for (int y = 0; y < n_row; y++) {
                        for (int x = 0; x < n_col; x++) {
                            const bool is_last_in_row = (x == n_col - 1);
                            if (!ctx->tok_sli_img_start.empty()) {
                                add_text(ctx->tok_sli_img_start);
                            } else if (!ctx->sli_img_start_tmpl.empty()) {
                                // If using a template to precede a slice image
                                // (size probe with a null buffer, then format)
                                const size_t sz = std::snprintf(nullptr, 0, ctx->sli_img_start_tmpl.c_str(), y+1, x+1) + 1;
                                std::unique_ptr<char[]> buf(new char[sz]);
                                std::snprintf(buf.get(), sz, ctx->sli_img_start_tmpl.c_str(), y+1, x+1);
                                add_text(std::string(buf.get(), buf.get() + sz - 1), true);
                            }
                            cur.entries.emplace_back(std::move(chunks[y * n_col + x]));
                            add_text(ctx->tok_sli_img_end);
                            if (!is_last_in_row) {
                                add_text(ctx->tok_sli_img_mid);
                            }
                        }
                        if ((y != n_row - 1 || ctx->tok_row_end_trail)) {
                            add_text(ctx->tok_row_end);
                        }
                    }
                    add_text(ctx->tok_slices_end);
                }

                // add overview image (last)
                if (!ctx->ov_img_first) {
                    add_text(ctx->tok_ov_img_start);
                    cur.entries.emplace_back(std::move(ov_chunk));
                    add_text(ctx->tok_ov_img_end);
                }

            } else {
                // no slice template: the whole batch becomes a single image chunk
                size_t n_tokens = 0;
                for (const auto & entry : batch_f32.entries) {
                    n_tokens += clip_n_output_tokens(ctx->ctx_v, entry.get());
                }

                mtmd_image_tokens_ptr image_tokens(new mtmd_image_tokens);
                if (mtmd_decode_use_mrope(ctx)) {
                    // for Qwen2VL, we need this information for M-RoPE decoding positions
                    image_tokens->nx = clip_n_output_tokens_x(ctx->ctx_v, batch_f32.entries[0].get());
                    image_tokens->ny = clip_n_output_tokens_y(ctx->ctx_v, batch_f32.entries[0].get());
                    image_tokens->use_mrope_pos = true;
                } else {
                    // other models, we only need the total number of tokens
                    image_tokens->nx = n_tokens;
                    image_tokens->ny = 1;
                }
                image_tokens->batch_f32 = std::move(batch_f32);
                image_tokens->id = bitmap->id; // optional

                LOG_DBG("image_tokens->nx = %d\n", image_tokens->nx);
                LOG_DBG("image_tokens->ny = %d\n", image_tokens->ny);
                LOG_DBG("batch_f32 size = %d\n", (int)image_tokens->batch_f32.entries.size());

                mtmd_input_chunk chunk{
                    MTMD_INPUT_CHUNK_TYPE_IMAGE,
                    {}, // text tokens
                    std::move(image_tokens),
                    nullptr, // audio tokens
                };
                cur.entries.emplace_back(std::move(chunk));
            }

            if (!ctx->img_end.empty()) {
                add_text(ctx->img_end, true); // add image end token
            }

        } else {
            // handle audio

            if (!ctx->ctx_a) {
                LOG_ERR("%s: error: model does not support audio input\n", __func__);
                return 2;
            }

            if (bitmap->data.size() == 0) {
                LOG_ERR("%s: error: empty audio data\n", __func__);
                return 2;
            }

            if (!ctx->aud_beg.empty()) {
                add_text(ctx->aud_beg, true); // add audio begin token
            }

            // preprocess audio
            // bitmap data is reinterpreted as PCM F32 samples (see mtmd_bitmap_init_from_audio)
            std::vector<mtmd_audio_mel> mel_spec_chunks;
            const float * samples = (const float *)bitmap->data.data();
            size_t n_samples = bitmap->data.size() / sizeof(float);
            bool ok = ctx->audio_preproc->preprocess(samples, n_samples, mel_spec_chunks);
            if (!ok) {
                LOG_ERR("Unable to preprocess audio\n");
                return 2;
            }

            // consider each mel_spec as a separate audio chunk
            // TODO: maybe support batching, but this may come with memory cost
            for (auto & mel_spec : mel_spec_chunks) {
                clip_image_f32_ptr mel_f32(clip_image_f32_init());
                mel_f32->nx  = mel_spec.n_len;
                mel_f32->ny  = mel_spec.n_mel;
                mel_f32->buf = std::move(mel_spec.data);
                size_t n_tokens = clip_n_output_tokens(ctx->ctx_a, mel_f32.get());

                clip_image_f32_batch batch_f32;
                batch_f32.is_audio = true;
                batch_f32.entries.push_back(std::move(mel_f32));

                mtmd_audio_tokens_ptr audio_tokens(new mtmd_audio_tokens);
                audio_tokens->n_tokens  = n_tokens;
                audio_tokens->batch_f32 = std::move(batch_f32);
                audio_tokens->id        = bitmap->id; // optional

                LOG_DBG("audio_tokens->n_tokens = %d\n", audio_tokens->n_tokens);

                mtmd_input_chunk chunk{
                    MTMD_INPUT_CHUNK_TYPE_AUDIO,
                    {}, // text tokens
                    nullptr, // image tokens
                    std::move(audio_tokens),
                };
                cur.entries.emplace_back(std::move(chunk));
            }

            if (!ctx->aud_end.empty()) {
                add_text(ctx->aud_end, true); // add audio end token
            }
        }

        return 0;
    }

    // turn each entry of a preprocessed batch into its own single-image chunk;
    // consumes batch_f32 (entries are moved out)
    std::vector<mtmd_input_chunk> split_batch_to_chunk(clip_image_f32_batch && batch_f32, const std::string & id) {
        std::vector<mtmd_input_chunk> chunks;

        for (auto & entry : batch_f32.entries) {
            mtmd_image_tokens_ptr image_tokens(new mtmd_image_tokens);
            image_tokens->nx = clip_n_output_tokens(ctx->ctx_v, entry.get());
            image_tokens->ny = 1;
            image_tokens->batch_f32.entries.push_back(std::move(entry));
            image_tokens->id = id;

            mtmd_input_chunk chunk{
                MTMD_INPUT_CHUNK_TYPE_IMAGE,
                {}, // text tokens
                std::move(image_tokens),
                nullptr, // audio tokens
            };
            chunks.emplace_back(std::move(chunk));
        }

        return chunks;
    }

    // for example: "a <__media__> b <__media__> c" --> "a", "<__media__>", "b", "<__media__>", "c"
    static std::vector<std::string> split_text(const std::string & input, const std::string &
delimiter) {
        std::vector<std::string> result;
        if (input.empty()) {
            return result;
        }
        size_t start = 0;
        size_t pos = 0;
        while ((pos = input.find(delimiter, start)) != std::string::npos) {
            if (pos > start) {
                result.push_back(input.substr(start, pos - start));
            }
            result.push_back(delimiter); // keep the delimiter itself as a part
            start = pos + delimiter.length();
        }
        if (start < input.length()) {
            result.push_back(input.substr(start));
        }
        return result;
    }

    // copied from common_tokenize
    static std::vector<llama_token> mtmd_tokenize_text_internal(
        const struct llama_vocab * vocab,
               const std::string & text,
                             bool add_special,
                             bool parse_special) {
        // upper limit for the number of tokens
        int n_tokens = text.length() + 2 * add_special;
        std::vector<llama_token> result(n_tokens);
        n_tokens = llama_tokenize(vocab, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
        if (n_tokens < 0) {
            // negative return is the required token count; retry with exact size
            result.resize(-n_tokens);
            int check = llama_tokenize(vocab, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
            GGML_ASSERT(check == -n_tokens);
        } else {
            result.resize(n_tokens);
        }
        return result;
    }
};

// public entry point: tokenize text + media into an interleaved chunk list
int32_t mtmd_tokenize(mtmd_context * ctx,
        mtmd_input_chunks * output,
        const mtmd_input_text * text,
        const mtmd_bitmap ** bitmaps,
        size_t n_bitmaps) {
    mtmd_tokenizer tokenizer(ctx, text, bitmaps, n_bitmaps);
    return tokenizer.tokenize(output);
}

// encode one chunk into ctx->image_embd_v; text chunks are a no-op,
// image chunks delegate to mtmd_encode(); returns 0 on success
int32_t mtmd_encode_chunk(mtmd_context * ctx, const mtmd_input_chunk * chunk) {
    if (chunk->type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
        LOG_WRN("mtmd_encode_chunk has no effect for text chunks\n");
        return 0;
    } else if (chunk->type == MTMD_INPUT_CHUNK_TYPE_IMAGE) {
        if (!ctx->ctx_v) {
            LOG_ERR("%s: model does not support vision input\n", __func__);
            return 1;
        }
        return mtmd_encode(ctx, chunk->tokens_image.get());
    } else if (chunk->type == MTMD_INPUT_CHUNK_TYPE_AUDIO) {
        if (!ctx->ctx_a) {
            LOG_ERR("%s: model does not support audio input\n", __func__);
            return 1;
        }
        // n_embd_text == clip mmproj n_embd (validated in the mtmd_context ctor)
        int n_mmproj_embd = ctx->n_embd_text;
        ctx->image_embd_v.resize(chunk->tokens_audio->n_tokens * n_mmproj_embd);
        bool ok = clip_image_batch_encode(
            ctx->ctx_a,
            ctx->n_threads,
            &chunk->tokens_audio->batch_f32,
            ctx->image_embd_v.data());
        return ok ? 0 : 1;
    }

    LOG_ERR("%s: unknown chunk type %d\n", __func__, (int)chunk->type);
    return 1;
}

// encode image tokens into ctx->image_embd_v; returns 0 on success
int32_t mtmd_encode(mtmd_context * ctx, const mtmd_image_tokens * image_tokens) {
    clip_ctx * ctx_clip = ctx->ctx_v;
    if (!ctx_clip) {
        LOG_ERR("%s: this API does not support non-vision input, please use mtmd_encode_chunk instead\n", __func__);
        return 1;
    }
    int n_mmproj_embd = clip_n_mmproj_embd(ctx_clip);
    ctx->image_embd_v.resize(image_tokens->n_tokens() * n_mmproj_embd);
    bool ok = false;

    if (clip_is_llava(ctx_clip)
            || clip_is_minicpmv(ctx_clip)
            || clip_is_glm(ctx_clip)) {
        // TODO @ngxson : llava does not support batched encoding ; this should be fixed inside clip_image_batch_encode()
        // NOTE(review): `ok` is overwritten each iteration, so only the LAST
        // entry's encode result is reported — an earlier failure followed by a
        // success would be silently masked; confirm whether this should abort early
        const auto & entries = image_tokens->batch_f32.entries;
        for (size_t i = 0; i < entries.size(); i++) {
            int n_tokens_per_image = clip_n_output_tokens(ctx_clip, entries[i].get());
            ok = clip_image_encode(
                ctx_clip,
                ctx->n_threads,
                entries[i].get(),
                ctx->image_embd_v.data() + i*n_mmproj_embd*n_tokens_per_image);
        }
    } else {
        ok = clip_image_batch_encode(
            ctx_clip,
            ctx->n_threads,
            &image_tokens->batch_f32,
            ctx->image_embd_v.data());
    }

    return ok ?
 0 : 1;
}

// pointer to the embeddings produced by the most recent encode call
float * mtmd_get_output_embd(mtmd_context * ctx) {
    return ctx->image_embd_v.data();
}

bool mtmd_decode_use_non_causal(mtmd_context * ctx) {
    switch (ctx->proj_type_v()) {
        case PROJECTOR_TYPE_GEMMA3:
            return true;
        default:
            return false;
    }
}

bool mtmd_decode_use_mrope(mtmd_context * ctx) {
    switch (ctx->proj_type_v()) {
        case PROJECTOR_TYPE_QWEN2VL:
        case PROJECTOR_TYPE_QWEN25VL:
        case PROJECTOR_TYPE_QWEN3VL:
        case PROJECTOR_TYPE_GLM4V:
            return true;
        default:
            return false;
    }
}

bool mtmd_support_vision(mtmd_context * ctx) {
    return ctx->ctx_v != nullptr;
}

bool mtmd_support_audio(mtmd_context * ctx) {
    return ctx->ctx_a != nullptr;
}

// sample rate in Hz from the audio hparams, or -1 when audio is unsupported
int mtmd_get_audio_bitrate(mtmd_context * ctx) {
    if (!ctx->ctx_a) {
        return -1;
    }
    return clip_get_hparams(ctx->ctx_a)->audio_sample_rate;
}

//
// public API functions
//

// mtmd_bitmap

// copies nx*ny*3 RGB bytes from `data` into a new bitmap
mtmd_bitmap * mtmd_bitmap_init(uint32_t nx,
                               uint32_t ny,
                               const unsigned char * data) {
    mtmd_bitmap * bitmap = new mtmd_bitmap;
    bitmap->nx = nx;
    bitmap->ny = ny;
    size_t data_size = (size_t)nx * ny * 3;
    bitmap->data.resize(data_size);
    std::memcpy(bitmap->data.data(), data, data_size);
    return bitmap;
}

// copies n_samples PCM F32 samples from `data` into a new audio bitmap
mtmd_bitmap * mtmd_bitmap_init_from_audio(size_t n_samples,
                                          const float * data) {
    mtmd_bitmap * bitmap = new mtmd_bitmap;
    // NOTE(review): nx is uint32_t, so n_samples > UINT32_MAX would be
    // truncated here; data itself is stored with the full size_t length
    bitmap->nx = n_samples;
    bitmap->ny = 1;
    bitmap->is_audio = true;
    size_t data_size = n_samples * sizeof(float);
    bitmap->data.resize(data_size);
    std::memcpy(bitmap->data.data(), data, data_size);
    return bitmap;
}

uint32_t mtmd_bitmap_get_nx(const mtmd_bitmap * bitmap) {
    return bitmap->nx;
}

uint32_t mtmd_bitmap_get_ny(const mtmd_bitmap * bitmap) {
    return bitmap->ny;
}

const unsigned char * mtmd_bitmap_get_data(const mtmd_bitmap * bitmap) {
    return bitmap->data.data();
}

size_t mtmd_bitmap_get_n_bytes(const mtmd_bitmap * bitmap) {
    return bitmap->data.size();
}

bool mtmd_bitmap_is_audio(const mtmd_bitmap * bitmap) {
    return bitmap->is_audio;
}

const char * mtmd_bitmap_get_id(const mtmd_bitmap * bitmap) {
    return bitmap->id.c_str();
}

// nullptr id clears the stored id
void mtmd_bitmap_set_id(mtmd_bitmap * bitmap, const char * id) {
    if (id) {
        bitmap->id = std::string(id);
    } else {
        bitmap->id.clear();
    }
}

void mtmd_bitmap_free(mtmd_bitmap * bitmap) {
    if (bitmap) {
        delete bitmap;
    }
}

// mtmd_input_chunks

mtmd_input_chunks * mtmd_input_chunks_init() {
    return new mtmd_input_chunks;
}

size_t mtmd_input_chunks_size(const mtmd_input_chunks * chunks) {
    return chunks->entries.size();
}

// bounds-checked element access; returns nullptr when idx is out of range
const mtmd_input_chunk * mtmd_input_chunks_get(const mtmd_input_chunks * chunks, size_t idx) {
    if (idx >= chunks->entries.size()) {
        return nullptr;
    }
    return &chunks->entries[idx];
}

void mtmd_input_chunks_free(mtmd_input_chunks * chunks) {
    if (chunks) {
        delete chunks;
    }
}

// mtmd_input_chunk

enum mtmd_input_chunk_type mtmd_input_chunk_get_type(const mtmd_input_chunk * chunk) {
    return chunk->type;
}

// returns the text tokens and writes their count; nullptr/0 for non-text chunks
const llama_token * mtmd_input_chunk_get_tokens_text(const mtmd_input_chunk * chunk, size_t * n_tokens_output) {
    if (chunk->type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
        *n_tokens_output = chunk->tokens_text.size();
        return chunk->tokens_text.data();
    }
    *n_tokens_output = 0;
    return nullptr;
}

const mtmd_image_tokens * mtmd_input_chunk_get_tokens_image(const mtmd_input_chunk * chunk) {
    if (chunk->type == MTMD_INPUT_CHUNK_TYPE_IMAGE) {
        return chunk->tokens_image.get();
    }
    return nullptr;
}

size_t mtmd_input_chunk_get_n_tokens(const mtmd_input_chunk * chunk) {
    if (chunk->type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
        return chunk->tokens_text.size();
    } else if (chunk->type == MTMD_INPUT_CHUNK_TYPE_IMAGE) {
        return mtmd_image_tokens_get_n_tokens(chunk->tokens_image.get());
    } else if (chunk->type == MTMD_INPUT_CHUNK_TYPE_AUDIO) {
        return chunk->tokens_audio->n_tokens;
    } else {
        GGML_ABORT("invalid chunk type");
    }
}

// number of decode positions taken by the chunk (differs from n_tokens for M-RoPE images)
llama_pos mtmd_input_chunk_get_n_pos(const mtmd_input_chunk * chunk) {
    if (chunk->type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
        return chunk->tokens_text.size();
    } else if (chunk->type == MTMD_INPUT_CHUNK_TYPE_IMAGE) {
        return mtmd_image_tokens_get_n_pos(chunk->tokens_image.get());
    } else if (chunk->type == MTMD_INPUT_CHUNK_TYPE_AUDIO) {
        return chunk->tokens_audio->n_tokens;
    } else {
        GGML_ABORT("invalid chunk type");
    }
}

// media id of the chunk, or nullptr for text chunks
const char * mtmd_input_chunk_get_id(const mtmd_input_chunk * chunk) {
    if (chunk->type == MTMD_INPUT_CHUNK_TYPE_IMAGE) {
        return chunk->tokens_image->id.c_str();
    } else if (chunk->type == MTMD_INPUT_CHUNK_TYPE_AUDIO) {
        return chunk->tokens_audio->id.c_str();
    }
    return nullptr;
}

// deep copy of a chunk (image/audio payloads are cloned); caller owns the result
mtmd_input_chunk * mtmd_input_chunk_copy(const mtmd_input_chunk * chunk) {
    mtmd_input_chunk * copy = new mtmd_input_chunk{
        chunk->type,
        chunk->tokens_text,
        nullptr,
        nullptr,
    };
    if (chunk->tokens_image) {
        // copy the image tokens
        copy->tokens_image = mtmd_image_tokens_ptr(new mtmd_image_tokens());
        *copy->tokens_image = chunk->tokens_image->clone();
    }
    if (chunk->tokens_audio) {
        // copy the audio tokens
        copy->tokens_audio = mtmd_audio_tokens_ptr(new mtmd_audio_tokens());
        *copy->tokens_audio = chunk->tokens_audio->clone();
    }
    return copy;
}

void mtmd_input_chunk_free(mtmd_input_chunk * chunk) {
    if (chunk) {
        delete chunk;
    }
}

// mtmd_image_tokens

size_t mtmd_image_tokens_get_n_tokens(const mtmd_image_tokens * image_tokens) {
    return image_tokens->n_tokens();
}

size_t mtmd_image_tokens_get_nx(const mtmd_image_tokens * image_tokens) {
    return image_tokens->nx;
}

size_t mtmd_image_tokens_get_ny(const mtmd_image_tokens * image_tokens) {
    return image_tokens->ny;
}

const char * mtmd_image_tokens_get_id(const mtmd_image_tokens * image_tokens) {
    return image_tokens->id.c_str();
}

llama_pos mtmd_image_tokens_get_n_pos(const mtmd_image_tokens * image_tokens) {
    if (image_tokens->use_mrope_pos) {
        //
 for M-RoPE, temporal dimension = max(t,h,w)
        // t is omitted as we don't support video input
        return std::max(image_tokens->nx, image_tokens->ny);
    }
    return image_tokens->n_tokens();
}

// test function

// builds a small fixed fixture (one text chunk + one 4x4 image chunk with
// empty batch entries) for exercising the chunk API without a model
mtmd_input_chunks * mtmd_test_create_input_chunks() {
    mtmd_input_chunks * chunks = mtmd_input_chunks_init();
    if (!chunks) {
        return nullptr;
    }

    // create a text chunk
    std::vector<llama_token> tokens_text = { 1, 2, 3, 4, 5 };
    mtmd_input_chunk chunk_text{
        MTMD_INPUT_CHUNK_TYPE_TEXT,
        std::move(tokens_text),
        nullptr, // image tokens
        nullptr, // audio tokens
    };
    chunks->entries.emplace_back(std::move(chunk_text));

    // create an image chunk
    mtmd_image_tokens_ptr image_tokens(new mtmd_image_tokens);
    image_tokens->nx = 4;
    image_tokens->ny = 4;
    image_tokens->batch_f32.entries.resize(16);
    image_tokens->id = "image_1";
    mtmd_input_chunk chunk_image{
        MTMD_INPUT_CHUNK_TYPE_IMAGE,
        {}, // text tokens
        std::move(image_tokens),
        nullptr, // audio tokens
    };
    chunks->entries.emplace_back(std::move(chunk_image));

    return chunks;
}

// install a custom log callback; nullptr restores the clip default
void mtmd_log_set(ggml_log_callback log_callback, void * user_data) {
    g_logger_state.log_callback = log_callback ? log_callback : clip_log_callback_default;
    g_logger_state.log_callback_user_data = user_data;
}
diff --git a/llama.cpp/tools/mtmd/mtmd.h b/llama.cpp/tools/mtmd/mtmd.h
new file mode 100644
index 0000000..ef25d32
--- /dev/null
+++ b/llama.cpp/tools/mtmd/mtmd.h
@@ -0,0 +1,319 @@
#ifndef MTMD_H
#define MTMD_H

#include "ggml.h"
#include "llama.h"

#include <stddef.h>
#include <stdint.h>
#include <stdbool.h>

#ifdef __cplusplus
#include <string>
#include <vector>
#include <cinttypes>
#include <memory>
#endif

/**
 * libmtmd: A library for multimodal support in llama.cpp.
 *
 * WARNING: This API is experimental and subject to many BREAKING CHANGES.
 * Issues related to API usage may receive lower priority support.
 *
 * For the usage, see an example in mtmd-cli.cpp
 *
 * For contributors:
 * - Make sure the C API is aligned with the libllama C API (as in llama.h)
 * - Do not include model name (e.g., qwen, gemma) in the API, use generic terms instead
 * - Keep the API minimal, do not expose internal details unless necessary
 *
 * IMPORTANT: The mtmd module does NOT accept pull requests that are fully or predominantly AI-generated.
 * We encourage human contributors to ensure the quality and reliability of the codebase.
 */

#ifdef LLAMA_SHARED
#    if defined(_WIN32) && !defined(__MINGW32__)
#        ifdef LLAMA_BUILD
#            define MTMD_API __declspec(dllexport)
#        else
#            define MTMD_API __declspec(dllimport)
#        endif
#    else
#        define MTMD_API __attribute__ ((visibility ("default")))
#    endif
#else
#    define MTMD_API
#endif

// deprecated marker, use mtmd_default_marker() instead
#define MTMD_DEFAULT_IMAGE_MARKER "<__image__>"

#ifdef __cplusplus
extern "C" {
#endif

enum mtmd_input_chunk_type {
    MTMD_INPUT_CHUNK_TYPE_TEXT,
    MTMD_INPUT_CHUNK_TYPE_IMAGE,
    MTMD_INPUT_CHUNK_TYPE_AUDIO,
};

// opaque types
struct mtmd_context;
struct mtmd_bitmap;
struct mtmd_image_tokens;
struct mtmd_input_chunk;
struct mtmd_input_chunks;

struct mtmd_input_text {
    const char * text;
    bool add_special;   // add BOS/EOS where the vocab requires them
    bool parse_special; // parse special tokens embedded in the text
};

//
// C API
//

typedef struct mtmd_context mtmd_context;
typedef struct mtmd_bitmap mtmd_bitmap;
typedef struct mtmd_image_tokens mtmd_image_tokens;
typedef struct mtmd_input_chunk mtmd_input_chunk;
typedef struct mtmd_input_chunks mtmd_input_chunks;
typedef struct mtmd_input_text mtmd_input_text;

struct mtmd_context_params {
    bool use_gpu;
    bool print_timings;
    int n_threads;
    const char * image_marker; // deprecated, use media_marker instead
    const char * media_marker;
    enum llama_flash_attn_type flash_attn_type;
    bool warmup; // whether to run a warmup encode pass after initialization

    // limit number of image tokens, only for vision models with dynamic resolution
    int image_min_tokens; // minimum number of tokens for image input (default: read from metadata)
    int image_max_tokens; // maximum number of tokens for image input (default: read from metadata)

    // callback function passed over to mtmd proper
    ggml_backend_sched_eval_callback cb_eval;
    void * cb_eval_user_data;
};

MTMD_API const char * mtmd_default_marker(void);

MTMD_API struct mtmd_context_params mtmd_context_params_default(void);

// initialize the mtmd context
// return nullptr on failure
MTMD_API mtmd_context * mtmd_init_from_file(const char * mmproj_fname,
                                            const struct llama_model * text_model,
                                            const struct mtmd_context_params ctx_params);

MTMD_API void mtmd_free(mtmd_context * ctx);

// whether we need to set non-causal mask before llama_decode
MTMD_API bool mtmd_decode_use_non_causal(mtmd_context * ctx);

// whether the current model uses M-RoPE for llama_decode
MTMD_API bool mtmd_decode_use_mrope(mtmd_context * ctx);

// whether the current model supports vision input
MTMD_API bool mtmd_support_vision(mtmd_context * ctx);

// whether the current model supports audio input
MTMD_API bool mtmd_support_audio(mtmd_context * ctx);

// get audio bitrate in Hz, for example 16000 for Whisper
// return -1 if audio is not supported
MTMD_API int mtmd_get_audio_bitrate(mtmd_context * ctx);

// mtmd_bitmap
//
// if bitmap is image:
//     length of data must be nx * ny * 3
//     the data is in RGBRGBRGB...
format +// if bitmap is audio: +// length of data must be n_samples * sizeof(float) +// the data is in float format (PCM F32) +MTMD_API mtmd_bitmap * mtmd_bitmap_init (uint32_t nx, uint32_t ny, const unsigned char * data); +MTMD_API mtmd_bitmap * mtmd_bitmap_init_from_audio(size_t n_samples, const float * data); +MTMD_API uint32_t mtmd_bitmap_get_nx (const mtmd_bitmap * bitmap); +MTMD_API uint32_t mtmd_bitmap_get_ny (const mtmd_bitmap * bitmap); +MTMD_API const unsigned char * mtmd_bitmap_get_data (const mtmd_bitmap * bitmap); +MTMD_API size_t mtmd_bitmap_get_n_bytes(const mtmd_bitmap * bitmap); +MTMD_API bool mtmd_bitmap_is_audio (const mtmd_bitmap * bitmap); +MTMD_API void mtmd_bitmap_free (mtmd_bitmap * bitmap); +// bitmap ID is optional, but useful for KV cache tracking +// these getters/setters are dedicated functions, so you can for example calculate the hash of the image based on mtmd_bitmap_get_data() +MTMD_API const char * mtmd_bitmap_get_id(const mtmd_bitmap * bitmap); +MTMD_API void mtmd_bitmap_set_id(mtmd_bitmap * bitmap, const char * id); + + +// mtmd_input_chunks +// +// this is simply a list of mtmd_input_chunk +// the elements can only be populated via mtmd_tokenize() +MTMD_API mtmd_input_chunks * mtmd_input_chunks_init(void); +MTMD_API size_t mtmd_input_chunks_size(const mtmd_input_chunks * chunks); +MTMD_API const mtmd_input_chunk * mtmd_input_chunks_get (const mtmd_input_chunks * chunks, size_t idx); +MTMD_API void mtmd_input_chunks_free(mtmd_input_chunks * chunks); + +// mtmd_input_chunk +// +// the instance will be constructed via mtmd_tokenize() +// it will be freed along with mtmd_input_chunks +MTMD_API enum mtmd_input_chunk_type mtmd_input_chunk_get_type (const mtmd_input_chunk * chunk); +MTMD_API const llama_token * mtmd_input_chunk_get_tokens_text (const mtmd_input_chunk * chunk, size_t * n_tokens_output); +MTMD_API const mtmd_image_tokens * mtmd_input_chunk_get_tokens_image(const mtmd_input_chunk * chunk); +MTMD_API size_t 
mtmd_input_chunk_get_n_tokens (const mtmd_input_chunk * chunk); +// returns nullptr for ID on text chunk +MTMD_API const char * mtmd_input_chunk_get_id (const mtmd_input_chunk * chunk); +// number of temporal positions (equals to max(t,h,w) for M-RoPE; equals to n_tokens otherwise) +MTMD_API llama_pos mtmd_input_chunk_get_n_pos (const mtmd_input_chunk * chunk); + +// in case you want to use custom logic to handle the chunk (i.e. KV cache management) +// you can move the chunk ownership to your own code by copying it +// remember to free the chunk when you are done with it +MTMD_API mtmd_input_chunk * mtmd_input_chunk_copy(const mtmd_input_chunk * chunk); +MTMD_API void mtmd_input_chunk_free(mtmd_input_chunk * chunk); + + +// mtmd_image_tokens +// +// the instance will be constructed via mtmd_tokenize() +// it will be freed along with mtmd_input_chunk +MTMD_API size_t mtmd_image_tokens_get_n_tokens(const mtmd_image_tokens * image_tokens); // TODO: deprecate +MTMD_API size_t mtmd_image_tokens_get_nx (const mtmd_image_tokens * image_tokens); +MTMD_API size_t mtmd_image_tokens_get_ny (const mtmd_image_tokens * image_tokens); +MTMD_API const char * mtmd_image_tokens_get_id (const mtmd_image_tokens * image_tokens); // TODO: deprecate +// number of temporal positions (equals to max(t,h,w) for M-RoPE; equals to n_tokens otherwise) +MTMD_API llama_pos mtmd_image_tokens_get_n_pos (const mtmd_image_tokens * image_tokens); // TODO: deprecate + +// tokenize an input text prompt and a list of bitmaps (images/audio) +// the prompt must have the input image marker (default: "<__media__>") in it +// the default marker is defined by mtmd_default_marker() +// the marker will be replaced with the image/audio chunk +// for example: +// "here is an image: <__media__>\ndescribe it in detail." +// this will gives 3 chunks: +// 1. "here is an image: <start_of_image>" +// 2. (image/audio tokens) +// 3. "<end_of_image>\ndescribe it in detail." 
+// number of bitmaps must be equal to the number of markers in the prompt +// this function is thread-safe (shared ctx) +// return values: +// 0 on success +// 1 on number of bitmaps not matching the number of markers +// 2 on image preprocessing error +MTMD_API int32_t mtmd_tokenize(mtmd_context * ctx, + mtmd_input_chunks * output, + const mtmd_input_text * text, + const mtmd_bitmap ** bitmaps, + size_t n_bitmaps); + +// returns 0 on success +// TODO: deprecate +MTMD_API int32_t mtmd_encode(mtmd_context * ctx, + const mtmd_image_tokens * image_tokens); + +// returns 0 on success +MTMD_API int32_t mtmd_encode_chunk(mtmd_context * ctx, + const mtmd_input_chunk * chunk); + +// get output embeddings from the last encode pass +// the reading size (in bytes) is equal to: +// llama_model_n_embd_inp(model) * mtmd_input_chunk_get_n_tokens(chunk) * sizeof(float) +MTMD_API float * mtmd_get_output_embd(mtmd_context * ctx); + +// Set callback for all future logging events. +// If this is not called, or NULL is supplied, everything is output on stderr. 
+MTMD_API void mtmd_log_set(ggml_log_callback log_callback, void * user_data); + +///////////////////////////////////////// + +// test function, to be used in test-mtmd-c-api.c +MTMD_API mtmd_input_chunks * mtmd_test_create_input_chunks(void); + +#ifdef __cplusplus +} // extern "C" +#endif + +// +// C++ wrappers +// + +#ifdef __cplusplus + +namespace mtmd { + +struct mtmd_context_deleter { + void operator()(mtmd_context * val) { mtmd_free(val); } +}; +using context_ptr = std::unique_ptr<mtmd_context, mtmd_context_deleter>; + +struct mtmd_bitmap_deleter { + void operator()(mtmd_bitmap * val) { mtmd_bitmap_free(val); } +}; +using bitmap_ptr = std::unique_ptr<mtmd_bitmap, mtmd_bitmap_deleter>; + +struct mtmd_input_chunks_deleter { + void operator()(mtmd_input_chunks * val) { mtmd_input_chunks_free(val); } +}; +using input_chunks_ptr = std::unique_ptr<mtmd_input_chunks, mtmd_input_chunks_deleter>; + +struct mtmd_input_chunk_deleter { + void operator()(mtmd_input_chunk * val) { mtmd_input_chunk_free(val); } +}; +using input_chunk_ptr = std::unique_ptr<mtmd_input_chunk, mtmd_input_chunk_deleter>; + +struct bitmap { + bitmap_ptr ptr; + bitmap() : ptr(nullptr) {} + bitmap(mtmd_bitmap * bitmap) : ptr(bitmap) {} + bitmap(bitmap && other) noexcept : ptr(std::move(other.ptr)) {} + bitmap(uint32_t nx, uint32_t ny, const unsigned char * data) { + ptr.reset(mtmd_bitmap_init(nx, ny, data)); + } + ~bitmap() = default; + uint32_t nx() const { return mtmd_bitmap_get_nx(ptr.get()); } + uint32_t ny() const { return mtmd_bitmap_get_ny(ptr.get()); } + const unsigned char * data() const { return mtmd_bitmap_get_data(ptr.get()); } + size_t n_bytes() const { return mtmd_bitmap_get_n_bytes(ptr.get()); } + std::string id() const { return mtmd_bitmap_get_id(ptr.get()); } + void set_id(const char * id) const { mtmd_bitmap_set_id(ptr.get(), id); } +}; + +struct bitmaps { + std::vector<bitmap> entries; + ~bitmaps() = default; + // return list of pointers to mtmd_bitmap + // example: + // auto 
bitmaps_c_ptr = bitmaps.c_ptr(); + // int32_t res = mtmd_tokenize(... bitmaps_c_ptr.data(), bitmaps_c_ptr.size()); + std::vector<const mtmd_bitmap *> c_ptr() { + std::vector<const mtmd_bitmap *> res(entries.size()); + for (size_t i = 0; i < entries.size(); i++) { + res[i] = entries[i].ptr.get(); + } + return res; + } +}; + +struct input_chunks { + input_chunks_ptr ptr; + input_chunks() = default; + input_chunks(mtmd_input_chunks * chunks) : ptr(chunks) {} + ~input_chunks() = default; + size_t size() const { return mtmd_input_chunks_size(ptr.get()); } + const mtmd_input_chunk * operator[](size_t idx) const { + return mtmd_input_chunks_get(ptr.get(), idx); + } +}; + +} // namespace mtmd + +#endif + +#endif diff --git a/llama.cpp/tools/mtmd/requirements.txt b/llama.cpp/tools/mtmd/requirements.txt new file mode 100644 index 0000000..0a1f4e8 --- /dev/null +++ b/llama.cpp/tools/mtmd/requirements.txt @@ -0,0 +1,5 @@ +-r ../../requirements/requirements-convert_legacy_llama.txt +--extra-index-url https://download.pytorch.org/whl/cpu +pillow~=11.3.0 +torch~=2.6.0 +torchvision~=0.21.0 diff --git a/llama.cpp/tools/mtmd/test-1.jpeg b/llama.cpp/tools/mtmd/test-1.jpeg Binary files differnew file mode 100644 index 0000000..7fdcaaf --- /dev/null +++ b/llama.cpp/tools/mtmd/test-1.jpeg diff --git a/llama.cpp/tools/mtmd/test-2.mp3 b/llama.cpp/tools/mtmd/test-2.mp3 Binary files differnew file mode 100644 index 0000000..aa9d7ec --- /dev/null +++ b/llama.cpp/tools/mtmd/test-2.mp3 diff --git a/llama.cpp/tools/mtmd/tests.sh b/llama.cpp/tools/mtmd/tests.sh new file mode 100755 index 0000000..012958e --- /dev/null +++ b/llama.cpp/tools/mtmd/tests.sh @@ -0,0 +1,183 @@ +#!/usr/bin/env bash + +# make sure we are in the right directory +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) +cd $SCRIPT_DIR + +#export LLAMA_CACHE="$SCRIPT_DIR/tmp" + +set -eux + +mkdir -p $SCRIPT_DIR/output + +PROJ_ROOT="$SCRIPT_DIR/../.." 
# quote: the checkout path may contain spaces
cd "$PROJ_ROOT"

# Model-size tiers:
#   ./tests.sh       -> small models only
#   ./tests.sh big   -> also include big models (for larger machines)
#   ./tests.sh huge  -> also include big and huge (>32B param) models
RUN_BIG_TESTS=false
if [ "${1:-}" = "big" ]; then
    RUN_BIG_TESTS=true
    echo "Include BIG models..."
fi

RUN_HUGE_TESTS=false
if [ "${1:-}" = "huge" ]; then
    RUN_HUGE_TESTS=true
    RUN_BIG_TESTS=true
    echo "Include BIG and HUGE models..."
fi

###############

# Parallel arrays: one test case per index.
arr_prefix=()      # "[vision]" / "[audio] " label for the summary
arr_hf=()          # HF repo (optionally with :QUANT suffix)
arr_extra_args=()  # extra CLI args, already %q-escaped for the later eval
arr_file=()        # input media file in this directory

# add_test_vision HF_REPO [extra llama-mtmd-cli args...]
add_test_vision() {
    local hf=$1
    shift
    local extra_args=""
    if [ $# -gt 0 ]; then
        # %q-escape so the args survive the eval in the run loop
        extra_args=$(printf " %q" "$@")
    fi
    arr_prefix+=("[vision]")
    arr_hf+=("$hf")
    arr_extra_args+=("$extra_args")
    arr_file+=("test-1.jpeg")
}

# add_test_audio HF_REPO [extra llama-mtmd-cli args...]
add_test_audio() {
    local hf=$1
    shift
    local extra_args=""
    if [ $# -gt 0 ]; then
        # %q-escape so the args survive the eval in the run loop
        extra_args=$(printf " %q" "$@")
    fi
    arr_prefix+=("[audio] ")
    arr_hf+=("$hf")
    arr_extra_args+=("$extra_args")
    arr_file+=("test-2.mp3")
}

add_test_vision "ggml-org/SmolVLM-500M-Instruct-GGUF:Q8_0"
add_test_vision "ggml-org/SmolVLM2-2.2B-Instruct-GGUF:Q4_K_M"
add_test_vision "ggml-org/SmolVLM2-500M-Video-Instruct-GGUF:Q8_0"
add_test_vision "ggml-org/gemma-3-4b-it-GGUF:Q4_K_M"
add_test_vision "THUDM/glm-edge-v-5b-gguf:Q4_K_M" -p "name of the newspaper?<__media__>"
add_test_vision "second-state/Llava-v1.5-7B-GGUF:Q2_K" --chat-template vicuna
add_test_vision "cjpais/llava-1.6-mistral-7b-gguf:Q3_K_M" --chat-template vicuna
add_test_vision "ibm-research/granite-vision-3.2-2b-GGUF:Q4_K_M"
add_test_vision "second-state/MiniCPM-Llama3-V-2_5-GGUF:Q2_K" # model from openbmb is corrupted
add_test_vision "openbmb/MiniCPM-V-2_6-gguf:Q2_K"
add_test_vision "openbmb/MiniCPM-o-2_6-gguf:Q4_0"
add_test_vision "bartowski/Qwen2-VL-2B-Instruct-GGUF:Q4_K_M"
add_test_vision "ggml-org/Qwen2.5-VL-3B-Instruct-GGUF:Q4_K_M"
add_test_vision "ggml-org/InternVL2_5-1B-GGUF:Q8_0"
add_test_vision "ggml-org/InternVL3-1B-Instruct-GGUF:Q8_0"
add_test_vision "ggml-org/Qwen2.5-Omni-3B-GGUF:Q4_K_M"
add_test_vision "ggml-org/LFM2-VL-450M-GGUF:Q8_0"
add_test_vision "ggml-org/granite-docling-258M-GGUF:Q8_0"
add_test_vision "ggml-org/LightOnOCR-1B-1025-GGUF:Q8_0"

add_test_audio  "ggml-org/ultravox-v0_5-llama-3_2-1b-GGUF:Q8_0"
add_test_audio  "ggml-org/Qwen2.5-Omni-3B-GGUF:Q4_K_M"
add_test_audio  "ggml-org/Voxtral-Mini-3B-2507-GGUF:Q4_K_M"
add_test_audio  "ggml-org/LFM2-Audio-1.5B-GGUF:Q8_0"

# to test the big models, run: ./tests.sh big
if [ "$RUN_BIG_TESTS" = true ]; then
    add_test_vision "ggml-org/pixtral-12b-GGUF:Q4_K_M"
    add_test_vision "ggml-org/Mistral-Small-3.1-24B-Instruct-2503-GGUF" --chat-template mistral-v7
    add_test_vision "ggml-org/Qwen2-VL-2B-Instruct-GGUF:Q4_K_M"
    add_test_vision "ggml-org/Qwen2-VL-7B-Instruct-GGUF:Q4_K_M"
    add_test_vision "ggml-org/Qwen2.5-VL-3B-Instruct-GGUF:Q4_K_M"
    add_test_vision "ggml-org/Qwen2.5-VL-7B-Instruct-GGUF:Q4_K_M"
    add_test_vision "ggml-org/Qwen3-VL-2B-Instruct-GGUF:Q8_0"
    add_test_vision "ggml-org/InternVL3-8B-Instruct-GGUF:Q4_K_M"
    add_test_vision "ggml-org/InternVL3-14B-Instruct-GGUF:Q4_K_M"
    add_test_vision "ggml-org/Qwen2.5-Omni-7B-GGUF:Q4_K_M"
    # add_test_vision "ggml-org/Qwen2.5-VL-32B-Instruct-GGUF:Q4_K_M" # does not work on my mac M3 Ultra
    # add_test_vision "ggml-org/Kimi-VL-A3B-Thinking-2506-GGUF:Q4_K_M" # not always working

    add_test_audio "ggml-org/ultravox-v0_5-llama-3_1-8b-GGUF:Q4_K_M"
    add_test_audio "ggml-org/Qwen2.5-Omni-7B-GGUF:Q4_K_M"
fi

# to test the huge models, run: ./tests.sh huge
# this will run both the big and huge models
# huge models are > 32B parameters
if [ "$RUN_HUGE_TESTS" = true ]; then
    add_test_vision "ggml-org/Qwen2.5-VL-72B-Instruct-GGUF:Q4_K_M"
    add_test_vision "ggml-org/Llama-4-Scout-17B-16E-Instruct-GGUF:IQ1_S"
fi

# these models always give the wrong answer, not sure why
# add_test_vision "ggml-org/SmolVLM-Instruct-GGUF:Q4_K_M"
# add_test_vision "ggml-org/SmolVLM-256M-Instruct-GGUF:Q8_0"
# add_test_vision "ggml-org/SmolVLM2-256M-Video-Instruct-GGUF:Q8_0"

# this model has broken chat template, not usable
# add_test_vision "cmp-nct/Yi-VL-6B-GGUF:Q5_K"
# add_test_vision "guinmoon/MobileVLM-3B-GGUF:Q4_K_M" "deepseek"

###############

cmake --build build -j --target llama-mtmd-cli

arr_res=()

for i in "${!arr_hf[@]}"; do
    bin="llama-mtmd-cli"
    prefix="${arr_prefix[$i]}"
    hf="${arr_hf[$i]}"
    extra_args="${arr_extra_args[$i]}"
    inp_file="${arr_file[$i]}"

    echo "Running test with binary: $bin and HF model: $hf"
    echo ""
    echo ""

    cmd="$(printf %q "$PROJ_ROOT/build/bin/$bin") \
        -hf $(printf %q "$hf") \
        --image $(printf %q "$SCRIPT_DIR/$inp_file") \
        --temp 0 -n 128 \
        ${extra_args}"

    # add a default prompt unless the test supplied its own -p
    # (match -p as a standalone word; a bare substring match would false-positive
    # on any other argument that happens to contain "-p")
    if ! [[ "$extra_args" =~ (^|[[:space:]])-p([[:space:]]|$) ]]; then
        cmd+=" -p \"what is the publisher name of the newspaper?\""
    fi

    output=$(eval "$cmd" 2>&1 | tee /dev/tty)

    # quote the redirect target: SCRIPT_DIR may contain spaces
    echo "$output" > "$SCRIPT_DIR/output/$bin-$(echo "$hf" | tr '/' '-').log"

    # pass if the answer mentions "new york", or both "men" and "walk"
    if echo "$output" | grep -iq "new york" \
        || (echo "$output" | grep -iq "men" && echo "$output" | grep -iq "walk")
    then
        result="$prefix \033[32mOK\033[0m:   $hf"
    else
        result="$prefix \033[31mFAIL\033[0m: $hf"
    fi
    echo -e "$result"
    arr_res+=("$result")

    echo ""
    echo ""
    echo ""
    echo "#################################################"
    echo "#################################################"
    echo ""
    echo ""
done

set +x

# final summary
for i in "${!arr_res[@]}"; do
    echo -e "${arr_res[$i]}"
done
echo ""
echo "Output logs are saved in $SCRIPT_DIR/output"
