Diffstat (limited to 'llama.cpp/tools/mtmd/models')
-rw-r--r--  llama.cpp/tools/mtmd/models/cogvlm.cpp        98
-rw-r--r--  llama.cpp/tools/mtmd/models/conformer.cpp    216
-rw-r--r--  llama.cpp/tools/mtmd/models/glm4v.cpp        120
-rw-r--r--  llama.cpp/tools/mtmd/models/internvl.cpp      69
-rw-r--r--  llama.cpp/tools/mtmd/models/kimik25.cpp      101
-rw-r--r--  llama.cpp/tools/mtmd/models/kimivl.cpp        63
-rw-r--r--  llama.cpp/tools/mtmd/models/llama4.cpp        96
-rw-r--r--  llama.cpp/tools/mtmd/models/llava.cpp        374
-rw-r--r--  llama.cpp/tools/mtmd/models/minicpmv.cpp     114
-rw-r--r--  llama.cpp/tools/mtmd/models/mobilenetv5.cpp  451
-rw-r--r--  llama.cpp/tools/mtmd/models/models.h         118
-rw-r--r--  llama.cpp/tools/mtmd/models/pixtral.cpp       86
-rw-r--r--  llama.cpp/tools/mtmd/models/qwen2vl.cpp      183
-rw-r--r--  llama.cpp/tools/mtmd/models/qwen3vl.cpp      193
-rw-r--r--  llama.cpp/tools/mtmd/models/siglip.cpp        86
-rw-r--r--  llama.cpp/tools/mtmd/models/whisper-enc.cpp  115
-rw-r--r--  llama.cpp/tools/mtmd/models/youtuvl.cpp      179
17 files changed, 2662 insertions, 0 deletions
diff --git a/llama.cpp/tools/mtmd/models/cogvlm.cpp b/llama.cpp/tools/mtmd/models/cogvlm.cpp
new file mode 100644
index 0000000..d5b739c
--- /dev/null
+++ b/llama.cpp/tools/mtmd/models/cogvlm.cpp
@@ -0,0 +1,98 @@
+#include "models.h"
+
+ggml_cgraph * clip_graph_cogvlm::build() {
+ GGML_ASSERT(model.class_embedding != nullptr);
+ GGML_ASSERT(model.position_embeddings != nullptr);
+
+ const int n_pos = n_patches + 1; // +1 for [CLS]
+
+ // build input and concatenate class embedding
+ ggml_tensor * inp = build_inp();
+ inp = ggml_concat(ctx0, inp, model.class_embedding, 1);
+
+ inp = ggml_add(ctx0, inp, model.position_embeddings);
+ cb(inp, "inp_pos", -1);
+
+ ggml_tensor * inpL = inp;
+
+ for (int il = 0; il < n_layer; il++) {
+ auto & layer = model.layers[il];
+ ggml_tensor * cur = inpL;
+
+ cur = ggml_mul_mat(ctx0, layer.qkv_w, cur);
+
+ cur = ggml_add(ctx0, cur, layer.qkv_b);
+
+ ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, d_head, n_head, n_pos, d_head*sizeof(float),
+ cur->nb[1], 0);
+ ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, d_head, n_head, n_pos, d_head*sizeof(float),
+ cur->nb[1], n_embd * sizeof(float));
+ ggml_tensor * Vcur = ggml_view_3d(ctx0, cur, d_head, n_head, n_pos, d_head*sizeof(float),
+ cur->nb[1], 2 * n_embd * sizeof(float));
+
+ cb(Qcur, "Qcur", il);
+ cb(Kcur, "Kcur", il);
+ cb(Vcur, "Vcur", il);
+
+ cur = build_attn(layer.o_w, layer.o_b,
+ Qcur, Kcur, Vcur, nullptr, kq_scale, il);
+ cb(cur, "attn_out", il);
+
+ cur = build_norm(cur, layer.ln_1_w, layer.ln_1_b, NORM_TYPE_NORMAL, eps, il);
+ cb(cur, "attn_post_norm", il);
+
+ cur = ggml_add(ctx0, cur, inpL);
+ inpL = cur;
+
+ cur = build_ffn(cur,
+ layer.ff_up_w, layer.ff_up_b,
+ layer.ff_gate_w, layer.ff_gate_b,
+ layer.ff_down_w, layer.ff_down_b,
+ hparams.ffn_op, il);
+
+ cb(cur, "ffn_out", il);
+
+ cur = build_norm(cur, layer.ln_2_w, layer.ln_2_b, NORM_TYPE_NORMAL, eps, il);
+ cb(cur, "ffn_post_norm", il);
+
+ cur = ggml_add(ctx0, cur, inpL);
+ cb(cur, "layer_out", il);
+ inpL = cur;
+
+ }
+
+ // remove CLS token (like build_llama4 does)
+ ggml_tensor * cur = ggml_view_2d(ctx0, inpL,
+ n_embd, n_patches,
+ ggml_row_size(inpL->type, n_embd), 0);
+
+ // Multiply with mm_model_proj
+ cur = ggml_mul_mat(ctx0, model.mm_model_proj, cur);
+
+ // Apply layernorm, weight, bias
+ cur = build_norm(cur, model.mm_post_fc_norm_w, model.mm_post_fc_norm_b, NORM_TYPE_NORMAL, 1e-5, -1);
+
+ // Apply GELU
+ cur = ggml_gelu_inplace(ctx0, cur);
+
+ // Branch 1: multiply with mm_h_to_4h_w
+ ggml_tensor * h_to_4h = ggml_mul_mat(ctx0, model.mm_h_to_4h_w, cur);
+
+ // Branch 2: multiply with mm_gate_w
+ ggml_tensor * gate = ggml_mul_mat(ctx0, model.mm_gate_w, cur);
+
+ // gated activation: SiLU(gate) multiplied elementwise with h_to_4h (SwiGLU with separate gate/up tensors)
+ gate = ggml_swiglu_split(ctx0, gate, h_to_4h);
+
+ // Apply mm_4h_to_h_w
+ cur = ggml_mul_mat(ctx0, model.mm_4h_to_h_w, gate);
+
+ // Concatenate with boi and eoi
+ cur = ggml_concat(ctx0, model.mm_boi, cur, 1);
+ cur = ggml_concat(ctx0, cur, model.mm_eoi, 1);
+
+ // build the graph
+ ggml_build_forward_expand(gf, cur);
+
+ return gf;
+}
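Note: ggml_swiglu_split(ctx, a, b) computes SiLU(a) * b elementwise, which is the gated
activation the projector above relies on. A minimal standalone reference on plain arrays
(illustrative sketch, not part of the diff):

    #include <cmath>
    #include <cstddef>

    // swiglu_split(gate, up)[i] = silu(gate[i]) * up[i], where silu(g) = g * sigmoid(g)
    static void swiglu_split_ref(const float * gate, const float * up, float * out, size_t n) {
        for (size_t i = 0; i < n; ++i) {
            const float s = gate[i] / (1.0f + std::exp(-gate[i]));
            out[i] = s * up[i];
        }
    }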
diff --git a/llama.cpp/tools/mtmd/models/conformer.cpp b/llama.cpp/tools/mtmd/models/conformer.cpp
new file mode 100644
index 0000000..9b1fab4
--- /dev/null
+++ b/llama.cpp/tools/mtmd/models/conformer.cpp
@@ -0,0 +1,216 @@
+#include "models.h"
+
+ggml_cgraph * clip_graph_conformer::build() {
+ const int n_frames = img.nx;
+ const int n_pos = n_frames / 2;
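+ // pre_encode applies three stride-2 convs (layers 0, 2, 5), so the time axis shrinks
+ // ~8x with ceil rounding at each step; relative attention over T' frames needs
+ // 2*T' - 1 position embeddings, hence the formula below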
+ const int n_pos_embd = (((((n_frames + 1) / 2) + 1) / 2 + 1) / 2) * 2 - 1;
+ GGML_ASSERT(model.position_embeddings->ne[1] >= n_pos);
+
+ ggml_tensor * pos_emb = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, 512, n_pos_embd);
+ ggml_set_name(pos_emb, "pos_emb");
+ ggml_set_input(pos_emb);
+ ggml_build_forward_expand(gf, pos_emb);
+
+ ggml_tensor * inp = build_inp_raw(1);
+
+ auto * cur = ggml_cont(ctx0, ggml_transpose(ctx0, inp));
+
+ // pre encode, conv subsampling
+ {
+ // layer.0 - conv2d
+ cur = ggml_conv_2d(ctx0, model.pre_encode_conv_X_w[0], cur, 2, 2, 1, 1, 1, 1);
+ cur = ggml_add(ctx0, cur, model.pre_encode_conv_X_b[0]);
+ cb(cur, "conformer.pre_encode.conv.{}", 0);
+
+ // layer.1 - relu
+ cur = ggml_relu_inplace(ctx0, cur);
+
+ // layer.2 conv2d dw
+ cur = ggml_conv_2d_dw_direct(ctx0, model.pre_encode_conv_X_w[2], cur, 2, 2, 1, 1, 1, 1);
+ cur = ggml_add(ctx0, cur, model.pre_encode_conv_X_b[2]);
+ cb(cur, "conformer.pre_encode.conv.{}", 2);
+
+ // layer.3 conv2d
+ cur = ggml_conv_2d_direct(ctx0, model.pre_encode_conv_X_w[3], cur, 1, 1, 0, 0, 1, 1);
+ cur = ggml_add(ctx0, cur, model.pre_encode_conv_X_b[3]);
+ cb(cur, "conformer.pre_encode.conv.{}", 3);
+
+ // layer.4 - relu
+ cur = ggml_relu_inplace(ctx0, cur);
+
+ // layer.5 conv2d dw
+ cur = ggml_conv_2d_dw_direct(ctx0, model.pre_encode_conv_X_w[5], cur, 2, 2, 1, 1, 1, 1);
+ cur = ggml_add(ctx0, cur, model.pre_encode_conv_X_b[5]);
+ cb(cur, "conformer.pre_encode.conv.{}", 5);
+
+ // layer.6 conv2d
+ cur = ggml_conv_2d_direct(ctx0, model.pre_encode_conv_X_w[6], cur, 1, 1, 0, 0, 1, 1);
+ cur = ggml_add(ctx0, cur, model.pre_encode_conv_X_b[6]);
+ cb(cur, "conformer.pre_encode.conv.{}", 6);
+
+ // layer.7 - relu
+ cur = ggml_relu_inplace(ctx0, cur);
+
+ // flatten channel and frequency axis
+ cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 0, 2, 1, 3));
+ cur = ggml_reshape_2d(ctx0, cur, cur->ne[0] * cur->ne[1], cur->ne[2]);
+
+ // calculate out
+ cur = ggml_mul_mat(ctx0, model.pre_encode_out_w, cur);
+ cur = ggml_add(ctx0, cur, model.pre_encode_out_b);
+ cb(cur, "conformer.pre_encode.out", -1);
+ }
+
+ // pos_emb
+ cb(pos_emb, "pos_emb", -1);
+
+ for (int il = 0; il < hparams.n_layer; il++) {
+ const auto & layer = model.layers[il];
+
+ auto * residual = cur;
+
+ cb(cur, "layer.in", il);
+
+ // feed_forward1
+ cur = build_norm(cur, layer.ff_norm_w, layer.ff_norm_b, NORM_TYPE_NORMAL, 1e-5, il);
+ cb(cur, "conformer.layers.{}.norm_feed_forward1", il);
+
+ cur = build_ffn(cur, layer.ff_up_w, layer.ff_up_b, nullptr, nullptr, layer.ff_down_w, layer.ff_down_b, FFN_SILU,
+ il);
+ cb(cur, "conformer.layers.{}.feed_forward1.linear2", il);
+
+ const auto fc_factor = 0.5f;
+ residual = ggml_add(ctx0, residual, ggml_scale(ctx0, cur, fc_factor));
+
+ // self-attention
+ {
+ cur = build_norm(residual, layer.ln_1_w, layer.ln_1_b, NORM_TYPE_NORMAL, 1e-5, il);
+ cb(cur, "conformer.layers.{}.norm_self_att", il);
+
+ ggml_tensor * Qcur = ggml_mul_mat(ctx0, layer.q_w, cur);
+ Qcur = ggml_add(ctx0, Qcur, layer.q_b);
+ Qcur = ggml_reshape_3d(ctx0, Qcur, d_head, n_head, Qcur->ne[1]);
+ ggml_tensor * Q_bias_u = ggml_add(ctx0, Qcur, layer.pos_bias_u);
+ Q_bias_u = ggml_permute(ctx0, Q_bias_u, 0, 2, 1, 3);
+ ggml_tensor * Q_bias_v = ggml_add(ctx0, Qcur, layer.pos_bias_v);
+ Q_bias_v = ggml_permute(ctx0, Q_bias_v, 0, 2, 1, 3);
+
+ // TODO @ngxson : some cont can/should be removed when ggml_mul_mat support these cases
+ ggml_tensor * Kcur = ggml_mul_mat(ctx0, layer.k_w, cur);
+ Kcur = ggml_add(ctx0, Kcur, layer.k_b);
+ Kcur = ggml_reshape_3d(ctx0, Kcur, d_head, n_head, Kcur->ne[1]);
+ Kcur = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 0, 2, 1, 3));
+
+ ggml_tensor * Vcur = ggml_mul_mat(ctx0, layer.v_w, cur);
+ Vcur = ggml_add(ctx0, Vcur, layer.v_b);
+ Vcur = ggml_reshape_3d(ctx0, Vcur, d_head, n_head, Vcur->ne[1]);
+ Vcur = ggml_cont(ctx0, ggml_permute(ctx0, Vcur, 1, 2, 0, 3));
+
+ // build_attn won't fit due to matrix_ac and matrix_bd separation
+ ggml_tensor * matrix_ac = ggml_mul_mat(ctx0, Q_bias_u, Kcur);
+ matrix_ac = ggml_cont(ctx0, ggml_permute(ctx0, matrix_ac, 1, 0, 2, 3));
+ cb(matrix_ac, "conformer.layers.{}.self_attn.id3", il);
+
+ auto * p = ggml_mul_mat(ctx0, layer.linear_pos_w, pos_emb);
+ cb(p, "conformer.layers.{}.self_attn.linear_pos", il);
+ p = ggml_reshape_3d(ctx0, p, d_head, n_head, p->ne[1]);
+ p = ggml_permute(ctx0, p, 0, 2, 1, 3);
+
+ auto * matrix_bd = ggml_mul_mat(ctx0, Q_bias_v, p);
+ matrix_bd = ggml_cont(ctx0, ggml_permute(ctx0, matrix_bd, 1, 0, 2, 3));
+
+ // rel shift
+ {
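+ // Transformer-XL style relative shift: pad one zero column, roll it to the
+ // front, then re-view the buffer with a shorter row stride so that row q is
+ // shifted by (q_len - 1 - q), aligning scores with their relative offsets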
+ const auto pos_len = matrix_bd->ne[0];
+ const auto q_len = matrix_bd->ne[1];
+ const auto h = matrix_bd->ne[2];
+ matrix_bd = ggml_pad(ctx0, matrix_bd, 1, 0, 0, 0);
+ matrix_bd = ggml_roll(ctx0, matrix_bd, 1, 0, 0, 0);
+ matrix_bd = ggml_reshape_3d(ctx0, matrix_bd, q_len, pos_len + 1, h);
+ matrix_bd = ggml_view_3d(ctx0, matrix_bd, q_len, pos_len, h, matrix_bd->nb[1],
+ matrix_bd->nb[2], matrix_bd->nb[0] * q_len);
+ matrix_bd = ggml_cont_3d(ctx0, matrix_bd, pos_len, q_len, h);
+ }
+
+ matrix_bd = ggml_view_3d(ctx0, matrix_bd, matrix_ac->ne[0], matrix_bd->ne[1],
+ matrix_bd->ne[2], matrix_bd->nb[1], matrix_bd->nb[2], 0);
+ auto * scores = ggml_add(ctx0, matrix_ac, matrix_bd);
+ scores = ggml_scale(ctx0, scores, 1.0f / std::sqrt(d_head));
+ cb(scores, "conformer.layers.{}.self_attn.id0", il);
+
+ ggml_tensor * attn = ggml_soft_max(ctx0, scores);
+ ggml_tensor * x = ggml_mul_mat(ctx0, attn, Vcur);
+ x = ggml_permute(ctx0, x, 2, 0, 1, 3);
+ x = ggml_cont_2d(ctx0, x, x->ne[0] * x->ne[1], x->ne[2]);
+
+ ggml_tensor * out = ggml_mul_mat(ctx0, layer.o_w, x);
+ out = ggml_add(ctx0, out, layer.o_b);
+ cb(out, "conformer.layers.{}.self_attn.linear_out", il);
+
+ cur = out;
+ }
+
+ residual = ggml_add(ctx0, residual, cur);
+ cur = build_norm(residual, layer.norm_conv_w, layer.norm_conv_b, NORM_TYPE_NORMAL, 1e-5, il);
+ cb(cur, "conformer.layers.{}.norm_conv", il);
+
+ // conv
+ {
+ auto * x = cur;
+ x = ggml_mul_mat(ctx0, layer.conv_pw1_w, x);
+ x = ggml_add(ctx0, x, layer.conv_pw1_b);
+ cb(x, "conformer.layers.{}.conv.pointwise_conv1", il);
+
+ // ggml_glu doesn't support sigmoid
+ // TODO @ngxson : support this ops in ggml
+ {
+ int64_t d = x->ne[0] / 2;
+ ggml_tensor * gate = ggml_sigmoid(ctx0, ggml_view_2d(ctx0, x, d, x->ne[1], x->nb[1], d * x->nb[0]));
+ x = ggml_mul(ctx0, ggml_view_2d(ctx0, x, d, x->ne[1], x->nb[1], 0), gate);
+ x = ggml_cont(ctx0, ggml_transpose(ctx0, x));
+ }
+
+ // use ggml_ssm_conv for f32 precision
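+ // pad 4 right, roll the padding to the front, pad 4 right again: 4 zeros on
+ // each side of the time axis, i.e. symmetric padding for the depthwise kernel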
+ x = ggml_pad(ctx0, x, 4, 0, 0, 0);
+ x = ggml_roll(ctx0, x, 4, 0, 0, 0);
+ x = ggml_pad(ctx0, x, 4, 0, 0, 0);
+ x = ggml_ssm_conv(ctx0, x, layer.conv_dw_w);
+ x = ggml_add(ctx0, x, layer.conv_dw_b);
+
+ x = ggml_add(ctx0, ggml_mul(ctx0, x, layer.conv_norm_w), layer.conv_norm_b);
+ x = ggml_silu(ctx0, x);
+
+ // pointwise_conv2
+ x = ggml_mul_mat(ctx0, layer.conv_pw2_w, x);
+ x = ggml_add(ctx0, x, layer.conv_pw2_b);
+
+ cur = x;
+ }
+
+ residual = ggml_add(ctx0, residual, cur);
+
+ cur = build_norm(residual, layer.ff_norm_1_w, layer.ff_norm_1_b, NORM_TYPE_NORMAL, 1e-5, il);
+ cb(cur, "conformer.layers.{}.norm_feed_forward2", il);
+
+ cur = build_ffn(cur, layer.ff_up_1_w, layer.ff_up_1_b, nullptr, nullptr, layer.ff_down_1_w, layer.ff_down_1_b,
+ FFN_SILU, il); // TODO(tarek): read activation for ffn from hparams
+ cb(cur, "conformer.layers.{}.feed_forward2.linear2", il);
+
+ residual = ggml_add(ctx0, residual, ggml_scale(ctx0, cur, fc_factor));
+ cb(residual, "conformer.layers.{}.conv.id", il);
+
+ cur = build_norm(residual, layer.ln_2_w, layer.ln_2_b, NORM_TYPE_NORMAL, 1e-5, il);
+ cb(cur, "conformer.layers.{}.norm_out", il);
+ }
+
+ // audio adapter
+ cur = build_norm(cur, model.mm_0_w, model.mm_0_b, NORM_TYPE_NORMAL, 1e-5, -1);
+ cb(cur, "audio_adapter.model.{}", 0);
+ cur = build_ffn(cur, model.mm_1_w, model.mm_1_b, nullptr, nullptr, model.mm_3_w, model.mm_3_b, FFN_GELU_ERF, -1);
+
+ cb(cur, "projected", -1);
+
+ ggml_build_forward_expand(gf, cur);
+
+ return gf;
+}
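Note: the "rel shift" block above is the Transformer-XL relative-shift trick. A standalone
sketch of the same pad/roll/re-view sequence on a plain row-major buffer (position axis
fastest, matching ggml's dim-0; illustrative, not part of the diff):

    #include <cassert>
    #include <vector>

    // pad one zero column, roll it to the front, then re-read the buffer with a row
    // length of T instead of P+1: row q comes out shifted left by (T - 1 - q)
    static std::vector<float> rel_shift_ref(const std::vector<float> & in, int P, int T) {
        std::vector<float> buf((P + 1) * T, 0.0f);
        for (int q = 0; q < T; ++q)
            for (int p = 0; p < P; ++p)
                buf[q * (P + 1) + (p + 1)] = in[q * P + p];
        return std::vector<float>(buf.begin() + T, buf.begin() + T + P * T);
    }

    int main() {
        const int T = 3, P = 2 * T - 1; // T query positions, P relative offsets
        std::vector<float> in(P * T);
        for (size_t i = 0; i < in.size(); ++i) in[i] = (float) i;
        std::vector<float> out = rel_shift_ref(in, P, T);
        // the graph keeps only the first matrix_ac->ne[0] columns, which stay in range
        for (int q = 0; q < T; ++q)
            for (int j = 0; j < T; ++j)
                assert(out[q * P + j] == in[q * P + j + (T - 1 - q)]);
        return 0;
    }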
diff --git a/llama.cpp/tools/mtmd/models/glm4v.cpp b/llama.cpp/tools/mtmd/models/glm4v.cpp
new file mode 100644
index 0000000..f39b692
--- /dev/null
+++ b/llama.cpp/tools/mtmd/models/glm4v.cpp
@@ -0,0 +1,120 @@
+#include "models.h"
+
+ggml_cgraph * clip_graph_glm4v::build() {
+ GGML_ASSERT(model.patch_bias != nullptr);
+ GGML_ASSERT(model.position_embeddings != nullptr);
+ GGML_ASSERT(model.class_embedding == nullptr);
+
+ const int batch_size = 1;
+
+ norm_type norm_t = NORM_TYPE_RMS;
+
+ ggml_tensor * inp_raw = build_inp_raw();
+ ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
+
+ int mrope_sections[4] = {d_head/4, d_head/4, d_head/4, d_head/4};
+ ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_patches * 4);
+ ggml_set_name(positions, "positions");
+ ggml_set_input(positions);
+
+ GGML_ASSERT(img.nx % (patch_size * 2) == 0);
+ GGML_ASSERT(img.ny % (patch_size * 2) == 0);
+
+ // second conv dimension
+ {
+ auto inp_1 = ggml_conv_2d(ctx0, model.patch_embeddings_1, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
+ inp = ggml_add(ctx0, inp, inp_1);
+
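+ // reorder patches so that each 2x2 spatial window becomes contiguous; the same
+ // reordering is applied to the learned position embeddings below to keep them aligned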
+ inp = ggml_permute(ctx0, inp, 1, 2, 0, 3); // [w, h, c, b] -> [c, w, h, b]
+ inp = ggml_cont_4d(
+ ctx0, inp,
+ n_embd * 2, n_patches_x / 2, n_patches_y, batch_size);
+ inp = ggml_reshape_4d(
+ ctx0, inp,
+ n_embd * 2, n_patches_x / 2, 2, batch_size * (n_patches_y / 2));
+ inp = ggml_permute(ctx0, inp, 0, 2, 1, 3);
+ inp = ggml_cont_3d(
+ ctx0, inp,
+ n_embd, n_patches_x * n_patches_y, batch_size);
+ }
+
+ // add patch bias
+ inp = ggml_add(ctx0, inp, model.patch_bias);
+ cb(inp, "patch_bias", -1);
+
+ // pos-conv norm
+ inp = build_norm(inp, model.norm_embd_w, model.norm_embd_b, norm_t, eps, -1);
+
+ // calculate absolute position embedding and apply
+ ggml_tensor * learned_pos_embd = resize_position_embeddings(GGML_SCALE_MODE_BICUBIC);
+ learned_pos_embd = ggml_cont_4d(
+ ctx0, learned_pos_embd,
+ n_embd * 2, n_patches_x / 2, n_patches_y, batch_size);
+ learned_pos_embd = ggml_reshape_4d(
+ ctx0, learned_pos_embd,
+ n_embd * 2, n_patches_x / 2, 2, batch_size * (n_patches_y / 2));
+ learned_pos_embd = ggml_permute(ctx0, learned_pos_embd, 0, 2, 1, 3);
+ learned_pos_embd = ggml_cont_3d(
+ ctx0, learned_pos_embd,
+ n_embd, n_patches_x * n_patches_y, batch_size);
+ cb(learned_pos_embd, "learned_pos_embd", -1);
+
+ auto add_pos = [&](ggml_tensor * cur, const clip_layer &) {
+ return ggml_rope_multi(
+ ctx0, cur, positions, nullptr,
+ d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION,
+ 32768, hparams.rope_theta, 1, 0, 1, 32, 1);
+ };
+
+ ggml_tensor * cur = build_vit(
+ inp, n_patches,
+ norm_t,
+ hparams.ffn_op,
+ learned_pos_embd,
+ add_pos);
+
+ cb(cur, "vit_out", -1);
+ // cb(ggml_sum(ctx0, cur), "vit_out_sum", -1);
+
+ // GLM4V projector
+ // ref: https://github.com/huggingface/transformers/blob/40dc11cd3eb4126652aa41ef8272525affd4a636/src/transformers/models/glm4v/modeling_glm4v.py#L116-L130
+
+ // patch merger (downsample)
+ {
+ int n_merge = hparams.n_merge;
+ GGML_ASSERT(n_merge > 0);
+
+ int n_token_out = n_patches / n_merge / n_merge;
+ cur = ggml_reshape_4d(ctx0, cur, n_embd, n_merge, n_merge, n_token_out);
+ cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 2, 0, 1, 3)); // [n_merge, n_merge, n_embd, n_token_out]
+ cur = ggml_conv_2d(ctx0, model.mm_patch_merger_w, cur, n_merge, n_merge, 0, 0, 1, 1);
+ cur = ggml_reshape_2d(ctx0, cur, cur->ne[2], n_token_out); // [n_embd_out, n_token_out]
+
+ cur = ggml_add(ctx0, cur, model.mm_patch_merger_b);
+ }
+
+ // FC projector
+ {
+ cur = ggml_mul_mat(ctx0, model.projection, cur);
+ // default LayerNorm (post_projection_norm)
+ cur = build_norm(cur, model.mm_post_norm_w, model.mm_post_norm_b, NORM_TYPE_NORMAL, 1e-5, -1);
+ cur = ggml_gelu_erf(ctx0, cur);
+ cb(cur, "after_fc_proj", -1);
+ }
+
+ // FFN projector
+ {
+ cur = build_ffn(cur,
+ model.mm_ffn_up_w, model.mm_ffn_up_b,
+ model.mm_ffn_gate_w, model.mm_ffn_gate_b,
+ model.mm_ffn_down_w, model.mm_ffn_down_b,
+ hparams.ffn_op, -1);
+ cb(cur, "after_ffn_proj", -1);
+ // cb(ggml_sum(ctx0, cur), "merged_sum", -1);
+ }
+
+ // build the graph
+ ggml_build_forward_expand(gf, cur);
+
+ return gf;
+}
diff --git a/llama.cpp/tools/mtmd/models/internvl.cpp b/llama.cpp/tools/mtmd/models/internvl.cpp
new file mode 100644
index 0000000..9aded3b
--- /dev/null
+++ b/llama.cpp/tools/mtmd/models/internvl.cpp
@@ -0,0 +1,69 @@
+#include "models.h"
+
+ggml_cgraph * clip_graph_internvl::build() {
+ GGML_ASSERT(model.class_embedding != nullptr);
+ GGML_ASSERT(model.position_embeddings != nullptr);
+
+ const int n_pos = n_patches + 1;
+ ggml_tensor * inp = build_inp();
+
+ // add CLS token
+ inp = ggml_concat(ctx0, inp, model.class_embedding, 1);
+
+ // The larger models use a different ViT, which uses RMS norm instead of layer norm
+ // ref: https://github.com/ggml-org/llama.cpp/pull/13443#issuecomment-2869786188
+ norm_type norm_t = (hparams.n_embd == 3200 && hparams.n_layer == 45)
+ ? NORM_TYPE_RMS // 6B ViT (Used by InternVL 2.5/3 - 26B, 38B, 78B)
+ : NORM_TYPE_NORMAL; // 300M ViT (Used by all smaller InternVL models)
+
+ ggml_tensor * cur = build_vit(
+ inp, n_pos,
+ norm_t,
+ hparams.ffn_op,
+ model.position_embeddings,
+ nullptr);
+
+ // remove CLS token
+ cur = ggml_view_2d(ctx0, cur,
+ n_embd, n_patches,
+ ggml_row_size(cur->type, n_embd), 0);
+
+ // pixel shuffle
+ {
+ const int scale_factor = model.hparams.n_merge;
+ const int bsz = 1; // batch size, always 1 for now since we don't support batching
+ const int height = n_patches_y;
+ const int width = n_patches_x;
+ GGML_ASSERT(scale_factor > 0);
+ cur = ggml_reshape_4d(ctx0, cur, n_embd * scale_factor, height / scale_factor, width, bsz);
+ cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);
+ cur = ggml_cont_4d(ctx0, cur,
+ n_embd * scale_factor * scale_factor,
+ height / scale_factor,
+ width / scale_factor,
+ bsz);
+ cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);
+ // flatten to 2D
+ cur = ggml_cont_2d(ctx0, cur,
+ n_embd * scale_factor * scale_factor,
+ cur->ne[1] * cur->ne[2]);
+ }
+
+ // projector (always using GELU activation)
+ {
+ // projector LayerNorm uses pytorch's default eps = 1e-5
+ // ref: https://huggingface.co/OpenGVLab/InternVL3-8B-Instruct/blob/a34d3e4e129a5856abfd6aa6de79776484caa14e/modeling_internvl_chat.py#L79
+ cur = build_norm(cur, model.mm_0_w, model.mm_0_b, NORM_TYPE_NORMAL, 1e-5, -1);
+ cur = build_ffn(cur,
+ model.mm_1_w, model.mm_1_b,
+ nullptr, nullptr,
+ model.mm_3_w, model.mm_3_b,
+ FFN_GELU,
+ -1);
+ }
+
+ // build the graph
+ ggml_build_forward_expand(gf, cur);
+
+ return gf;
+}
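Note: the pixel-shuffle block above is a space-to-depth rearrangement: every
scale_factor x scale_factor window of patch embeddings is folded into the channel
dimension, giving s^2 fewer tokens with s^2 * n_embd channels each. A rough reference
(the exact within-window ordering is fixed by the permutes above; this sketch is
illustrative, not part of the diff):

    #include <vector>

    // in:  H x W patches, C channels each (row-major, channels fastest)
    // out: (H/s) x (W/s) tokens, C*s*s channels each
    static std::vector<float> pixel_shuffle_ref(const std::vector<float> & in,
                                                int C, int W, int H, int s) {
        std::vector<float> out(in.size());
        const int Wo = W / s, Co = C * s * s;
        for (int y = 0; y < H / s; ++y)
            for (int x = 0; x < Wo; ++x)
                for (int dy = 0; dy < s; ++dy)
                    for (int dx = 0; dx < s; ++dx)
                        for (int c = 0; c < C; ++c)
                            out[(y * Wo + x) * Co + (dy * s + dx) * C + c] =
                                in[((y * s + dy) * W + (x * s + dx)) * C + c];
        return out;
    }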
diff --git a/llama.cpp/tools/mtmd/models/kimik25.cpp b/llama.cpp/tools/mtmd/models/kimik25.cpp
new file mode 100644
index 0000000..cf9f27f
--- /dev/null
+++ b/llama.cpp/tools/mtmd/models/kimik25.cpp
@@ -0,0 +1,101 @@
+#include "models.h"
+#include <cstring>
+#include <cmath>
+
+// note: this is similar to clip_graph::resize_position_embeddings; the major difference
+// is that w/h are stored in ne[1] and ne[2] instead of being recovered via sqrt. Could try
+// storing the tensor in 2D with w*h instead? The permute is also (2, 1, 0, 3) instead of (2, 0, 1, 3).
+ggml_tensor * clip_graph_kimik25::resize_position_embeddings_3d(uint32_t interpolation_mode) {
+ ggml_tensor * pos_embd = model.position_embeddings;
+ const int height = img.ny / patch_size;
+ const int width = img.nx / patch_size;
+ const uint32_t mode = interpolation_mode;
+
+ GGML_ASSERT(pos_embd);
+
+ const int64_t stored_c = pos_embd->ne[0]; // C = 1152
+ const int64_t orig_w = pos_embd->ne[1]; // W = 64
+ const int64_t orig_h = pos_embd->ne[2]; // H = 64
+
+ GGML_ASSERT(stored_c == n_embd);
+
+ if (height == (int)orig_h && width == (int)orig_w) {
+ // No interpolation needed, just flatten to [C, H*W]
+ return ggml_cont_2d(ctx0, pos_embd, n_embd, width * height);
+ }
+
+ pos_embd = ggml_permute(ctx0, pos_embd, 2, 1, 0, 3);
+ pos_embd = ggml_interpolate(ctx0, pos_embd, height, width, n_embd, 1, mode);
+ pos_embd = ggml_permute(ctx0, pos_embd, 2, 1, 0, 3);
+ pos_embd = ggml_cont_2d(ctx0, pos_embd, n_embd, width * height);
+ return pos_embd;
+}
+
+ggml_cgraph * clip_graph_kimik25::build() {
+ ggml_tensor * pos_h = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_patches);
+ ggml_set_name(pos_h, "pos_h");
+ ggml_set_input(pos_h);
+
+ ggml_tensor * pos_w = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_patches);
+ ggml_set_name(pos_w, "pos_w");
+ ggml_set_input(pos_w);
+
+ ggml_tensor * learned_pos_embd = resize_position_embeddings_3d(GGML_SCALE_MODE_BICUBIC);
+
+ // Kimi-K2.5 uses interleaved 2D RoPE pattern natively, but
+ // Q / K are permuted during conversion to use split format.
+ auto add_pos = [&](ggml_tensor * cur, const clip_layer &) {
+ cur = build_rope_2d(ctx0, cur, pos_w, pos_h, hparams.rope_theta, false);
+ return cur;
+ };
+
+ ggml_tensor * inp = build_inp();
+
+ // For unknown reasons, doing this inside build_vit led to the ggml_add not
+ // occurring; doing it manually here does work.
+ inp = ggml_add(ctx0, inp, learned_pos_embd);
+
+ ggml_tensor * cur = build_vit(
+ inp, n_patches,
+ NORM_TYPE_NORMAL,
+ hparams.ffn_op,
+ nullptr,
+ add_pos);
+
+ cb(cur, "vit_out", -1);
+
+ {
+ // patch_merger
+ const int scale_factor = model.hparams.n_merge;
+ cur = build_patch_merge_permute(cur, scale_factor);
+
+ // projection norm
+ int proj_inp_dim = cur->ne[0];
+ int n_merged_patches = cur->ne[1];
+ cur = ggml_view_2d(ctx0, cur,
+ n_embd, n_merged_patches * scale_factor * scale_factor,
+ ggml_row_size(cur->type, n_embd), 0);
+ cur = ggml_norm(ctx0, cur, hparams.eps);
+ cur = ggml_mul(ctx0, cur, model.mm_input_norm_w);
+ cur = ggml_add(ctx0, cur, model.mm_input_norm_b);
+ cur = ggml_view_2d(ctx0, cur,
+ proj_inp_dim, n_merged_patches,
+ ggml_row_size(cur->type, proj_inp_dim), 0);
+ cb(cur, "proj_inp_normed", -1);
+
+ // projection mlp
+ cur = build_ffn(cur,
+ model.mm_1_w, model.mm_1_b,
+ nullptr, nullptr,
+ model.mm_2_w, model.mm_2_b,
+ FFN_GELU,
+ -1);
+
+ cb(cur, "proj_out", -1);
+ }
+
+ // build the graph
+ ggml_build_forward_expand(gf, cur);
+
+ return gf;
+}
diff --git a/llama.cpp/tools/mtmd/models/kimivl.cpp b/llama.cpp/tools/mtmd/models/kimivl.cpp
new file mode 100644
index 0000000..0a06f50
--- /dev/null
+++ b/llama.cpp/tools/mtmd/models/kimivl.cpp
@@ -0,0 +1,63 @@
+#include "models.h"
+
+ggml_cgraph * clip_graph_kimivl::build() {
+ // 2D input positions
+ ggml_tensor * pos_h = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_patches);
+ ggml_set_name(pos_h, "pos_h");
+ ggml_set_input(pos_h);
+
+ ggml_tensor * pos_w = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_patches);
+ ggml_set_name(pos_w, "pos_w");
+ ggml_set_input(pos_w);
+
+ ggml_tensor * learned_pos_embd = resize_position_embeddings();
+
+ // build ViT with 2D position embeddings
+ auto add_pos = [&](ggml_tensor * cur, const clip_layer &) {
+ // first half is X axis and second half is Y axis
+ return build_rope_2d(ctx0, cur, pos_w, pos_h, hparams.rope_theta, false);
+ };
+
+ ggml_tensor * inp = build_inp();
+ ggml_tensor * cur = build_vit(
+ inp, n_patches,
+ NORM_TYPE_NORMAL,
+ hparams.ffn_op,
+ learned_pos_embd,
+ add_pos);
+
+ cb(cur, "vit_out", -1);
+
+ {
+ // patch_merger
+ const int scale_factor = model.hparams.n_merge;
+ cur = build_patch_merge_permute(cur, scale_factor);
+
+ // projection norm
+ int proj_inp_dim = cur->ne[0];
+ cur = ggml_view_2d(ctx0, cur,
+ n_embd, cur->ne[1] * scale_factor * scale_factor,
+ ggml_row_size(cur->type, n_embd), 0);
+ cur = ggml_norm(ctx0, cur, 1e-5); // default nn.LayerNorm
+ cur = ggml_mul(ctx0, cur, model.mm_input_norm_w);
+ cur = ggml_add(ctx0, cur, model.mm_input_norm_b);
+ cur = ggml_view_2d(ctx0, cur,
+ proj_inp_dim, cur->ne[1] / scale_factor / scale_factor,
+ ggml_row_size(cur->type, proj_inp_dim), 0);
+ cb(cur, "proj_inp_normed", -1);
+
+ // projection mlp
+ cur = build_ffn(cur,
+ model.mm_1_w, model.mm_1_b,
+ nullptr, nullptr,
+ model.mm_2_w, model.mm_2_b,
+ FFN_GELU,
+ -1);
+ cb(cur, "proj_out", -1);
+ }
+
+ // build the graph
+ ggml_build_forward_expand(gf, cur);
+
+ return gf;
+}
diff --git a/llama.cpp/tools/mtmd/models/llama4.cpp b/llama.cpp/tools/mtmd/models/llama4.cpp
new file mode 100644
index 0000000..30d1df5
--- /dev/null
+++ b/llama.cpp/tools/mtmd/models/llama4.cpp
@@ -0,0 +1,96 @@
+#include "models.h"
+
+ggml_cgraph * clip_graph_llama4::build() {
+ GGML_ASSERT(model.class_embedding != nullptr);
+ GGML_ASSERT(model.position_embeddings != nullptr);
+
+ const int n_pos = n_patches + 1; // +1 for [CLS]
+
+ // 2D input positions
+ ggml_tensor * pos_h = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos);
+ ggml_set_name(pos_h, "pos_h");
+ ggml_set_input(pos_h);
+
+ ggml_tensor * pos_w = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos);
+ ggml_set_name(pos_w, "pos_w");
+ ggml_set_input(pos_w);
+
+ ggml_tensor * inp = build_inp_raw();
+
+ // Llama4UnfoldConvolution
+ {
+ ggml_tensor * kernel = ggml_reshape_4d(ctx0, model.patch_embeddings_0,
+ patch_size, patch_size, 3, n_embd);
+ inp = ggml_im2col(ctx0, kernel, inp, patch_size, patch_size, 0, 0, 1, 1, true, inp->type);
+ inp = ggml_mul_mat(ctx0, model.patch_embeddings_0, inp);
+ inp = ggml_reshape_2d(ctx0, inp, n_embd, n_patches);
+ cb(inp, "patch_conv", -1);
+ }
+
+ // add CLS token
+ inp = ggml_concat(ctx0, inp, model.class_embedding, 1);
+
+ // build ViT with 2D position embeddings
+ auto add_pos = [&](ggml_tensor * cur, const clip_layer &) {
+ // first half is X axis and second half is Y axis
+ // ref: https://github.com/huggingface/transformers/blob/40a493c7ed4f19f08eadb0639cf26d49bfa5e180/src/transformers/models/llama4/modeling_llama4.py#L1312
+ // ref: https://github.com/Blaizzy/mlx-vlm/blob/a57156aa87b33cca6e5ee6cfc14dd4ef8f611be6/mlx_vlm/models/llama4/vision.py#L441
+ return build_rope_2d(ctx0, cur, pos_w, pos_h, hparams.rope_theta, false);
+ };
+ ggml_tensor * cur = build_vit(
+ inp, n_pos,
+ NORM_TYPE_NORMAL,
+ hparams.ffn_op,
+ model.position_embeddings,
+ add_pos);
+
+ // remove CLS token
+ cur = ggml_view_2d(ctx0, cur,
+ n_embd, n_patches,
+ ggml_row_size(cur->type, n_embd), 0);
+
+ // pixel shuffle
+ // based on Llama4VisionPixelShuffleMLP
+ // https://github.com/huggingface/transformers/blob/2932f318a20d9e54cc7aea052e040164d85de7d6/src/transformers/models/llama4/modeling_llama4.py#L1151
+ {
+ const int scale_factor = model.hparams.n_merge;
+ const int bsz = 1; // batch size, always 1 for now since we don't support batching
+ GGML_ASSERT(scale_factor > 0);
+ GGML_ASSERT(n_patches_x == n_patches_y); // llama4 only supports square images
+ cur = ggml_reshape_4d(ctx0, cur,
+ n_embd * scale_factor,
+ n_patches_x / scale_factor,
+ n_patches_y,
+ bsz);
+ cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);
+ cur = ggml_cont_4d(ctx0, cur,
+ n_embd * scale_factor * scale_factor,
+ n_patches_x / scale_factor,
+ n_patches_y / scale_factor,
+ bsz);
+ //cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);
+ // flatten to 2D
+ cur = ggml_cont_2d(ctx0, cur,
+ n_embd * scale_factor * scale_factor,
+ n_patches / scale_factor / scale_factor);
+ cb(cur, "pixel_shuffle", -1);
+ }
+
+ // based on Llama4VisionMLP2 (always uses GELU activation, no bias)
+ {
+ cur = ggml_mul_mat(ctx0, model.mm_model_mlp_1_w, cur);
+ cur = ggml_gelu(ctx0, cur);
+ cur = ggml_mul_mat(ctx0, model.mm_model_mlp_2_w, cur);
+ cur = ggml_gelu(ctx0, cur);
+ cb(cur, "adapter_mlp", -1);
+ }
+
+ // Llama4MultiModalProjector
+ cur = ggml_mul_mat(ctx0, model.mm_model_proj, cur);
+ cb(cur, "projected", -1);
+
+ // build the graph
+ ggml_build_forward_expand(gf, cur);
+
+ return gf;
+}
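Note: with stride == kernel == patch_size and no padding, the Llama4UnfoldConvolution
above (im2col + matmul) reduces to a plain matrix product over flattened, non-overlapping
patches. A reference under that assumption (array layouts here are illustrative, not part
of the diff):

    #include <vector>

    // img: [3][H][W] channel-major, Wm: [n_embd][3*ps*ps] row-major
    // out[p][e] = dot(Wm[e], pixels of patch p)
    static std::vector<float> patch_embed_ref(const std::vector<float> & img,
                                              const std::vector<float> & Wm,
                                              int H, int W, int ps, int n_embd) {
        const int npx = W / ps, npy = H / ps;
        std::vector<float> out(npx * npy * n_embd, 0.0f);
        for (int py = 0; py < npy; ++py)
        for (int px = 0; px < npx; ++px)
        for (int e = 0; e < n_embd; ++e) {
            float acc = 0.0f;
            for (int c = 0; c < 3; ++c)
            for (int dy = 0; dy < ps; ++dy)
            for (int dx = 0; dx < ps; ++dx)
                acc += Wm[(e * 3 + c) * ps * ps + dy * ps + dx]
                     * img[(c * H + py * ps + dy) * W + px * ps + dx];
            out[(py * npx + px) * n_embd + e] = acc;
        }
        return out;
    }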
diff --git a/llama.cpp/tools/mtmd/models/llava.cpp b/llama.cpp/tools/mtmd/models/llava.cpp
new file mode 100644
index 0000000..0bfb5f0
--- /dev/null
+++ b/llama.cpp/tools/mtmd/models/llava.cpp
@@ -0,0 +1,374 @@
+#include "models.h"
+
+// this graph is used by llava, granite and glm
+// due to having embedding_stack (used by granite), we cannot reuse build_vit
+ggml_cgraph * clip_graph_llava::build() {
+ const int batch_size = 1;
+ const int n_pos = n_patches + (model.class_embedding ? 1 : 0);
+
+ GGML_ASSERT(n_patches_x == n_patches_y && "only square images supported");
+
+ // Calculate the deepest feature layer based on hparams and projector type
+ int max_feature_layer = n_layer;
+ {
+ // Get the index of the second to last layer; this is the default for models that have a llava projector
+ int il_last = hparams.n_layer - 1;
+ int deepest_feature_layer = -1;
+
+ if (proj_type == PROJECTOR_TYPE_MINICPMV || proj_type == PROJECTOR_TYPE_GLM_EDGE) {
+ il_last += 1;
+ }
+
+ // If we set explicit vision feature layers, only go up to the deepest one
+ // NOTE: only used by granite-vision models for now
+ for (const auto & feature_layer : hparams.vision_feature_layer) {
+ if (feature_layer > deepest_feature_layer) {
+ deepest_feature_layer = feature_layer;
+ }
+ }
+ max_feature_layer = deepest_feature_layer < 0 ? il_last : deepest_feature_layer;
+ }
+
+ ggml_tensor * inp = build_inp();
+
+ // concat class_embeddings and patch_embeddings
+ if (model.class_embedding) {
+ inp = ggml_concat(ctx0, inp, model.class_embedding, 1);
+ }
+
+ ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos);
+ ggml_set_name(positions, "positions");
+ ggml_set_input(positions);
+
+ inp = ggml_add(ctx0, inp, ggml_get_rows(ctx0, model.position_embeddings, positions));
+
+ ggml_tensor * inpL = inp;
+
+ // pre-layernorm
+ if (model.pre_ln_w) {
+ inpL = build_norm(inpL, model.pre_ln_w, model.pre_ln_b, NORM_TYPE_NORMAL, eps, -1);
+ cb(inpL, "pre_ln", -1);
+ }
+
+ std::vector<ggml_tensor *> embedding_stack;
+ const auto & vision_feature_layer = hparams.vision_feature_layer;
+
+ // loop over layers
+ for (int il = 0; il < max_feature_layer; il++) {
+ auto & layer = model.layers[il];
+ ggml_tensor * cur = inpL; // inpL = residual, cur = hidden_states
+
+ // If this is an embedding feature layer, save the output.
+ // NOTE: 0 index here refers to the input to the encoder.
+ if (vision_feature_layer.find(il) != vision_feature_layer.end()) {
+ embedding_stack.push_back(cur);
+ }
+
+ // layernorm1
+ cur = build_norm(cur, layer.ln_1_w, layer.ln_1_b, NORM_TYPE_NORMAL, eps, il);
+ cb(cur, "layer_inp_normed", il);
+
+ // self-attention
+ {
+ ggml_tensor * Qcur = ggml_mul_mat(ctx0, layer.q_w, cur);
+ if (layer.q_b) {
+ Qcur = ggml_add(ctx0, Qcur, layer.q_b);
+ }
+
+ ggml_tensor * Kcur = ggml_mul_mat(ctx0, layer.k_w, cur);
+ if (layer.k_b) {
+ Kcur = ggml_add(ctx0, Kcur, layer.k_b);
+ }
+
+ ggml_tensor * Vcur = ggml_mul_mat(ctx0, layer.v_w, cur);
+ if (layer.v_b) {
+ Vcur = ggml_add(ctx0, Vcur, layer.v_b);
+ }
+
+ Qcur = ggml_reshape_3d(ctx0, Qcur, d_head, n_head, n_pos);
+ Kcur = ggml_reshape_3d(ctx0, Kcur, d_head, n_head, n_pos);
+ Vcur = ggml_reshape_3d(ctx0, Vcur, d_head, n_head, n_pos);
+
+ cb(Qcur, "Qcur", il);
+ cb(Kcur, "Kcur", il);
+ cb(Vcur, "Vcur", il);
+
+ cur = build_attn(layer.o_w, layer.o_b,
+ Qcur, Kcur, Vcur, nullptr, kq_scale, il);
+ cb(cur, "attn_out", il);
+ }
+
+ // re-add the layer input, e.g., residual
+ cur = ggml_add(ctx0, cur, inpL);
+
+ inpL = cur; // inpL = residual, cur = hidden_states
+
+ cb(cur, "ffn_inp", il);
+
+ // layernorm2
+ cur = build_norm(cur, layer.ln_2_w, layer.ln_2_b, NORM_TYPE_NORMAL, eps, il);
+ cb(cur, "ffn_inp_normed", il);
+
+ // ffn
+ cur = build_ffn(cur,
+ layer.ff_up_w, layer.ff_up_b,
+ layer.ff_gate_w, layer.ff_gate_b,
+ layer.ff_down_w, layer.ff_down_b,
+ hparams.ffn_op, il);
+
+ cb(cur, "ffn_out", il);
+
+ // residual 2
+ cur = ggml_add(ctx0, inpL, cur);
+ cb(cur, "layer_out", il);
+
+ inpL = cur;
+ }
+
+ // post-layernorm
+ if (model.post_ln_w) {
+ inpL = build_norm(inpL, model.post_ln_w, model.post_ln_b, NORM_TYPE_NORMAL, eps, -1);
+ }
+
+ ggml_tensor * embeddings = inpL;
+
+ // process vision feature layers (used by granite)
+ {
+ // final layer is a vision feature layer
+ if (vision_feature_layer.find(max_feature_layer) != vision_feature_layer.end()) {
+ embedding_stack.push_back(inpL);
+ }
+
+ // If feature layers are explicitly set, stack them (if we have multiple)
+ if (!embedding_stack.empty()) {
+ embeddings = embedding_stack[0];
+ for (size_t i = 1; i < embedding_stack.size(); i++) {
+ embeddings = ggml_concat(ctx0, embeddings, embedding_stack[i], 0);
+ }
+ }
+ }
+
+ // llava projector (also used by granite)
+ if (hparams.has_llava_projector) {
+ embeddings = ggml_reshape_2d(ctx0, embeddings, embeddings->ne[0], embeddings->ne[1]);
+
+ ggml_tensor * patches = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_patches);
+ ggml_set_name(patches, "patches");
+ ggml_set_input(patches);
+
+ // shape [1, 576, 1024]
+ // ne is whcn, ne = [1024, 576, 1, 1]
+ embeddings = ggml_get_rows(ctx0, embeddings, patches);
+
+ // print_tensor_info(embeddings, "embeddings");
+
+ // llava projector
+ if (proj_type == PROJECTOR_TYPE_MLP) {
+ embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings);
+ embeddings = ggml_add(ctx0, embeddings, model.mm_0_b);
+
+ embeddings = ggml_gelu(ctx0, embeddings);
+ if (model.mm_2_w) {
+ embeddings = ggml_mul_mat(ctx0, model.mm_2_w, embeddings);
+ embeddings = ggml_add(ctx0, embeddings, model.mm_2_b);
+ }
+ }
+ else if (proj_type == PROJECTOR_TYPE_MLP_NORM) {
+ embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings);
+ embeddings = ggml_add(ctx0, embeddings, model.mm_0_b);
+ // ggml_tensor_printf(embeddings, "mm_0_w",0,true,false);
+ // First LayerNorm
+ embeddings = ggml_norm(ctx0, embeddings, eps);
+ embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.mm_1_w),
+ model.mm_1_b);
+
+ // GELU activation
+ embeddings = ggml_gelu(ctx0, embeddings);
+
+ // Second linear layer
+ embeddings = ggml_mul_mat(ctx0, model.mm_3_w, embeddings);
+ embeddings = ggml_add(ctx0, embeddings, model.mm_3_b);
+
+ // Second LayerNorm
+ embeddings = ggml_norm(ctx0, embeddings, eps);
+ embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.mm_4_w),
+ model.mm_4_b);
+ }
+ else if (proj_type == PROJECTOR_TYPE_LDP) {
+ // MobileVLM projector
+ int n_patch = 24;
+ ggml_tensor * mlp_1 = ggml_mul_mat(ctx0, model.mm_model_mlp_1_w, embeddings);
+ mlp_1 = ggml_add(ctx0, mlp_1, model.mm_model_mlp_1_b);
+ mlp_1 = ggml_gelu(ctx0, mlp_1);
+ ggml_tensor * mlp_3 = ggml_mul_mat(ctx0, model.mm_model_mlp_3_w, mlp_1);
+ mlp_3 = ggml_add(ctx0, mlp_3, model.mm_model_mlp_3_b);
+ // mlp_3 shape = [1, 576, 2048], ne = [2048, 576, 1, 1]
+
+ // block 1
+ ggml_tensor * block_1 = nullptr;
+ {
+ // transpose from [1, 576, 2048] --> [1, 2048, 576] --> [1, 2048, 24, 24]
+ mlp_3 = ggml_permute(ctx0, mlp_3, 1, 0, 2, 3);
+ mlp_3 = ggml_cont_4d(ctx0, mlp_3, n_patch, n_patch, mlp_3->ne[1], mlp_3->ne[2]);
+ // stride = 1, padding = 1, bias is nullptr
+ block_1 = ggml_conv_2d_dw(ctx0, model.mm_model_block_1_block_0_0_w, mlp_3, 1, 1, 1, 1, 1, 1);
+
+ // layer norm
+ // block_1 shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1]
+ block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 2, 0, 3));
+ // block_1 shape = [1, 24, 24, 2048], ne = [2048, 24, 24, 1]
+ block_1 = ggml_norm(ctx0, block_1, eps);
+ block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_1_block_0_1_w), model.mm_model_block_1_block_0_1_b);
+ block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 2, 0, 1, 3));
+
+ // block_1 shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1]
+ // hardswish
+ ggml_tensor * block_1_hw = ggml_hardswish(ctx0, block_1);
+
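+ // squeeze-and-excitation: global average pool -> fc1 -> relu -> fc2 -> hard-sigmoid,
+ // then scale block_1_hw by the resulting per-channel gate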
+ block_1 = ggml_pool_2d(ctx0, block_1_hw, GGML_OP_POOL_AVG, block_1_hw->ne[0], block_1_hw->ne[1], block_1_hw->ne[0], block_1_hw->ne[1], 0, 0);
+ // block_1 shape = [1, 2048, 1, 1], ne = [1, 1, 2048, 1]
+ // pointwise conv
+ block_1 = ggml_reshape_2d(ctx0, block_1, block_1->ne[0]*block_1->ne[1]*block_1->ne[2], block_1->ne[3]);
+ block_1 = ggml_mul_mat(ctx0, model.mm_model_block_1_block_1_fc1_w, block_1);
+ block_1 = ggml_add(ctx0, block_1, model.mm_model_block_1_block_1_fc1_b);
+ block_1 = ggml_relu(ctx0, block_1);
+ block_1 = ggml_mul_mat(ctx0, model.mm_model_block_1_block_1_fc2_w, block_1);
+ block_1 = ggml_add(ctx0, block_1, model.mm_model_block_1_block_1_fc2_b);
+ block_1 = ggml_hardsigmoid(ctx0, block_1);
+ // block_1_hw shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1], block_1 shape = [1, 2048], ne = [2048, 1, 1, 1]
+ block_1 = ggml_reshape_4d(ctx0, block_1, 1, 1, block_1->ne[0], block_1->ne[1]);
+ block_1 = ggml_mul(ctx0, block_1_hw, block_1);
+
+ int w = block_1->ne[0], h = block_1->ne[1];
+ block_1 = ggml_reshape_3d(ctx0, block_1, w*h, block_1->ne[2], block_1->ne[3]);
+ block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 0, 2, 3));
+
+ // block_1 shape = [1, 24*24, 2048], ne = [24*24, 2048, 1]
+ block_1 = ggml_mul_mat(ctx0, model.mm_model_block_1_block_2_0_w, block_1);
+ block_1 = ggml_reshape_4d(ctx0, block_1, block_1->ne[0], w, h, block_1->ne[3]);
+
+ // block_1 shape = [1, 24, 24, 2048], ne = [2048, 24, 24, 1]
+ block_1 = ggml_norm(ctx0, block_1, eps);
+ block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_1_block_2_1_w), model.mm_model_block_1_block_2_1_b);
+ block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 2, 0, 1, 3));
+ // block_1 shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1]
+ // residual
+ block_1 = ggml_add(ctx0, mlp_3, block_1);
+ }
+
+ // block_2
+ {
+ // stride = 2
+ block_1 = ggml_conv_2d_dw(ctx0, model.mm_model_block_2_block_0_0_w, block_1, 2, 2, 1, 1, 1, 1);
+
+ // block_1 shape = [1, 2048, 12, 12], ne = [12, 12, 2048, 1]
+ // layer norm
+ block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 2, 0, 3));
+ // block_1 shape = [1, 12, 12, 2048], ne = [2048, 12, 12, 1]
+ block_1 = ggml_norm(ctx0, block_1, eps);
+ block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_2_block_0_1_w), model.mm_model_block_2_block_0_1_b);
+ block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 2, 0, 1, 3));
+ // block_1 shape = [1, 2048, 12, 12], ne = [12, 12, 2048, 1]
+ // hardswish
+ ggml_tensor * block_1_hw = ggml_hardswish(ctx0, block_1);
+
+ // not sure the parameters are right for global average pooling
+ block_1 = ggml_pool_2d(ctx0, block_1_hw, GGML_OP_POOL_AVG, block_1_hw->ne[0], block_1_hw->ne[1], block_1_hw->ne[0], block_1_hw->ne[1], 0, 0);
+ // block_1 shape = [1, 2048, 1, 1], ne = [1, 1, 2048, 1]
+ // pointwise conv
+ block_1 = ggml_reshape_2d(ctx0, block_1, block_1->ne[0]*block_1->ne[1]*block_1->ne[2], block_1->ne[3]);
+ block_1 = ggml_mul_mat(ctx0, model.mm_model_block_2_block_1_fc1_w, block_1);
+ block_1 = ggml_add(ctx0, block_1, model.mm_model_block_2_block_1_fc1_b);
+ block_1 = ggml_relu(ctx0, block_1);
+ block_1 = ggml_mul_mat(ctx0, model.mm_model_block_2_block_1_fc2_w, block_1);
+ block_1 = ggml_add(ctx0, block_1, model.mm_model_block_2_block_1_fc2_b);
+ block_1 = ggml_hardsigmoid(ctx0, block_1);
+
+ // block_1_hw shape = [1, 2048, 12, 12], ne = [12, 12, 2048, 1], block_1 shape = [1, 2048, 1, 1], ne = [1, 1, 2048, 1]
+ block_1 = ggml_reshape_4d(ctx0, block_1, 1, 1, block_1->ne[0], block_1->ne[1]);
+ block_1 = ggml_mul(ctx0, block_1_hw, block_1);
+
+ int w = block_1->ne[0], h = block_1->ne[1];
+ block_1 = ggml_reshape_3d(ctx0, block_1, w*h, block_1->ne[2], block_1->ne[3]);
+ block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 0, 2, 3));
+ // block_1 shape = [1, 12*12, 2048], ne = [12*12, 2048, 1, 1]
+ block_1 = ggml_mul_mat(ctx0, model.mm_model_block_2_block_2_0_w, block_1);
+ block_1 = ggml_reshape_4d(ctx0, block_1, block_1->ne[0], w, h, block_1->ne[3]);
+
+ // block_1 shape = [1, 12, 12, 2048], ne = [2048, 12, 12, 1]
+ block_1 = ggml_norm(ctx0, block_1, eps);
+ block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_2_block_2_1_w), model.mm_model_block_2_block_2_1_b);
+ block_1 = ggml_reshape_3d(ctx0, block_1, block_1->ne[0], block_1->ne[1] * block_1->ne[2], block_1->ne[3]);
+ // block_1 shape = [1, 144, 2048], ne = [2048, 144, 1]
+ }
+ embeddings = block_1;
+ }
+ else if (proj_type == PROJECTOR_TYPE_LDPV2)
+ {
+ int n_patch = 24;
+ ggml_tensor * mlp_0 = ggml_mul_mat(ctx0, model.mm_model_mlp_0_w, embeddings);
+ mlp_0 = ggml_add(ctx0, mlp_0, model.mm_model_mlp_0_b);
+ mlp_0 = ggml_gelu(ctx0, mlp_0);
+ ggml_tensor * mlp_2 = ggml_mul_mat(ctx0, model.mm_model_mlp_2_w, mlp_0);
+ mlp_2 = ggml_add(ctx0, mlp_2, model.mm_model_mlp_2_b);
+ // mlp_2 ne = [2048, 576, 1, 1]
+ // AVG Pool 2*2, strides = 2
+ mlp_2 = ggml_permute(ctx0, mlp_2, 1, 0, 2, 3);
+ // mlp_2 ne = [576, 2048, 1, 1]
+ mlp_2 = ggml_cont_4d(ctx0, mlp_2, n_patch, n_patch, mlp_2->ne[1], mlp_2->ne[2]);
+ // mlp_2 ne [24, 24, 2048, 1]
+ mlp_2 = ggml_pool_2d(ctx0, mlp_2, GGML_OP_POOL_AVG, 2, 2, 2, 2, 0, 0);
+ // weight ne = [3, 3, 2048, 1]
+ ggml_tensor * peg_0 = ggml_conv_2d_dw(ctx0, model.mm_model_peg_0_w, mlp_2, 1, 1, 1, 1, 1, 1);
+ peg_0 = ggml_cont(ctx0, ggml_permute(ctx0, peg_0, 1, 2, 0, 3));
+ peg_0 = ggml_add(ctx0, peg_0, model.mm_model_peg_0_b);
+ mlp_2 = ggml_cont(ctx0, ggml_permute(ctx0, mlp_2, 1, 2, 0, 3));
+ peg_0 = ggml_add(ctx0, peg_0, mlp_2);
+ peg_0 = ggml_reshape_3d(ctx0, peg_0, peg_0->ne[0], peg_0->ne[1] * peg_0->ne[2], peg_0->ne[3]);
+ embeddings = peg_0;
+ }
+ else {
+ GGML_ABORT("fatal error");
+ }
+ }
+
+ // glm projector
+ else if (proj_type == PROJECTOR_TYPE_GLM_EDGE) {
+ size_t gridsz = (size_t)sqrt(embeddings->ne[1]);
+ embeddings = ggml_permute(ctx0, embeddings, 1, 0, 2, 3);
+ embeddings = ggml_cont_3d(ctx0, embeddings, gridsz, gridsz, embeddings->ne[1]);
+ embeddings = ggml_conv_2d(ctx0, model.mm_model_adapter_conv_w, embeddings, 2, 2, 0, 0, 1, 1);
+ embeddings = ggml_reshape_3d(ctx0, embeddings, embeddings->ne[0]*embeddings->ne[1], embeddings->ne[2], batch_size);
+ embeddings = ggml_cont(ctx0, ggml_permute(ctx0, embeddings, 1, 0, 2, 3));
+ embeddings = ggml_add(ctx0, embeddings, model.mm_model_adapter_conv_b);
+ // GLU
+ {
+ embeddings = ggml_mul_mat(ctx0, model.mm_model_mlp_0_w, embeddings);
+ embeddings = ggml_norm(ctx0, embeddings, eps);
+ embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.mm_model_ln_q_w), model.mm_model_ln_q_b);
+ embeddings = ggml_gelu_inplace(ctx0, embeddings);
+ ggml_tensor * x = embeddings;
+ embeddings = ggml_mul_mat(ctx0, model.mm_model_mlp_2_w, embeddings);
+ x = ggml_mul_mat(ctx0, model.mm_model_mlp_1_w, x);
+ embeddings = ggml_swiglu_split(ctx0, embeddings, x);
+ embeddings = ggml_mul_mat(ctx0, model.mm_model_mlp_3_w, embeddings);
+ }
+ // arrangement of BOI/EOI token embeddings
+ // note: these embeddings are not present in text model, hence we cannot process them as text tokens
+ // see: https://huggingface.co/THUDM/glm-edge-v-2b/blob/main/siglip.py#L53
+ {
+ embeddings = ggml_concat(ctx0, model.mm_boi, embeddings, 1); // BOI
+ embeddings = ggml_concat(ctx0, embeddings, model.mm_eoi, 1); // EOI
+ }
+ }
+
+ else {
+ GGML_ABORT("llava: unknown projector type");
+ }
+
+ // build the graph
+ ggml_build_forward_expand(gf, embeddings);
+
+ return gf;
+}
diff --git a/llama.cpp/tools/mtmd/models/minicpmv.cpp b/llama.cpp/tools/mtmd/models/minicpmv.cpp
new file mode 100644
index 0000000..3594ea2
--- /dev/null
+++ b/llama.cpp/tools/mtmd/models/minicpmv.cpp
@@ -0,0 +1,114 @@
+#include "models.h"
+
+ggml_cgraph * clip_graph_minicpmv::build() {
+ GGML_ASSERT(model.class_embedding == nullptr);
+ const int n_pos = n_patches;
+ const int n_embd_proj = n_mmproj_embd;
+
+ // position embeddings for the projector (not for ViT)
+ // see: https://huggingface.co/openbmb/MiniCPM-o-2_6/blob/main/resampler.py#L70
+ // base frequency omega
+ ggml_tensor * omega = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, n_embd_proj / 4);
+ ggml_set_name(omega, "omega");
+ ggml_set_input(omega);
+
+ // 2D input positions (using float for sinusoidal embeddings)
+ ggml_tensor * pos_h = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, 1, n_pos);
+ ggml_set_name(pos_h, "pos_h");
+ ggml_set_input(pos_h);
+ ggml_tensor * pos_w = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, 1, n_pos);
+ ggml_set_name(pos_w, "pos_w");
+ ggml_set_input(pos_w);
+
+ // for selecting learned pos embd, used by ViT
+ struct ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos);
+ ggml_set_name(positions, "positions");
+ ggml_set_input(positions);
+
+ ggml_tensor * learned_pos_embd = ggml_get_rows(ctx0, model.position_embeddings, positions);
+
+ ggml_tensor * inp = build_inp();
+ ggml_tensor * embeddings = build_vit(
+ inp, n_pos,
+ NORM_TYPE_NORMAL,
+ hparams.ffn_op,
+ learned_pos_embd,
+ nullptr);
+
+ // resampler projector (it is just another transformer)
+
+ ggml_tensor * q = model.mm_model_query;
+ ggml_tensor * v = ggml_mul_mat(ctx0, model.mm_model_kv_proj, embeddings);
+
+ // norm
+ q = build_norm(q, model.mm_model_ln_q_w, model.mm_model_ln_q_b, NORM_TYPE_NORMAL, eps, -1);
+ v = build_norm(v, model.mm_model_ln_kv_w, model.mm_model_ln_kv_b, NORM_TYPE_NORMAL, eps, -1);
+
+ // calculate sinusoidal pos embd
+ ggml_tensor * pos_embed = nullptr;
+ {
+ // outer product
+ ggml_tensor * omega_b = ggml_repeat_4d(ctx0, omega, omega->ne[0], n_pos, 1, 1); // n_pos rows
+ ggml_tensor * theta_x = ggml_mul(ctx0, omega_b, pos_w);
+ ggml_tensor * theta_y = ggml_mul(ctx0, omega_b, pos_h);
+ // sin and cos
+ ggml_tensor * pos_embd_x = ggml_concat(
+ ctx0,
+ ggml_sin(ctx0, theta_x),
+ ggml_cos(ctx0, theta_x),
+ 0 // concat on first dim
+ );
+ ggml_tensor * pos_embd_y = ggml_concat(
+ ctx0,
+ ggml_sin(ctx0, theta_y),
+ ggml_cos(ctx0, theta_y),
+ 0 // concat on first dim
+ );
+ pos_embed = ggml_concat(ctx0, pos_embd_x, pos_embd_y, 0);
+ }
+
+ // k = v + pos_embed
+ ggml_tensor * k = ggml_add(ctx0, v, pos_embed);
+
+ // attention
+ {
+ const int d_head = 128;
+ int n_head = n_embd_proj/d_head;
+ // number of learned query tokens, read from config (minicpmv_query_num)
+ int num_query = hparams.minicpmv_query_num;
+ ggml_tensor * Q = ggml_add(ctx0,
+ ggml_mul_mat(ctx0, model.mm_model_attn_q_w, q),
+ model.mm_model_attn_q_b);
+ ggml_tensor * K = ggml_add(ctx0,
+ ggml_mul_mat(ctx0, model.mm_model_attn_k_w, k),
+ model.mm_model_attn_k_b);
+ ggml_tensor * V = ggml_add(ctx0,
+ ggml_mul_mat(ctx0, model.mm_model_attn_v_w, v),
+ model.mm_model_attn_v_b);
+
+ Q = ggml_reshape_3d(ctx0, Q, d_head, n_head, num_query);
+ K = ggml_reshape_3d(ctx0, K, d_head, n_head, n_pos);
+ V = ggml_reshape_3d(ctx0, V, d_head, n_head, n_pos);
+
+ cb(Q, "resampler_Q", -1);
+ cb(K, "resampler_K", -1);
+ cb(V, "resampler_V", -1);
+
+ float resampler_kq_scale = 1.0f/ sqrtf(float(d_head));
+ embeddings = build_attn(
+ model.mm_model_attn_o_w,
+ model.mm_model_attn_o_b,
+ Q, K, V, nullptr, resampler_kq_scale, -1);
+ cb(embeddings, "resampler_attn_out", -1);
+ }
+ // layernorm
+ embeddings = build_norm(embeddings, model.mm_model_ln_post_w, model.mm_model_ln_post_b, NORM_TYPE_NORMAL, eps, -1);
+
+ // projection
+ embeddings = ggml_mul_mat(ctx0, model.mm_model_proj, embeddings);
+
+ // build the graph
+ ggml_build_forward_expand(gf, embeddings);
+
+ return gf;
+}
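Note: the sinusoidal block above builds a 2D sincos embedding per position:
[sin(x*omega), cos(x*omega), sin(y*omega), cos(y*omega)], each a quarter of the projector
dimension wide. A reference for one position, assuming the usual omega[i] = theta^(-i/(D/4))
with theta = 10000 (the omega tensor itself is filled outside this graph; sketch only):

    #include <cmath>
    #include <vector>

    static std::vector<float> sincos_2d_ref(float x, float y, int D, float theta = 10000.0f) {
        const int q = D / 4;
        std::vector<float> e(D);
        for (int i = 0; i < q; ++i) {
            const float omega = std::pow(theta, -(float) i / q);
            e[i]         = std::sin(x * omega);
            e[q + i]     = std::cos(x * omega);
            e[2 * q + i] = std::sin(y * omega);
            e[3 * q + i] = std::cos(y * omega);
        }
        return e;
    }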
diff --git a/llama.cpp/tools/mtmd/models/mobilenetv5.cpp b/llama.cpp/tools/mtmd/models/mobilenetv5.cpp
new file mode 100644
index 0000000..593afa1
--- /dev/null
+++ b/llama.cpp/tools/mtmd/models/mobilenetv5.cpp
@@ -0,0 +1,451 @@
+#include "models.h"
+
+// Helpers for MobileNetV5 Blocks
+// RMS Norm 2D - normalizes over channels for each spatial position
+ggml_tensor * clip_graph_mobilenetv5::rms_norm_2d(ggml_tensor * inp, ggml_tensor * weight, float eps) {
+ // inp: [W, H, C, B]
+
+ ggml_tensor * cur = ggml_permute(ctx0, inp, 2, 1, 0, 3);
+ cur = ggml_cont(ctx0, cur);
+ cur = ggml_rms_norm(ctx0, cur, eps);
+
+ if (weight) {
+ cur = ggml_mul(ctx0, cur, weight);
+ }
+
+ cur = ggml_permute(ctx0, cur, 2, 1, 0, 3);
+ cur = ggml_cont(ctx0, cur);
+
+ return cur;
+}
+
+// Conv2dSame padding - asymmetric SAME padding like PyTorch/TF
+ggml_tensor* clip_graph_mobilenetv5::pad_same_2d(ggml_tensor* inp, int kernel_h, int kernel_w, int stride_h, int stride_w, int dilation_h, int dilation_w) {
+ const int64_t ih = inp->ne[1]; // height
+ const int64_t iw = inp->ne[0]; // width
+
+ // Calculate output size (ceil division)
+ const int64_t oh = (ih + stride_h - 1) / stride_h;
+ const int64_t ow = (iw + stride_w - 1) / stride_w;
+
+ // Calculate padding needed
+ const int64_t pad_h = std::max((int64_t)0, (oh - 1) * stride_h + (kernel_h - 1) * dilation_h + 1 - ih);
+ const int64_t pad_w = std::max((int64_t)0, (ow - 1) * stride_w + (kernel_w - 1) * dilation_w + 1 - iw);
+
+ // Split padding asymmetrically
+ const int pad_h_top = pad_h / 2;
+ const int pad_h_bottom = pad_h - pad_h_top;
+ const int pad_w_left = pad_w / 2;
+ const int pad_w_right = pad_w - pad_w_left;
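+ // e.g. iw = 15, kernel = 3, stride = 2: ow = 8, pad_w = max(0, 7*2 + 2 + 1 - 15) = 2 -> 1 left, 1 right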
+
+ // Apply padding if needed
+ // ggml_pad_ext: (ctx, tensor, lp0, rp0, lp1, rp1, lp2, rp2, lp3, rp3)
+ // For [W, H, C, B]: p0=width, p1=height, p2=channels, p3=batch
+ if (pad_h > 0 || pad_w > 0) {
+ inp = ggml_pad_ext(ctx0, inp,
+ pad_w_left, pad_w_right, // width padding (dim 0)
+ pad_h_top, pad_h_bottom, // height padding (dim 1)
+ 0, 0, // no channel padding (dim 2)
+ 0, 0); // no batch padding (dim 3)
+ }
+
+ return inp;
+}
+
+
+// Edge Residual Block (Stage 0)
+ggml_tensor * clip_graph_mobilenetv5::build_edge_residual(ggml_tensor * inp, const mobilenetv5_block & block, int stride) {
+ ggml_tensor * cur = inp;
+
+ // 1. Expansion Conv (3x3)
+ if (stride == 2) {
+ // Case: Downsampling (Block 0)
+ // Replicates Conv2dSame(kernel=3, stride=2)
+ cur = pad_same_2d(cur, 3, 3, stride, stride);
+ cur = ggml_conv_2d_direct(ctx0, block.s0_conv_exp_w, cur, stride, stride, 0, 0, 1, 1);
+ } else {
+ // Case: Normal 3x3 Block (Block 1, 2)
+ // Replicates Conv2d(kernel=3, stride=1, padding=1)
+ cur = ggml_conv_2d_direct(ctx0, block.s0_conv_exp_w, cur, stride, stride, 1, 1, 1, 1);
+ }
+
+ // BN + Activation
+ if (block.s0_bn1_w) cur = rms_norm_2d(cur, block.s0_bn1_w);
+ cur = ggml_gelu(ctx0, cur);
+
+ // 2. Pointwise Linear Conv (1x1)
+ // 1x1 Convs usually have padding=0 and stride=1
+ cur = ggml_conv_2d_direct(ctx0, block.s0_conv_pwl_w, cur, 1, 1, 0, 0, 1, 1);
+ if (block.s0_bn2_w) cur = rms_norm_2d(cur, block.s0_bn2_w);
+
+ // 3. Residual Connection
+ // Only apply residual if spatial dimensions and channels match (stride 1)
+ if (stride == 1 && inp->ne[2] == cur->ne[2] && inp->ne[0] == cur->ne[0]) {
+ cur = ggml_add(ctx0, cur, inp);
+ }
+
+ return cur;
+}
+
+// Universal Inverted Residual Block (Stage 1+)
+ggml_tensor * clip_graph_mobilenetv5::build_inverted_residual(ggml_tensor * inp, const mobilenetv5_block & block, int stride) {
+ ggml_tensor * cur = inp;
+
+ // 1. Depthwise Start (Optional)
+ // NOTE: dw_start always has stride=1 (no downsampling here)
+ if (block.dw_start_w) {
+ int k = block.dw_start_w->ne[0]; // 3 or 5
+ int p = k / 2;
+ cur = ggml_conv_2d_dw(ctx0, block.dw_start_w, cur, 1, 1, p, p, 1, 1);
+ if (block.dw_start_bn_w) cur = rms_norm_2d(cur, block.dw_start_bn_w);
+ }
+
+ // 2. Pointwise Expansion (1x1)
+ if (block.pw_exp_w) {
+ // Standard 1x1 conv, pad=0, stride=1
+ cur = ggml_conv_2d_direct(ctx0, block.pw_exp_w, cur, 1, 1, 0, 0, 1, 1);
+ if (block.pw_exp_bn_w) cur = rms_norm_2d(cur, block.pw_exp_bn_w);
+ cur = ggml_gelu(ctx0, cur);
+ }
+
+ // 3. Depthwise Mid (Optional)
+ // NOTE: dw_mid is where downsampling happens (stride=2 for first block of stage)
+ if (block.dw_mid_w) {
+ int k = block.dw_mid_w->ne[0]; // 3 or 5
+
+ if (stride > 1) {
+ // Case: Stride 2 (Downsample) -> Use Asymmetric "Same" Padding
+ cur = pad_same_2d(cur, k, k, stride, stride);
+ cur = ggml_conv_2d_dw(ctx0, block.dw_mid_w, cur, stride, stride, 0, 0, 1, 1); // pad=0
+ } else {
+ // Case: Stride 1 -> Use Standard Symmetric Padding
+ int p = k / 2;
+ cur = ggml_conv_2d_dw(ctx0, block.dw_mid_w, cur, stride, stride, p, p, 1, 1);
+ }
+
+ if (block.dw_mid_bn_w) cur = rms_norm_2d(cur, block.dw_mid_bn_w);
+ cur = ggml_gelu(ctx0, cur);
+ }
+
+ // 4. Pointwise Projection (1x1)
+ if (block.pw_proj_w) {
+ cur = ggml_conv_2d_direct(ctx0, block.pw_proj_w, cur, 1, 1, 0, 0, 1, 1);
+ if (block.pw_proj_bn_w) cur = rms_norm_2d(cur, block.pw_proj_bn_w);
+ }
+
+ // Apply Layer Scaling if present
+ if (block.layer_scale_w) {
+ cur = ggml_mul(ctx0, cur, block.layer_scale_w);
+ }
+
+ // 5. Residual Connection
+ bool same_spatial = (inp->ne[0] == cur->ne[0]) && (inp->ne[1] == cur->ne[1]);
+ bool same_channel = (inp->ne[2] == cur->ne[2]);
+ if (same_spatial && same_channel) {
+ cur = ggml_add(ctx0, cur, inp);
+ }
+
+ return cur;
+}
+
+// Attention Block (MQA)
+ggml_tensor * clip_graph_mobilenetv5::build_mobilenet_attn(ggml_tensor * inp, const mobilenetv5_block & block) {
+ ggml_tensor * cur = inp;
+
+ // Norm
+ if (block.attn_norm_w) {
+ cur = rms_norm_2d(cur, block.attn_norm_w, 1e-6f);
+ }
+
+ // 1. Q Calculation
+ ggml_tensor * q = ggml_conv_2d_direct(ctx0, block.attn_q_w, cur, 1, 1, 0, 0, 1, 1);
+
+ // 2. K Calculation (Downsampled)
+ // Uses Conv2dSame(640, 640, kernel_size=(3, 3), stride=(2, 2), groups=640)
+ ggml_tensor * k_inp = cur;
+ if (block.attn_k_dw_w) {
+ int k_size = block.attn_k_dw_w->ne[0]; // Usually 3
+ k_inp = pad_same_2d(cur, k_size, k_size, 2, 2); // Apply SAME padding
+ k_inp = ggml_conv_2d_dw(ctx0, block.attn_k_dw_w, k_inp, 2, 2, 0, 0, 1, 1); // padding=0
+ if (block.attn_k_norm_w) {
+ k_inp = rms_norm_2d(k_inp, block.attn_k_norm_w, 1e-6f);
+ }
+ }
+ ggml_tensor * k = ggml_conv_2d_direct(ctx0, block.attn_k_w, k_inp, 1, 1, 0, 0, 1, 1);
+
+ // 3. V Calculation (Downsampled)
+ // Uses Conv2dSame(640, 640, kernel_size=(3, 3), stride=(2, 2), groups=640)
+ ggml_tensor * v_inp = cur;
+ if (block.attn_v_dw_w) {
+ int v_size = block.attn_v_dw_w->ne[0]; // Usually 3
+ v_inp = pad_same_2d(cur, v_size, v_size, 2, 2); // Apply SAME padding
+ v_inp = ggml_conv_2d_dw(ctx0, block.attn_v_dw_w, v_inp, 2, 2, 0, 0, 1, 1); // padding=0
+ if (block.attn_v_norm_w) {
+ v_inp = rms_norm_2d(v_inp, block.attn_v_norm_w, 1e-6f);
+ }
+ }
+ ggml_tensor * v = ggml_conv_2d_direct(ctx0, block.attn_v_w, v_inp, 1, 1, 0, 0, 1, 1);
+
+ const int W = cur->ne[0]; const int H = cur->ne[1]; const int B = cur->ne[3];
+ const int D = k->ne[2]; // Head dimension
+ const int n_head = q->ne[2] / D;
+ const int N = W * H;
+
+ // Process Q: [W, H, D*n_head, B] -> [D, N, n_head, B]
+ q = ggml_reshape_3d(ctx0, q, N, D*n_head, B);
+ q = ggml_reshape_4d(ctx0, q, N, D, n_head, B);
+ q = ggml_permute(ctx0, q, 1, 0, 2, 3); // [D, N, n_head, B]
+ q = ggml_cont(ctx0, q);
+
+ const int Wk = k->ne[0]; const int Hk = k->ne[1];
+ const int M = Wk * Hk;
+
+ // Process K: [Wk, Hk, D, B] -> [D, M, 1, B]
+ k = ggml_reshape_3d(ctx0, k, M, D, B);
+ k = ggml_reshape_4d(ctx0, k, M, D, 1, B);
+ k = ggml_permute(ctx0, k, 1, 0, 2, 3); // [D, M, 1, B]
+ k = ggml_cont(ctx0, k);
+
+ // Process V: [Wk, Hk, D, B] -> [M, D, 1, B]
+ v = ggml_reshape_3d(ctx0, v, M, D, B);
+ v = ggml_reshape_4d(ctx0, v, M, D, 1, B);
+ v = ggml_cont(ctx0, v); // [M, D, 1, B]
+
+ // Multi-Query Attention
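+ // a single shared K/V head (dim 2 == 1 above) is broadcast across all n_head query heads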
+ float scale = 1.0f / sqrtf((float)D);
+
+ // Step 1: Compute Q @ K.T
+ ggml_tensor * scores = ggml_mul_mat(ctx0, k, q);
+
+ scores = ggml_scale(ctx0, scores, scale);
+
+ scores = ggml_soft_max(ctx0, scores);
+
+ ggml_tensor * kqv = ggml_mul_mat(ctx0, v, scores);
+
+ kqv = ggml_permute(ctx0, kqv, 1, 0, 2, 3);
+ kqv = ggml_cont(ctx0, kqv);
+
+ kqv = ggml_reshape_3d(ctx0, kqv, N, D * n_head, B);
+ kqv = ggml_reshape_4d(ctx0, kqv, W, H, D * n_head, B);
+ kqv = ggml_cont(ctx0, kqv);
+
+ // Output projection
+ cur = ggml_conv_2d_direct(ctx0, block.attn_o_w, kqv, 1, 1, 0, 0, 1, 1);
+
+ // Residual & Layer Scale
+ if (inp->ne[0] == cur->ne[0] && inp->ne[2] == cur->ne[2]) {
+ if (block.layer_scale_w) {
+ cur = ggml_mul(ctx0, cur, block.layer_scale_w);
+ }
+ cur = ggml_add(ctx0, cur, inp);
+ }
+
+ return cur;
+}
+
+ggml_cgraph * clip_graph_mobilenetv5::build() {
+ ggml_tensor * inp = build_inp_raw();
+
+ // 1. Stem - Conv2dSame(3, 64, kernel_size=(3, 3), stride=(2, 2))
+ ggml_tensor * cur = pad_same_2d(inp, 3, 3, 2, 2); // Apply SAME padding
+
+ cur = ggml_conv_2d_direct(ctx0, model.mobilenet_stem_conv_w, cur, 2, 2, 0, 0, 1, 1); // padding=0
+ if (model.mobilenet_stem_conv_b) {
+ cur = ggml_add(ctx0, cur, model.mobilenet_stem_conv_b);
+ }
+ if (model.mobilenet_stem_norm_w) cur = rms_norm_2d(cur, model.mobilenet_stem_norm_w);
+ cur = ggml_gelu(ctx0, cur);
+
+ // 2. Blocks
+ std::vector<ggml_tensor*> intermediate_features;
+ const int total_blocks = model.mobilenet_blocks.size();
+
+ auto is_stage_start = [&](int i) {
+ if (i == 0) return true;
+ for (int end_idx : model.mobilenet_stage_ends) {
+ if (i == end_idx + 1) return true;
+ }
+ return false;
+ };
+
+ auto is_fusion_point = [&](int i) {
+ if (model.mobilenet_stage_ends.size() >= 4) {
+ if (i == model.mobilenet_stage_ends[2]) return true; // End of Stage 2
+ if (i == model.mobilenet_stage_ends[3]) return true; // End of Stage 3
+ } else {
+ if (i == total_blocks - 1) return true;
+ }
+ return false;
+ };
+
+ for (int i = 0; i < total_blocks; i++) {
+ const auto & block = model.mobilenet_blocks[i];
+ int stride = is_stage_start(i) ? 2 : 1;
+
+ if (block.s0_conv_exp_w) cur = build_edge_residual(cur, block, stride);
+ else if (block.attn_q_w) cur = build_mobilenet_attn(cur, block);
+ else cur = build_inverted_residual(cur, block, stride);
+
+ if (is_fusion_point(i)) {
+            intermediate_features.push_back(cur);
+ }
+ }
+
+ // 3. Multi-Scale Fusion Adapter (MSFA)
+ if (!intermediate_features.empty()) {
+
+        // A. Reference Resolution: the PyTorch implementation uses inputs[0].
+        // We assume intermediate_features[0] is the "high resolution" target.
+        // In MobileNet designs this is typically the feature map with the
+        // smallest stride, i.e. the largest spatial size (e.g. 32x32).
+ ggml_tensor* target_feat = intermediate_features[0];
+ int high_res_w = target_feat->ne[0];
+ int high_res_h = target_feat->ne[1];
+
+ std::vector<ggml_tensor*> resized_feats;
+
+ // B. Resize inputs to match inputs[0] (High Resolution)
+ for (auto feat : intermediate_features) {
+ int feat_w = feat->ne[0];
+ int feat_h = feat->ne[1];
+
+ // PyTorch: if feat_size < high_resolution: interpolate
+ if (feat_w < high_res_w || feat_h < high_res_h) {
+            // Calculate the scale factor.
+            // Note: PyTorch 'nearest' works with arbitrary float scales, while
+            // ggml_upscale takes an integer factor, so we assume integer
+            // (typically power-of-2) scaling, e.g. 16 -> 32 means scale=2.
+            int scale_w = high_res_w / feat_w;
+            int scale_h = high_res_h / feat_h;
+
+            // Safety check: strict replication requires an integer, uniform
+            // scale in both dimensions
+            GGML_ASSERT(high_res_w % feat_w == 0);
+            GGML_ASSERT(high_res_h % feat_h == 0);
+            GGML_ASSERT(scale_w == scale_h);
+
+            // Upsample (nearest neighbor); scale_w is the integer scale factor
+            feat = ggml_upscale(ctx0, feat, scale_w, ggml_scale_mode::GGML_SCALE_MODE_NEAREST);
+ }
+ resized_feats.push_back(feat);
+ }
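+
+        // Worked example (illustrative): with a 32x32 reference and a 16x16
+        // feature map, scale_w = 2 and nearest upscaling repeats each source
+        // pixel into a 2x2 block, matching F.interpolate(mode='nearest') for
+        // integer factors.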
+
+ // C. Concatenate at High Resolution (Channel Dim = 2 in ggml)
+ cur = resized_feats[0];
+ for (size_t k = 1; k < resized_feats.size(); ++k) {
+ cur = ggml_concat(ctx0, cur, resized_feats[k], 2);
+ }
+
+ // D. FFN (UniversalInvertedResidual)
+ // Structure: Expand Conv -> Norm -> GELU -> Project Conv -> Norm
+
+ // 1. Expansion
+ if (model.msfa_ffn_expand_w) {
+ // 1x1 Conv
+ cur = ggml_conv_2d_direct(ctx0, model.msfa_ffn_expand_w, cur, 1, 1, 0, 0, 1, 1);
+
+ if (model.msfa_ffn_expand_bn) {
+ cur = rms_norm_2d(cur, model.msfa_ffn_expand_bn);
+ }
+
+ cur = ggml_gelu(ctx0, cur);
+
+ }
+
+    // 2. Projection (no depthwise conv stage, since its kernel size is 0 in this config)
+ if (model.msfa_ffn_project_w) {
+ // 1x1 Conv
+ cur = ggml_conv_2d_direct(ctx0, model.msfa_ffn_project_w, cur, 1, 1, 0, 0, 1, 1);
+
+ // UniversalInvertedResidual typically has a norm after projection
+ if (model.msfa_ffn_project_bn) {
+ cur = rms_norm_2d(cur, model.msfa_ffn_project_bn);
+ }
+
+ }
+
+ // E. Final Downsample to Target Resolution (Output Resolution)
+ // PyTorch: matches self.output_resolution (e.g. 16x16)
+ const int target_out_res = 16;
+ int current_w = cur->ne[0];
+
+ if (current_w > target_out_res) {
+ int s = current_w / target_out_res;
+
+ GGML_ASSERT(current_w % target_out_res == 0);
+
+ // Avg Pool: Kernel=s, Stride=s
+ cur = ggml_pool_2d(ctx0, cur, GGML_OP_POOL_AVG, s, s, s, s, 0, 0);
+
+ }
+
+ // F. Final Norm
+ if (model.msfa_concat_norm_w) {
+ cur = rms_norm_2d(cur, model.msfa_concat_norm_w);
+
+ }
+ }
+
+ // 4. Gemma 3n Multimodal Projection (Embedder)
+ // Input: 'cur' is [Width, Height, Channels, Batch]
+ int W = cur->ne[0];
+ int H = cur->ne[1];
+ int C = cur->ne[2];
+ int B = cur->ne[3];
+
+ GGML_ASSERT(C == hparams.n_embd);
+
+ // 1. Permute and Flatten to [Channels, Tokens, Batch]
+ // PyTorch expects (Batch, Seq, Hidden), GGML usually processes (Hidden, Seq, Batch)
+ cur = ggml_permute(ctx0, cur, 2, 1, 0, 3); // -> [C, H, W, B]
+ cur = ggml_permute(ctx0, cur, 0, 2, 1, 3); // -> [C, W, H, B]
+ cur = ggml_cont(ctx0, cur);
+ cur = ggml_reshape_3d(ctx0, cur, C, W*H, B);
+ cur = ggml_cont(ctx0, cur);
+
+ // 2. FEATURE SCALING
+ // PyTorch: vision_outputs *= self.config.vision_config.hidden_size**0.5
+ const float scale_factor = sqrtf((float)C);
+ cur = ggml_scale(ctx0, cur, scale_factor);
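+    // worked example (illustrative): for C = 2048 the factor is
+    // sqrt(2048) ~= 45.25, bringing vision features to a magnitude comparable
+    // to Gemma's sqrt(hidden_size)-scaled token embeddings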
+
+ // 3. SOFT EMBEDDING NORM
+ // PyTorch: self._norm(x) * self.weight
+ // We must normalize regardless, then multiply if weight exists.
+ {
+ const float eps = 1e-6f; // Gemma3n uses 1e-6
+ cur = ggml_rms_norm(ctx0, cur, eps);
+
+ if (model.mm_soft_emb_norm_w) {
+ // Weight shape is (2048,) -> Element-wise broadcast multiply
+ cur = ggml_mul(ctx0, cur, model.mm_soft_emb_norm_w);
+ }
+
+ }
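+    // for reference, the block above computes a standard RMS norm:
+    //   rms_norm(x) = x / sqrt(mean(x_i^2) + eps)
+    // followed by an element-wise multiply with the (2048,) weight when present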
+
+ // 4. PROJECTION
+ // PyTorch: embedding_projection = nn.Linear(vision_hidden, text_hidden, bias=False)
+ // Weight stored as [out_features, in_features] = [text_hidden_size, vision_hidden_size]
+ if (model.mm_input_proj_w) {
+ cur = ggml_mul_mat(ctx0, model.mm_input_proj_w, cur);
+ }
+
+ // 5. POST PROJECTION NORM
+ // PyTorch: embedding_post_projection_norm = Gemma3nRMSNorm(..., with_scale=False)
+ // with_scale=False means weight is registered as buffer with value 1.0
+ // So output = rms_norm(x) * 1.0 = rms_norm(x), magnitude ~1
+ {
+ const float eps = 1e-6f;
+ cur = ggml_rms_norm(ctx0, cur, eps);
+
+ if (model.mm_post_proj_norm_w) {
+ // If weight is loaded, multiply (should be ~1.0 anyway)
+ cur = ggml_mul(ctx0, cur, model.mm_post_proj_norm_w);
+ }
+ }
+
+ ggml_build_forward_expand(gf, cur);
+ return gf;
+}
diff --git a/llama.cpp/tools/mtmd/models/models.h b/llama.cpp/tools/mtmd/models/models.h
new file mode 100644
index 0000000..c4c67ac
--- /dev/null
+++ b/llama.cpp/tools/mtmd/models/models.h
@@ -0,0 +1,118 @@
+#pragma once
+
+#include "../clip-graph.h"
+
+/*
+ * IMPORTANT: The mtmd module does NOT accept pull requests that are fully or predominantly AI-generated.
+ * We encourage human contributors to ensure the quality and reliability of the codebase.
+ */
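+
+/*
+ * Each builder below is a thin subclass of clip_graph that overrides build()
+ * to construct the ggml compute graph for one image (or audio) input;
+ * model-specific helpers (e.g. the MobileNetV5 block builders) live on the
+ * corresponding subclass.
+ */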
+
+struct clip_graph_siglip : clip_graph {
+ clip_graph_siglip(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
+ ggml_cgraph * build() override;
+};
+
+struct clip_graph_pixtral : clip_graph {
+ clip_graph_pixtral(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
+ ggml_cgraph * build() override;
+};
+
+struct clip_graph_qwen2vl : clip_graph {
+ clip_graph_qwen2vl(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
+ ggml_cgraph * build() override;
+};
+
+struct clip_graph_qwen3vl : clip_graph {
+ clip_graph_qwen3vl(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
+ ggml_cgraph * build() override;
+};
+
+struct clip_graph_youtuvl : clip_graph {
+ clip_graph_youtuvl(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
+ ggml_cgraph * build() override;
+};
+
+struct clip_graph_minicpmv : clip_graph {
+ clip_graph_minicpmv(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
+ ggml_cgraph * build() override;
+};
+
+struct clip_graph_internvl : clip_graph {
+ clip_graph_internvl(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
+ ggml_cgraph * build() override;
+};
+
+struct clip_graph_llama4 : clip_graph {
+ clip_graph_llama4(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
+ ggml_cgraph * build() override;
+};
+
+struct clip_graph_kimivl : clip_graph {
+ clip_graph_kimivl(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
+ ggml_cgraph * build() override;
+};
+
+struct clip_graph_cogvlm : clip_graph {
+ clip_graph_cogvlm(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
+ ggml_cgraph * build() override;
+};
+
+struct clip_graph_llava : clip_graph {
+ clip_graph_llava(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
+ ggml_cgraph * build() override;
+};
+
+struct clip_graph_whisper_enc : clip_graph {
+ clip_graph_whisper_enc(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
+ ggml_cgraph * build() override;
+};
+
+struct clip_graph_conformer : clip_graph {
+ clip_graph_conformer(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
+ ggml_cgraph * build() override;
+};
+
+struct clip_graph_glm4v : clip_graph {
+ clip_graph_glm4v(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
+ ggml_cgraph * build() override;
+};
+
+struct clip_graph_mobilenetv5 : clip_graph {
+ clip_graph_mobilenetv5(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
+ ggml_cgraph * build() override;
+
+ ggml_tensor * rms_norm_2d(
+ ggml_tensor * inp,
+ ggml_tensor * weight,
+ float eps = 1e-6f);
+
+    ggml_tensor * pad_same_2d(
+        ggml_tensor * inp,
+ int kernel_h,
+ int kernel_w,
+ int stride_h,
+ int stride_w,
+ int dilation_h = 1,
+ int dilation_w = 1);
+
+ ggml_tensor * build_edge_residual(
+ ggml_tensor * inp,
+ const mobilenetv5_block & block,
+ int stride);
+
+ ggml_tensor * build_inverted_residual(
+ ggml_tensor * inp,
+ const mobilenetv5_block & block,
+ int stride);
+
+ ggml_tensor * build_mobilenet_attn(
+ ggml_tensor * inp,
+ const mobilenetv5_block & block);
+};
+
+struct clip_graph_kimik25 : clip_graph {
+ clip_graph_kimik25(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
+ ggml_cgraph * build() override;
+
+ ggml_tensor * resize_position_embeddings_3d(uint32_t interpolation_mode);
+};
diff --git a/llama.cpp/tools/mtmd/models/pixtral.cpp b/llama.cpp/tools/mtmd/models/pixtral.cpp
new file mode 100644
index 0000000..a849210
--- /dev/null
+++ b/llama.cpp/tools/mtmd/models/pixtral.cpp
@@ -0,0 +1,86 @@
+#include "models.h"
+
+ggml_cgraph * clip_graph_pixtral::build() {
+ const int n_merge = hparams.n_merge;
+
+ // 2D input positions
+ ggml_tensor * pos_h = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_patches);
+ ggml_set_name(pos_h, "pos_h");
+ ggml_set_input(pos_h);
+
+ ggml_tensor * pos_w = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_patches);
+ ggml_set_name(pos_w, "pos_w");
+ ggml_set_input(pos_w);
+
+ auto add_pos = [&](ggml_tensor * cur, const clip_layer &) {
+ return build_rope_2d(ctx0, cur, pos_h, pos_w, hparams.rope_theta, true);
+ };
+
+ ggml_tensor * inp = build_inp();
+ ggml_tensor * cur = build_vit(
+ inp, n_patches,
+ NORM_TYPE_RMS,
+ hparams.ffn_op,
+ nullptr, // no learned pos embd
+ add_pos);
+
+ // mistral small 3.1 patch merger
+ // ref: https://github.com/huggingface/transformers/blob/7a3e208892c06a5e278144eaf38c8599a42f53e7/src/transformers/models/mistral3/modeling_mistral3.py#L67
+ if (model.mm_patch_merger_w) {
+ GGML_ASSERT(hparams.n_merge > 0);
+
+ cur = ggml_mul(ctx0, ggml_rms_norm(ctx0, cur, eps), model.mm_input_norm_w);
+
+ // reshape image tokens to 2D grid
+ cur = ggml_reshape_3d(ctx0, cur, n_embd, n_patches_x, n_patches_y);
+ cur = ggml_permute(ctx0, cur, 2, 0, 1, 3); // [x, y, n_embd]
+ cur = ggml_cont(ctx0, cur);
+
+ // torch.nn.functional.unfold is just an im2col under the hood
+ // we just need a dummy kernel to make it work
+ ggml_tensor * kernel = ggml_view_3d(ctx0, cur, n_merge, n_merge, cur->ne[2], 0, 0, 0);
+ cur = ggml_im2col(ctx0, kernel, cur, n_merge, n_merge, 0, 0, 1, 1, true, inp->type);
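+
+        // shape walk-through (illustrative, assuming n_merge = 2): im2col
+        // gathers every 2x2 window of the [x, y, n_embd] grid into one column,
+        // so the reshape below yields [n_embd * 4, (x/2) * (y/2)], one
+        // n_embd*4 vector per merged 2x2 patch group, ready for mm_patch_merger_w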
+
+ // project to n_embd
+ cur = ggml_reshape_2d(ctx0, cur, cur->ne[0], cur->ne[1] * cur->ne[2]);
+ cur = ggml_mul_mat(ctx0, model.mm_patch_merger_w, cur);
+ }
+
+ // LlavaMultiModalProjector (always using GELU activation)
+ {
+ cur = build_ffn(cur,
+ model.mm_1_w, model.mm_1_b,
+ nullptr, nullptr,
+ model.mm_2_w, model.mm_2_b,
+ FFN_GELU,
+ -1);
+ }
+
+ // arrangement of the [IMG_BREAK] token
+ if (model.token_embd_img_break) {
+ // not efficient, but works
+ // the trick is to view the embeddings as a 3D tensor with shape [n_embd, n_patches_per_row, n_rows]
+ // and then concatenate the [IMG_BREAK] token to the end of each row, aka n_patches_per_row dimension
+ // after the concatenation, we have a tensor with shape [n_embd, n_patches_per_row + 1, n_rows]
+
+ const int p_y = n_merge > 0 ? n_patches_y / n_merge : n_patches_y;
+ const int p_x = n_merge > 0 ? n_patches_x / n_merge : n_patches_x;
+ const int p_total = p_x * p_y;
+ const int n_embd_text = cur->ne[0];
+ const int n_tokens_output = p_total + p_y - 1; // one [IMG_BREAK] per row, except the last row
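+        // e.g. p_x = p_y = 2 gives 2*2 + 2 - 1 = 5 output tokens:
+        //   a b [IMG_BREAK] c d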
+
+ ggml_tensor * tmp = ggml_reshape_3d(ctx0, cur, n_embd_text, p_x, p_y);
+ ggml_tensor * tok = ggml_new_tensor_3d(ctx0, tmp->type, n_embd_text, 1, p_y);
+ tok = ggml_scale(ctx0, tok, 0.0); // clear the tensor
+ tok = ggml_add(ctx0, tok, model.token_embd_img_break);
+ tmp = ggml_concat(ctx0, tmp, tok, 1);
+ cur = ggml_view_2d(ctx0, tmp,
+ n_embd_text, n_tokens_output,
+ ggml_row_size(tmp->type, n_embd_text), 0);
+ }
+
+ // build the graph
+ ggml_build_forward_expand(gf, cur);
+
+ return gf;
+}
diff --git a/llama.cpp/tools/mtmd/models/qwen2vl.cpp b/llama.cpp/tools/mtmd/models/qwen2vl.cpp
new file mode 100644
index 0000000..85f158b
--- /dev/null
+++ b/llama.cpp/tools/mtmd/models/qwen2vl.cpp
@@ -0,0 +1,183 @@
+#include "models.h"
+
+ggml_cgraph * clip_graph_qwen2vl::build() {
+ GGML_ASSERT(model.patch_bias == nullptr);
+ GGML_ASSERT(model.class_embedding == nullptr);
+
+ const int batch_size = 1;
+ const bool use_window_attn = hparams.n_wa_pattern > 0;
+ const int n_wa_pattern = hparams.n_wa_pattern;
+ const int n_pos = n_patches;
+    const int num_position_ids = n_pos * 4; // m-rope requires 4 position components per token
+
+ norm_type norm_t = proj_type == PROJECTOR_TYPE_QWEN25VL
+ ? NORM_TYPE_RMS // qwen 2.5 vl
+ : NORM_TYPE_NORMAL; // qwen 2 vl
+
+ int mrope_sections[4] = {d_head/4, d_head/4, d_head/4, d_head/4};
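+    // the positions input carries 4 coordinates per token; the rotary dims are
+    // split into 4 equal sections, one per coordinate (see ggml_rope_multi below)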
+
+ ggml_tensor * inp_raw = build_inp_raw();
+ ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
+
+ GGML_ASSERT(img.nx % (patch_size * 2) == 0);
+ GGML_ASSERT(img.ny % (patch_size * 2) == 0);
+
+ // second conv dimension
+ {
+ auto inp_1 = ggml_conv_2d(ctx0, model.patch_embeddings_1, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
+ inp = ggml_add(ctx0, inp, inp_1);
+
+ inp = ggml_permute(ctx0, inp, 1, 2, 0, 3); // [w, h, c, b] -> [c, w, h, b]
+ inp = ggml_cont_4d(
+ ctx0, inp,
+ n_embd * 2, n_patches_x / 2, n_patches_y, batch_size);
+ inp = ggml_reshape_4d(
+ ctx0, inp,
+ n_embd * 2, n_patches_x / 2, 2, batch_size * (n_patches_y / 2));
+ inp = ggml_permute(ctx0, inp, 0, 2, 1, 3);
+ inp = ggml_cont_3d(
+ ctx0, inp,
+ n_embd, n_patches_x * n_patches_y, batch_size);
+ }
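+    // the two convs are assumed to correspond to the two temporal slices of
+    // the reference conv3d patch embedding, and the reshuffle above makes each
+    // 2x2 patch block contiguous for the 4-way merge in the projector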
+
+ ggml_tensor * inpL = inp;
+ ggml_tensor * window_mask = nullptr;
+ ggml_tensor * window_idx = nullptr;
+ ggml_tensor * inv_window_idx = nullptr;
+
+ ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_position_ids);
+ ggml_set_name(positions, "positions");
+ ggml_set_input(positions);
+
+ // pre-layernorm
+ if (model.pre_ln_w) {
+ inpL = build_norm(inpL, model.pre_ln_w, model.pre_ln_b, norm_t, eps, -1);
+ }
+
+ if (use_window_attn) {
+ // handle window attention inputs
+ inv_window_idx = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos / 4);
+ ggml_set_name(inv_window_idx, "inv_window_idx");
+ ggml_set_input(inv_window_idx);
+ // mask for window attention
+ window_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_pos, n_pos);
+ ggml_set_name(window_mask, "window_mask");
+ ggml_set_input(window_mask);
+
+ // if flash attn is used, we need to pad the mask and cast to f16
+ if (flash_attn_type == CLIP_FLASH_ATTN_TYPE_ENABLED) {
+ window_mask = ggml_cast(ctx0, window_mask, GGML_TYPE_F16);
+ }
+
+ // inpL shape: [n_embd, n_patches_x * n_patches_y, batch_size]
+ GGML_ASSERT(batch_size == 1);
+ inpL = ggml_reshape_2d(ctx0, inpL, n_embd * 4, n_patches_x * n_patches_y * batch_size / 4);
+ inpL = ggml_get_rows(ctx0, inpL, inv_window_idx);
+ inpL = ggml_reshape_3d(ctx0, inpL, n_embd, n_patches_x * n_patches_y, batch_size);
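+
+        // sketch of the reorder (assumed semantics of inv_window_idx): rows of
+        // 4 consecutive patches (one 2x2 merge unit) are gathered so that all
+        // units of the same attention window become contiguous; window_idx
+        // restores the original order after the layers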
+ }
+
+ // loop over layers
+ for (int il = 0; il < n_layer; il++) {
+ const auto & layer = model.layers[il];
+ const bool full_attn = use_window_attn ? (il + 1) % n_wa_pattern == 0 : true;
+
+ ggml_tensor * cur = inpL; // inpL = residual, cur = hidden_states
+
+ // layernorm1
+ cur = build_norm(cur, layer.ln_1_w, layer.ln_1_b, norm_t, eps, il);
+ cb(cur, "ln1", il);
+
+ // self-attention
+ {
+ ggml_tensor * Qcur = ggml_add(ctx0,
+ ggml_mul_mat(ctx0, layer.q_w, cur), layer.q_b);
+ ggml_tensor * Kcur = ggml_add(ctx0,
+ ggml_mul_mat(ctx0, layer.k_w, cur), layer.k_b);
+ ggml_tensor * Vcur = ggml_add(ctx0,
+ ggml_mul_mat(ctx0, layer.v_w, cur), layer.v_b);
+
+ Qcur = ggml_reshape_3d(ctx0, Qcur, d_head, n_head, n_patches);
+ Kcur = ggml_reshape_3d(ctx0, Kcur, d_head, n_head, n_patches);
+ Vcur = ggml_reshape_3d(ctx0, Vcur, d_head, n_head, n_patches);
+
+ cb(Qcur, "Qcur", il);
+ cb(Kcur, "Kcur", il);
+ cb(Vcur, "Vcur", il);
+
+ // apply M-RoPE
+ Qcur = ggml_rope_multi(
+ ctx0, Qcur, positions, nullptr,
+ d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, 10000, 1, 0, 1, 32, 1);
+ Kcur = ggml_rope_multi(
+ ctx0, Kcur, positions, nullptr,
+ d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, 10000, 1, 0, 1, 32, 1);
+
+ cb(Qcur, "Qcur_rope", il);
+ cb(Kcur, "Kcur_rope", il);
+
+ ggml_tensor * attn_mask = full_attn ? nullptr : window_mask;
+
+ cur = build_attn(layer.o_w, layer.o_b,
+ Qcur, Kcur, Vcur, attn_mask, kq_scale, il);
+ cb(cur, "attn_out", il);
+ }
+
+        // add the layer input back (residual connection)
+ cur = ggml_add(ctx0, cur, inpL);
+
+ inpL = cur; // inpL = residual, cur = hidden_states
+
+ cb(cur, "ffn_inp", il);
+
+ // layernorm2
+ cur = build_norm(cur, layer.ln_2_w, layer.ln_2_b, norm_t, eps, il);
+ cb(cur, "ffn_inp_normed", il);
+
+ // ffn
+ cur = build_ffn(cur,
+ layer.ff_up_w, layer.ff_up_b,
+ layer.ff_gate_w, layer.ff_gate_b,
+ layer.ff_down_w, layer.ff_down_b,
+ hparams.ffn_op, il);
+
+ cb(cur, "ffn_out", il);
+
+ // residual 2
+ cur = ggml_add(ctx0, inpL, cur);
+ cb(cur, "layer_out", il);
+
+ inpL = cur;
+ }
+
+ // post-layernorm
+ if (model.post_ln_w) {
+ inpL = build_norm(inpL, model.post_ln_w, model.post_ln_b, norm_t, eps, n_layer);
+ }
+
+ // multimodal projection
+ ggml_tensor * embeddings = inpL;
+ embeddings = ggml_reshape_3d(ctx0, embeddings, n_embd * 4, n_pos / 4, batch_size);
+ embeddings = build_ffn(embeddings,
+ model.mm_0_w, model.mm_0_b,
+ nullptr, nullptr,
+ model.mm_1_w, model.mm_1_b,
+ FFN_GELU,
+ -1);
+
+ if (use_window_attn) {
+ window_idx = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos / 4);
+ ggml_set_name(window_idx, "window_idx");
+ ggml_set_input(window_idx);
+
+        // embeddings shape here: [projection_dim, n_patches_x * n_patches_y / 4, batch_size]
+ GGML_ASSERT(batch_size == 1);
+ embeddings = ggml_reshape_2d(ctx0, embeddings, hparams.projection_dim, n_patches_x * n_patches_y / 4);
+ embeddings = ggml_get_rows(ctx0, embeddings, window_idx);
+ embeddings = ggml_reshape_3d(ctx0, embeddings, hparams.projection_dim, n_patches_x * n_patches_y / 4, batch_size);
+ }
+
+ // build the graph
+ ggml_build_forward_expand(gf, embeddings);
+
+ return gf;
+}
diff --git a/llama.cpp/tools/mtmd/models/qwen3vl.cpp b/llama.cpp/tools/mtmd/models/qwen3vl.cpp
new file mode 100644
index 0000000..5ecb10f
--- /dev/null
+++ b/llama.cpp/tools/mtmd/models/qwen3vl.cpp
@@ -0,0 +1,193 @@
+#include "models.h"
+
+ggml_cgraph * clip_graph_qwen3vl::build() {
+ GGML_ASSERT(model.patch_bias != nullptr);
+ GGML_ASSERT(model.position_embeddings != nullptr);
+ GGML_ASSERT(model.class_embedding == nullptr);
+
+ const int batch_size = 1;
+ const int n_pos = n_patches;
+    const int num_position_ids = n_pos * 4; // m-rope requires 4 position components per token
+
+ norm_type norm_t = NORM_TYPE_NORMAL;
+
+ int mrope_sections[4] = {d_head/4, d_head/4, d_head/4, d_head/4};
+
+ ggml_tensor * inp_raw = build_inp_raw();
+ ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
+
+ GGML_ASSERT(img.nx % (patch_size * 2) == 0);
+ GGML_ASSERT(img.ny % (patch_size * 2) == 0);
+
+ // second conv dimension
+ {
+ auto inp_1 = ggml_conv_2d(ctx0, model.patch_embeddings_1, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
+ inp = ggml_add(ctx0, inp, inp_1);
+
+ inp = ggml_permute(ctx0, inp, 1, 2, 0, 3); // [w, h, c, b] -> [c, w, h, b]
+ inp = ggml_cont_4d(
+ ctx0, inp,
+ n_embd * 2, n_patches_x / 2, n_patches_y, batch_size);
+ inp = ggml_reshape_4d(
+ ctx0, inp,
+ n_embd * 2, n_patches_x / 2, 2, batch_size * (n_patches_y / 2));
+ inp = ggml_permute(ctx0, inp, 0, 2, 1, 3);
+ inp = ggml_cont_3d(
+ ctx0, inp,
+ n_embd, n_patches_x * n_patches_y, batch_size);
+ }
+
+ // add patch bias
+ if (model.patch_bias != nullptr) {
+ inp = ggml_add(ctx0, inp, model.patch_bias);
+ cb(inp, "patch_bias", -1);
+ }
+
+ // calculate absolute position embedding and apply
+ ggml_tensor * learned_pos_embd = resize_position_embeddings();
+ learned_pos_embd = ggml_cont_4d(
+ ctx0, learned_pos_embd,
+ n_embd * 2, n_patches_x / 2, n_patches_y, batch_size);
+ learned_pos_embd = ggml_reshape_4d(
+ ctx0, learned_pos_embd,
+ n_embd * 2, n_patches_x / 2, 2, batch_size * (n_patches_y / 2));
+ learned_pos_embd = ggml_permute(ctx0, learned_pos_embd, 0, 2, 1, 3);
+ learned_pos_embd = ggml_cont_3d(
+ ctx0, learned_pos_embd,
+ n_embd, n_patches_x * n_patches_y, batch_size);
+ inp = ggml_add(ctx0, inp, learned_pos_embd);
+ cb(inp, "inp_pos_emb", -1);
+
+ ggml_tensor * inpL = inp;
+
+ ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_position_ids);
+ ggml_set_name(positions, "positions");
+ ggml_set_input(positions);
+
+ // pre-layernorm
+ if (model.pre_ln_w) {
+ inpL = build_norm(inpL, model.pre_ln_w, model.pre_ln_b, norm_t, eps, -1);
+ }
+
+ // deepstack features (stack along the feature dimension), [n_embd * len(deepstack_layers), n_patches_x * n_patches_y, batch_size]
+ ggml_tensor * deepstack_features = nullptr;
+ const int merge_factor = hparams.n_merge > 0 ? hparams.n_merge * hparams.n_merge : 4; // default 2x2=4 for qwen3vl
+
+ // loop over layers
+ for (int il = 0; il < n_layer; il++) {
+ auto & layer = model.layers[il];
+
+ ggml_tensor * cur = inpL; // inpL = residual, cur = hidden_states
+
+ // layernorm1
+ cur = build_norm(cur, layer.ln_1_w, layer.ln_1_b, norm_t, eps, il);
+ cb(cur, "ln1", il);
+
+ // self-attention
+ {
+ cur = ggml_mul_mat(ctx0, layer.qkv_w, cur);
+ cur = ggml_add(ctx0, cur, layer.qkv_b);
+
+ ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, d_head, n_head, n_pos,
+ /* nb1 */ ggml_row_size(cur->type, d_head),
+ /* nb2 */ cur->nb[1],
+ /* offset */ 0);
+
+ ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, d_head, n_head, n_pos,
+ /* nb1 */ ggml_row_size(cur->type, d_head),
+ /* nb2 */ cur->nb[1],
+ /* offset */ ggml_row_size(cur->type, n_embd));
+
+ ggml_tensor * Vcur = ggml_view_3d(ctx0, cur, d_head, n_head, n_pos,
+ /* nb1 */ ggml_row_size(cur->type, d_head),
+ /* nb2 */ cur->nb[1],
+ /* offset */ ggml_row_size(cur->type, 2 * n_embd));
+
+ cb(Qcur, "Qcur", il);
+ cb(Kcur, "Kcur", il);
+ cb(Vcur, "Vcur", il);
+
+ // apply M-RoPE
+ Qcur = ggml_rope_multi(
+ ctx0, Qcur, positions, nullptr,
+ d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, 10000, 1, 0, 1, 32, 1);
+ Kcur = ggml_rope_multi(
+ ctx0, Kcur, positions, nullptr,
+ d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, 10000, 1, 0, 1, 32, 1);
+
+ cb(Qcur, "Qcur_rope", il);
+ cb(Kcur, "Kcur_rope", il);
+
+ cur = build_attn(layer.o_w, layer.o_b,
+ Qcur, Kcur, Vcur, nullptr, kq_scale, il);
+ cb(cur, "attn_out", il);
+ }
+
+        // add the layer input back (residual connection)
+ cur = ggml_add(ctx0, cur, inpL);
+
+ inpL = cur; // inpL = residual, cur = hidden_states
+
+ cb(cur, "ffn_inp", il);
+
+ // layernorm2
+ cur = build_norm(cur, layer.ln_2_w, layer.ln_2_b, norm_t, eps, il);
+ cb(cur, "ffn_inp_normed", il);
+
+ // ffn
+ cur = build_ffn(cur,
+ layer.ff_up_w, layer.ff_up_b,
+ layer.ff_gate_w, layer.ff_gate_b,
+ layer.ff_down_w, layer.ff_down_b,
+ hparams.ffn_op, il);
+
+ cb(cur, "ffn_out", il);
+
+ // residual 2
+ cur = ggml_add(ctx0, inpL, cur);
+ cb(cur, "layer_out", il);
+
+ if (layer.has_deepstack()) {
+ ggml_tensor * feat = ggml_reshape_3d(ctx0, cur, n_embd * merge_factor, n_pos / merge_factor, batch_size);
+ feat = build_norm(feat, layer.deepstack_norm_w, layer.deepstack_norm_b, norm_t, eps, il);
+ feat = build_ffn(feat,
+ layer.deepstack_fc1_w, layer.deepstack_fc1_b,
+ nullptr, nullptr,
+ layer.deepstack_fc2_w, layer.deepstack_fc2_b,
+ ffn_op_type::FFN_GELU, il);
+
+            if (!deepstack_features) {
+ deepstack_features = feat;
+ } else {
+ // concat along the feature dimension
+ deepstack_features = ggml_concat(ctx0, deepstack_features, feat, 0);
+ }
+ }
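+
+        // deepstack (assumed intent, per the Qwen3-VL design): selected
+        // intermediate layers are tapped, each 2x2 patch group is merged and
+        // projected by a small FFN, and the results are concatenated along
+        // dim 0 so the LLM side can split them out per layer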
+
+ inpL = cur;
+ }
+
+ // post-layernorm
+ if (model.post_ln_w) {
+ inpL = build_norm(inpL, model.post_ln_w, model.post_ln_b, norm_t, eps, n_layer);
+ }
+
+ // multimodal projection
+ ggml_tensor * embeddings = inpL;
+ embeddings = ggml_reshape_3d(ctx0, embeddings, n_embd * 4, n_pos / 4, batch_size);
+
+ embeddings = build_ffn(embeddings,
+ model.mm_0_w, model.mm_0_b,
+ nullptr, nullptr,
+ model.mm_1_w, model.mm_1_b,
+ ffn_op_type::FFN_GELU, -1);
+
+    // concat deepstack features along the feature dimension
+    if (deepstack_features) {
+        embeddings = ggml_concat(ctx0, embeddings, deepstack_features, 0);
+    }
+
+ // build the graph
+ ggml_build_forward_expand(gf, embeddings);
+
+ return gf;
+}
diff --git a/llama.cpp/tools/mtmd/models/siglip.cpp b/llama.cpp/tools/mtmd/models/siglip.cpp
new file mode 100644
index 0000000..b866a11
--- /dev/null
+++ b/llama.cpp/tools/mtmd/models/siglip.cpp
@@ -0,0 +1,86 @@
+#include "models.h"
+
+ggml_cgraph * clip_graph_siglip::build() {
+ ggml_tensor * inp = build_inp();
+
+ ggml_tensor * learned_pos_embd = model.position_embeddings;
+ if (proj_type == PROJECTOR_TYPE_LFM2) {
+ learned_pos_embd = resize_position_embeddings();
+ }
+
+ ggml_tensor * cur = build_vit(
+ inp, n_patches,
+ NORM_TYPE_NORMAL,
+ hparams.ffn_op,
+ learned_pos_embd,
+ nullptr);
+
+ if (proj_type == PROJECTOR_TYPE_GEMMA3) {
+ const int batch_size = 1;
+ GGML_ASSERT(n_patches_x == n_patches_y);
+ const int patches_per_image = n_patches_x;
+ const int kernel_size = hparams.n_merge;
+
+ cur = ggml_transpose(ctx0, cur);
+ cur = ggml_cont_4d(ctx0, cur, patches_per_image, patches_per_image, n_embd, batch_size);
+
+ // doing a pool2d to reduce the number of output tokens
+ cur = ggml_pool_2d(ctx0, cur, GGML_OP_POOL_AVG, kernel_size, kernel_size, kernel_size, kernel_size, 0, 0);
+ cur = ggml_reshape_3d(ctx0, cur, cur->ne[0] * cur->ne[0], n_embd, batch_size);
+ cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur));
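+
+        // e.g. with patches_per_image = 64 and kernel_size = 4 (illustrative
+        // numbers), the average pool reduces 64x64 patches to 16x16 = 256
+        // output tokens before the projection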
+
+ // apply norm before projection
+ cur = ggml_rms_norm(ctx0, cur, eps);
+ cur = ggml_mul(ctx0, cur, model.mm_soft_emb_norm_w);
+
+ // apply projection
+ cur = ggml_mul_mat(ctx0,
+ ggml_cont(ctx0, ggml_transpose(ctx0, model.mm_input_proj_w)),
+ cur);
+
+ } else if (proj_type == PROJECTOR_TYPE_IDEFICS3) {
+ // pixel_shuffle
+ // https://github.com/huggingface/transformers/blob/0a950e0bbe1ed58d5401a6b547af19f15f0c195e/src/transformers/models/idefics3/modeling_idefics3.py#L578
+ const int scale_factor = model.hparams.n_merge;
+ cur = build_patch_merge_permute(cur, scale_factor);
+ cur = ggml_mul_mat(ctx0, model.projection, cur);
+
+ } else if (proj_type == PROJECTOR_TYPE_LFM2) {
+ // pixel unshuffle block
+ const int scale_factor = model.hparams.n_merge;
+ cur = build_patch_merge_permute(cur, scale_factor);
+
+ // projection, in LFM2-VL input norm is optional
+ if (model.mm_input_norm_w) {
+ cur = ggml_norm(ctx0, cur, 1e-5); // default nn.LayerNorm
+ cur = ggml_mul(ctx0, cur, model.mm_input_norm_w);
+ }
+
+ if (model.mm_input_norm_b) {
+ cur = ggml_add(ctx0, cur, model.mm_input_norm_b);
+ }
+
+ cur = build_ffn(cur,
+ model.mm_1_w, model.mm_1_b,
+ nullptr, nullptr,
+ model.mm_2_w, model.mm_2_b,
+ FFN_GELU,
+ -1);
+
+ } else if (proj_type == PROJECTOR_TYPE_JANUS_PRO) {
+ cur = build_ffn(cur,
+ model.mm_0_w, model.mm_0_b,
+ nullptr, nullptr,
+ model.mm_1_w, model.mm_1_b,
+ hparams.ffn_op,
+ -1);
+
+ } else {
+ GGML_ABORT("SigLIP: Unsupported projector type");
+ }
+
+ // build the graph
+ ggml_build_forward_expand(gf, cur);
+
+ return gf;
+}
diff --git a/llama.cpp/tools/mtmd/models/whisper-enc.cpp b/llama.cpp/tools/mtmd/models/whisper-enc.cpp
new file mode 100644
index 0000000..2f2b127
--- /dev/null
+++ b/llama.cpp/tools/mtmd/models/whisper-enc.cpp
@@ -0,0 +1,115 @@
+#include "models.h"
+
+ggml_cgraph * clip_graph_whisper_enc::build() {
+ const int n_frames = img.nx;
+ const int n_pos = n_frames / 2;
+ GGML_ASSERT(model.position_embeddings->ne[1] >= n_pos);
+
+ ggml_tensor * inp = build_inp_raw(1);
+
+ // conv1d block
+ {
+ // convolution + gelu
+ ggml_tensor * cur = ggml_conv_1d_ph(ctx0, model.conv1d_1_w, inp, 1, 1);
+ cur = ggml_add(ctx0, cur, model.conv1d_1_b);
+
+ cur = ggml_gelu_erf(ctx0, cur);
+
+ cur = ggml_conv_1d_ph(ctx0, model.conv1d_2_w, cur, 2, 1);
+ cur = ggml_add(ctx0, cur, model.conv1d_2_b);
+
+ cur = ggml_gelu_erf(ctx0, cur);
+ // transpose
+ inp = ggml_cont(ctx0, ggml_transpose(ctx0, cur));
+ cb(inp, "after_conv1d", -1);
+ }
+
+ // sanity check (only check one layer, but it should be the same for all)
+ GGML_ASSERT(model.layers[0].ln_1_w && model.layers[0].ln_1_b);
+ GGML_ASSERT(model.layers[0].ln_2_w && model.layers[0].ln_2_b);
+ GGML_ASSERT(model.layers[0].q_b);
+ GGML_ASSERT(model.layers[0].v_b);
+ GGML_ASSERT(!model.layers[0].k_b); // no bias for k
+
+ ggml_tensor * pos_embd_selected = ggml_view_2d(
+ ctx0, model.position_embeddings,
+ model.position_embeddings->ne[0], n_pos,
+ model.position_embeddings->nb[1], 0
+ );
+ ggml_tensor * cur = build_vit(
+ inp, n_pos,
+ NORM_TYPE_NORMAL,
+ hparams.ffn_op,
+ pos_embd_selected,
+ nullptr);
+
+ cb(cur, "after_transformer", -1);
+
+ if (model.audio_has_stack_frames()) {
+ // StackAudioFrames
+ // https://huggingface.co/fixie-ai/ultravox-v0_5-llama-3_2-1b/blob/main/ultravox_model.py
+ cur = build_stack(cur, hparams.proj_stack_factor, n_embd);
+ cb(cur, "after_stacked", -1);
+ }
+
+ if (proj_type == PROJECTOR_TYPE_ULTRAVOX) {
+ // UltravoxProjector
+ // pre-norm
+ cur = ggml_rms_norm(ctx0, cur, 1e-6);
+ cur = ggml_mul(ctx0, cur, model.mm_norm_pre_w);
+
+ // ffn in
+ cur = ggml_mul_mat(ctx0, model.mm_1_w, cur);
+
+ // swiglu
+        // see SwiGLU in ultravox_model.py: the second half of the vector is passed through silu, not the first half
+ cur = ggml_swiglu_swapped(ctx0, cur);
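+        // i.e. out = x[..n/2] * silu(x[n/2..]), with silu on the second half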
+
+ // mid-norm
+ cur = ggml_rms_norm(ctx0, cur, 1e-6);
+ cur = ggml_mul(ctx0, cur, model.mm_norm_mid_w);
+
+ // ffn out
+ cur = ggml_mul_mat(ctx0, model.mm_2_w, cur);
+
+ } else if (proj_type == PROJECTOR_TYPE_QWEN2A) {
+ // projector
+ cur = ggml_mul_mat(ctx0, model.mm_fc_w, cur);
+ cur = ggml_add(ctx0, cur, model.mm_fc_b);
+
+ } else if (proj_type == PROJECTOR_TYPE_VOXTRAL) {
+ // projector
+ cur = build_ffn(cur,
+ model.mm_1_w, model.mm_1_b,
+ nullptr, nullptr,
+ model.mm_2_w, model.mm_2_b,
+ FFN_GELU_ERF,
+ -1);
+
+ } else if (proj_type == PROJECTOR_TYPE_MUSIC_FLAMINGO) {
+ // projector
+ cur = build_ffn(cur,
+ model.mm_1_w, model.mm_1_b,
+ nullptr, nullptr,
+ model.mm_2_w, model.mm_2_b,
+ FFN_GELU_ERF,
+ -1);
+
+ } else if (proj_type == PROJECTOR_TYPE_GLMA) {
+ cur = ggml_norm(ctx0, cur, hparams.eps);
+ cur = ggml_mul(ctx0, cur, model.mm_norm_pre_w);
+ cur = ggml_add(ctx0, cur, model.mm_norm_pre_b);
+ cur = build_stack(cur, hparams.proj_stack_factor, n_embd);
+ cur = build_ffn(cur, model.mm_1_w, model.mm_1_b, nullptr, nullptr, model.mm_2_w, model.mm_2_b, hparams.ffn_op, 0);
+ cur = ggml_concat(ctx0, model.mm_boi, cur, 1);
+ cur = ggml_concat(ctx0, cur, model.mm_eoi, 1);
+ } else {
+ GGML_ABORT("%s: unknown projector type", __func__);
+ }
+
+ cb(cur, "projected", -1);
+
+ ggml_build_forward_expand(gf, cur);
+
+ return gf;
+}
diff --git a/llama.cpp/tools/mtmd/models/youtuvl.cpp b/llama.cpp/tools/mtmd/models/youtuvl.cpp
new file mode 100644
index 0000000..ffbf2be
--- /dev/null
+++ b/llama.cpp/tools/mtmd/models/youtuvl.cpp
@@ -0,0 +1,179 @@
+#include "models.h"
+
+ggml_cgraph * clip_graph_youtuvl::build() {
+ GGML_ASSERT(model.class_embedding == nullptr);
+ const int batch_size = 1;
+ const bool use_window_attn = !hparams.wa_layer_indexes.empty();
+ const int n_pos = n_patches;
+ const int num_position_ids = n_pos * 4;
+ const int m = 2;
+ const int Wp = n_patches_x;
+ const int Hp = n_patches_y;
+ const int Hm = Hp / m;
+ const int Wm = Wp / m;
+ norm_type norm_t = NORM_TYPE_NORMAL;
+
+ int mrope_sections[4] = {d_head/4, d_head/4, d_head/4, d_head/4};
+
+ ggml_tensor * inp = build_inp_raw();
+
+    // replace the conv3d patch embedding with an equivalent linear projection:
+    // reshape and permute the raw pixels from (patch_size, m, Wm, patch_size, m, Hm, C)
+    // to (C, patch_size, patch_size, m, m, Wm, Hm) so each row is one flattened patch
+ {
+ inp = ggml_reshape_4d(
+ ctx0, inp,
+ Wm * m * patch_size, m * patch_size, Hm, 3);
+ inp = ggml_permute(ctx0, inp, 1, 2, 3, 0);
+ inp = ggml_cont_4d(
+ ctx0, inp,
+ m * patch_size * 3, Wm, m * patch_size, Hm);
+
+ inp = ggml_permute(ctx0, inp, 0, 2, 1, 3);
+ inp = ggml_cont_4d(
+ ctx0, inp,
+ m * patch_size * 3, patch_size, m, Hm * Wm);
+
+ inp = ggml_permute(ctx0, inp, 1, 0, 2, 3);
+ inp = ggml_cont_4d(
+ ctx0, inp,
+ patch_size, 3, patch_size, Hm * Wm * m * m);
+
+ inp = ggml_permute(ctx0, inp, 2, 0, 1, 3);
+ inp = ggml_cont_3d(
+ ctx0, inp,
+ 3*patch_size* patch_size, Hm * Wm * m * m, 1);
+ }
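+    // after this shuffle each row of inp is one flattened patch
+    // (3 * patch_size * patch_size values), ordered so that the m x m (2x2)
+    // spatial-merge neighbors are adjacent (assumed from the permute chain
+    // above), which lets a plain matmul stand in for the conv3d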
+ inp = ggml_mul_mat(ctx0, model.patch_embeddings_0, inp);
+
+ if (model.patch_bias) {
+ inp = ggml_add(ctx0, inp, model.patch_bias);
+ }
+
+ inp = ggml_reshape_2d(ctx0, inp, n_embd, n_patches);
+
+ ggml_tensor * inpL = inp;
+ ggml_tensor * window_mask = nullptr;
+ ggml_tensor * window_idx = nullptr;
+ ggml_tensor * inv_window_idx = nullptr;
+
+ ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_position_ids);
+ ggml_set_name(positions, "positions");
+ ggml_set_input(positions);
+
+ // pre-layernorm
+ if (model.pre_ln_w) {
+ inpL = build_norm(inpL, model.pre_ln_w, model.pre_ln_b, norm_t, eps, -1);
+ }
+ if (use_window_attn) {
+ inv_window_idx = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos / 4);
+ ggml_set_name(inv_window_idx, "inv_window_idx");
+ ggml_set_input(inv_window_idx);
+ // mask for window attention
+ window_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_pos, n_pos);
+ ggml_set_name(window_mask, "window_mask");
+ ggml_set_input(window_mask);
+
+ // if flash attn is used, we need to pad the mask and cast to f16
+ if (flash_attn_type == CLIP_FLASH_ATTN_TYPE_ENABLED) {
+ window_mask = ggml_cast(ctx0, window_mask, GGML_TYPE_F16);
+ }
+
+ // inpL shape: [n_embd, n_patches_x * n_patches_y, batch_size]
+ GGML_ASSERT(batch_size == 1);
+ inpL = ggml_reshape_2d(ctx0, inpL, n_embd * 4, n_patches_x * n_patches_y * batch_size / 4);
+ inpL = ggml_get_rows(ctx0, inpL, inv_window_idx);
+ inpL = ggml_reshape_3d(ctx0, inpL, n_embd, n_patches_x * n_patches_y, batch_size);
+ }
+
+ // loop over layers
+ for (int il = 0; il < n_layer; il++) {
+ const auto & layer = model.layers[il];
+ const bool full_attn = use_window_attn ? hparams.wa_layer_indexes.count(il) > 0 : true;
+
+ ggml_tensor * cur = inpL; // inpL = residual, cur = hidden_states
+
+ // layernorm1
+ cur = build_norm(cur, layer.ln_1_w, layer.ln_1_b, norm_t, eps, il);
+ // self-attention
+ {
+ ggml_tensor * Qcur = ggml_add(ctx0,
+ ggml_mul_mat(ctx0, layer.q_w, cur), layer.q_b);
+ ggml_tensor * Kcur = ggml_add(ctx0,
+ ggml_mul_mat(ctx0, layer.k_w, cur), layer.k_b);
+ ggml_tensor * Vcur = ggml_add(ctx0,
+ ggml_mul_mat(ctx0, layer.v_w, cur), layer.v_b);
+
+ Qcur = ggml_reshape_3d(ctx0, Qcur, d_head, n_head, n_patches);
+ Kcur = ggml_reshape_3d(ctx0, Kcur, d_head, n_head, n_patches);
+ Vcur = ggml_reshape_3d(ctx0, Vcur, d_head, n_head, n_patches);
+
+ Qcur = ggml_rope_multi(
+ ctx0, Qcur, positions, nullptr,
+ d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, 10000, 1, 0, 1, 32, 1);
+ Kcur = ggml_rope_multi(
+ ctx0, Kcur, positions, nullptr,
+ d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, 10000, 1, 0, 1, 32, 1);
+
+ ggml_tensor * attn_mask = full_attn ? nullptr : window_mask;
+
+ cur = build_attn(layer.o_w, layer.o_b,
+ Qcur, Kcur, Vcur, attn_mask, kq_scale, il);
+ }
+    // add the layer input back (residual connection)
+ cur = ggml_add(ctx0, cur, inpL);
+
+ inpL = cur; // inpL = residual, cur = hidden_states
+
+ // layernorm2
+ cur = build_norm(cur, layer.ln_2_w, layer.ln_2_b, norm_t, eps, il);
+
+ // ffn
+ cur = build_ffn(cur,
+ layer.ff_up_w, layer.ff_up_b,
+ nullptr, nullptr,
+ layer.ff_down_w, layer.ff_down_b,
+ hparams.ffn_op, il);
+
+ // residual 2
+ cur = ggml_add(ctx0, inpL, cur);
+
+ inpL = cur;
+ }
+
+ ggml_tensor * embeddings = inpL;
+ if (use_window_attn) {
+ const int spatial_merge_unit = 4;
+ window_idx = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos / spatial_merge_unit);
+ ggml_set_name(window_idx, "window_idx");
+ ggml_set_input(window_idx);
+ GGML_ASSERT(batch_size == 1);
+ embeddings = ggml_reshape_2d(ctx0, embeddings, n_embd * spatial_merge_unit, n_patches / spatial_merge_unit);
+ embeddings = ggml_get_rows(ctx0, embeddings, window_idx);
+ embeddings = ggml_reshape_3d(ctx0, embeddings, n_embd, n_patches, batch_size);
+ cb(embeddings, "window_order_restored", -1);
+ }
+
+ // post-layernorm (part of Siglip2VisionTransformer, applied after encoder)
+ if (model.post_ln_w) {
+ embeddings = build_norm(embeddings, model.post_ln_w, model.post_ln_b, norm_t, eps, n_layer);
+ }
+
+ // Now apply merger (VLPatchMerger):
+ // 1. Apply RMS norm (ln_q in VLPatchMerger)
+ embeddings = build_norm(embeddings, model.mm_input_norm_w, nullptr, NORM_TYPE_RMS, 1e-6, -1);
+ cb(embeddings, "merger_normed", -1);
+
+ // 2. First reshape for spatial merge (merge 2x2 patches)
+ embeddings = ggml_reshape_3d(ctx0, embeddings, n_embd * 4, n_pos / 4, batch_size);
+ cb(embeddings, "merger_reshaped", -1);
+
+ embeddings = build_ffn(embeddings,
+ model.mm_0_w, model.mm_0_b,
+ nullptr, nullptr,
+ model.mm_1_w, model.mm_1_b,
+ FFN_GELU,
+ -1);
+ ggml_build_forward_expand(gf, embeddings);
+
+ return gf;
+}