1#include "models.h"
2
3ggml_cgraph * clip_graph_siglip::build() {
4 ggml_tensor * inp = build_inp();
5
6 ggml_tensor * learned_pos_embd = model.position_embeddings;
7 if (proj_type == PROJECTOR_TYPE_LFM2) {
8 learned_pos_embd = resize_position_embeddings();
9 }
10
11 ggml_tensor * cur = build_vit(
12 inp, n_patches,
13 NORM_TYPE_NORMAL,
14 hparams.ffn_op,
15 learned_pos_embd,
16 nullptr);
17
18 if (proj_type == PROJECTOR_TYPE_GEMMA3) {
19 const int batch_size = 1;
20 GGML_ASSERT(n_patches_x == n_patches_y);
21 const int patches_per_image = n_patches_x;
22 const int kernel_size = hparams.n_merge;
23
24 cur = ggml_transpose(ctx0, cur);
25 cur = ggml_cont_4d(ctx0, cur, patches_per_image, patches_per_image, n_embd, batch_size);
26
27 // doing a pool2d to reduce the number of output tokens
28 cur = ggml_pool_2d(ctx0, cur, GGML_OP_POOL_AVG, kernel_size, kernel_size, kernel_size, kernel_size, 0, 0);
29 cur = ggml_reshape_3d(ctx0, cur, cur->ne[0] * cur->ne[0], n_embd, batch_size);
30 cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur));
31
32 // apply norm before projection
33 cur = ggml_rms_norm(ctx0, cur, eps);
34 cur = ggml_mul(ctx0, cur, model.mm_soft_emb_norm_w);
35
36 // apply projection
37 cur = ggml_mul_mat(ctx0,
38 ggml_cont(ctx0, ggml_transpose(ctx0, model.mm_input_proj_w)),
39 cur);
40
41 } else if (proj_type == PROJECTOR_TYPE_IDEFICS3) {
42 // pixel_shuffle
43 // https://github.com/huggingface/transformers/blob/0a950e0bbe1ed58d5401a6b547af19f15f0c195e/src/transformers/models/idefics3/modeling_idefics3.py#L578
44 const int scale_factor = model.hparams.n_merge;
45 cur = build_patch_merge_permute(cur, scale_factor);
46 cur = ggml_mul_mat(ctx0, model.projection, cur);
47
48 } else if (proj_type == PROJECTOR_TYPE_LFM2) {
49 // pixel unshuffle block
50 const int scale_factor = model.hparams.n_merge;
51 cur = build_patch_merge_permute(cur, scale_factor);
52
53 // projection, in LFM2-VL input norm is optional
54 if (model.mm_input_norm_w) {
55 cur = ggml_norm(ctx0, cur, 1e-5); // default nn.LayerNorm
56 cur = ggml_mul(ctx0, cur, model.mm_input_norm_w);
57 }
58
59 if (model.mm_input_norm_b) {
60 cur = ggml_add(ctx0, cur, model.mm_input_norm_b);
61 }
62
63 cur = build_ffn(cur,
64 model.mm_1_w, model.mm_1_b,
65 nullptr, nullptr,
66 model.mm_2_w, model.mm_2_b,
67 FFN_GELU,
68 -1);
69
70 } else if (proj_type == PROJECTOR_TYPE_JANUS_PRO) {
71 cur = build_ffn(cur,
72 model.mm_0_w, model.mm_0_b,
73 nullptr, nullptr,
74 model.mm_1_w, model.mm_1_b,
75 hparams.ffn_op,
76 -1);
77
78 } else {
79 GGML_ABORT("SigLIP: Unsupported projector type");
80 }
81
82 // build the graph
83 ggml_build_forward_expand(gf, cur);
84
85 return gf;
86}