1#include "models.h"
 2
 3ggml_cgraph * clip_graph_siglip::build() {
 4    ggml_tensor * inp = build_inp();
 5
 6    ggml_tensor * learned_pos_embd = model.position_embeddings;
 7    if (proj_type == PROJECTOR_TYPE_LFM2) {
 8        learned_pos_embd = resize_position_embeddings();
 9    }
10
11    ggml_tensor * cur = build_vit(
12                            inp, n_patches,
13                            NORM_TYPE_NORMAL,
14                            hparams.ffn_op,
15                            learned_pos_embd,
16                            nullptr);
17
18    if (proj_type == PROJECTOR_TYPE_GEMMA3) {
19        const int batch_size = 1;
20        GGML_ASSERT(n_patches_x == n_patches_y);
21        const int patches_per_image = n_patches_x;
22        const int kernel_size = hparams.n_merge;
23
24        cur = ggml_transpose(ctx0, cur);
25        cur = ggml_cont_4d(ctx0, cur, patches_per_image, patches_per_image, n_embd, batch_size);
26
27        // doing a pool2d to reduce the number of output tokens
28        cur = ggml_pool_2d(ctx0, cur, GGML_OP_POOL_AVG, kernel_size, kernel_size, kernel_size, kernel_size, 0, 0);
29        cur = ggml_reshape_3d(ctx0, cur, cur->ne[0] * cur->ne[0], n_embd, batch_size);
30        cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur));
31
32        // apply norm before projection
33        cur = ggml_rms_norm(ctx0, cur, eps);
34        cur = ggml_mul(ctx0, cur, model.mm_soft_emb_norm_w);
35
36        // apply projection
37        cur = ggml_mul_mat(ctx0,
38            ggml_cont(ctx0, ggml_transpose(ctx0, model.mm_input_proj_w)),
39            cur);
40
41    } else if (proj_type == PROJECTOR_TYPE_IDEFICS3) {
42        // pixel_shuffle
43        // https://github.com/huggingface/transformers/blob/0a950e0bbe1ed58d5401a6b547af19f15f0c195e/src/transformers/models/idefics3/modeling_idefics3.py#L578
44        const int scale_factor = model.hparams.n_merge;
45        cur = build_patch_merge_permute(cur, scale_factor);
46        cur = ggml_mul_mat(ctx0, model.projection, cur);
47
48    } else if (proj_type == PROJECTOR_TYPE_LFM2) {
49        // pixel unshuffle block
50        const int scale_factor = model.hparams.n_merge;
51        cur = build_patch_merge_permute(cur, scale_factor);
52
53        // projection, in LFM2-VL input norm is optional
54        if (model.mm_input_norm_w) {
55            cur = ggml_norm(ctx0, cur, 1e-5); // default nn.LayerNorm
56            cur = ggml_mul(ctx0, cur, model.mm_input_norm_w);
57        }
58
59        if (model.mm_input_norm_b) {
60            cur = ggml_add(ctx0, cur, model.mm_input_norm_b);
61        }
62
63        cur = build_ffn(cur,
64            model.mm_1_w, model.mm_1_b,
65            nullptr, nullptr,
66            model.mm_2_w, model.mm_2_b,
67            FFN_GELU,
68            -1);
69
70    } else if (proj_type == PROJECTOR_TYPE_JANUS_PRO) {
71        cur = build_ffn(cur,
72            model.mm_0_w, model.mm_0_b,
73            nullptr, nullptr,
74            model.mm_1_w, model.mm_1_b,
75            hparams.ffn_op,
76            -1);
77
78    } else {
79        GGML_ABORT("SigLIP: Unsupported projector type");
80    }
81
82    // build the graph
83    ggml_build_forward_expand(gf, cur);
84
85    return gf;
86}