1#include "models.h"
2
3// this graph is used by llava, granite and glm
4// due to having embedding_stack (used by granite), we cannot reuse build_vit
// Build the ggml compute graph for a ViT-style vision encoder followed by one of
// several multimodal projectors (MLP, MLP+LayerNorm, MobileVLM LDP, MobileVLM LDPv2,
// GLM-Edge adapter). Shared by llava, granite-vision and glm-edge models.
// The embedding_stack mechanism (collecting hidden states from multiple encoder
// layers, used by granite) is the reason the generic build_vit() cannot be reused.
// Returns the graph `gf` with the final projected embeddings expanded into it.
ggml_cgraph * clip_graph_llava::build() {
    const int batch_size = 1; // graph is built for a single image at a time
    // one position per image patch, plus one for the CLS token when present
    const int n_pos = n_patches + (model.class_embedding ? 1 : 0);

    GGML_ASSERT(n_patches_x == n_patches_y && "only square images supported");

    // Calculate the deepest feature layer based on hparams and projector type
    int max_feature_layer = n_layer;
    {
        // Get the index of the second to last layer; this is the default for models that have a llava projector
        int il_last = hparams.n_layer - 1;
        int deepest_feature_layer = -1;

        // minicpmv and glm-edge use the output of the final layer instead of the
        // second-to-last one
        if (proj_type == PROJECTOR_TYPE_MINICPMV || proj_type == PROJECTOR_TYPE_GLM_EDGE) {
            il_last += 1;
        }

        // If we set explicit vision feature layers, only go up to the deepest one
        // NOTE: only used by granite-vision models for now
        for (const auto & feature_layer : hparams.vision_feature_layer) {
            if (feature_layer > deepest_feature_layer) {
                deepest_feature_layer = feature_layer;
            }
        }
        // no explicit feature layers -> run up to the projector-type default
        max_feature_layer = deepest_feature_layer < 0 ? il_last : deepest_feature_layer;
    }

    // patch embeddings (input projection of the image)
    ggml_tensor * inp = build_inp();

    // concat class_embeddings and patch_embeddings
    if (model.class_embedding) {
        inp = ggml_concat(ctx0, inp, model.class_embedding, 1);
    }

    // learned absolute position embeddings; the index tensor is filled in at
    // eval time (marked as a graph input)
    ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos);
    ggml_set_name(positions, "positions");
    ggml_set_input(positions);

    inp = ggml_add(ctx0, inp, ggml_get_rows(ctx0, model.position_embeddings, positions));

    ggml_tensor * inpL = inp;

    // pre-layernorm
    if (model.pre_ln_w) {
        inpL = build_norm(inpL, model.pre_ln_w, model.pre_ln_b, NORM_TYPE_NORMAL, eps, -1);
        cb(inpL, "pre_ln", -1);
    }

    // hidden states captured at the requested feature layers (granite stacks
    // several of them; empty for all other models)
    std::vector<ggml_tensor *> embedding_stack;
    const auto & vision_feature_layer = hparams.vision_feature_layer;

    // loop over layers
    for (int il = 0; il < max_feature_layer; il++) {
        auto & layer = model.layers[il];
        ggml_tensor * cur = inpL; // inpL = residual, cur = hidden_states

        // If this is an embedding feature layer, save the output.
        // NOTE: 0 index here refers to the input to the encoder.
        if (vision_feature_layer.find(il) != vision_feature_layer.end()) {
            embedding_stack.push_back(cur);
        }

        // layernorm1
        cur = build_norm(cur, layer.ln_1_w, layer.ln_1_b, NORM_TYPE_NORMAL, eps, il);
        cb(cur, "layer_inp_normed", il);

        // self-attention
        {
            // Q/K/V projections; the biases are optional (not all models have them)
            ggml_tensor * Qcur = ggml_mul_mat(ctx0, layer.q_w, cur);
            if (layer.q_b) {
                Qcur = ggml_add(ctx0, Qcur, layer.q_b);
            }

            ggml_tensor * Kcur = ggml_mul_mat(ctx0, layer.k_w, cur);
            if (layer.k_b) {
                Kcur = ggml_add(ctx0, Kcur, layer.k_b);
            }

            ggml_tensor * Vcur = ggml_mul_mat(ctx0, layer.v_w, cur);
            if (layer.v_b) {
                Vcur = ggml_add(ctx0, Vcur, layer.v_b);
            }

            // split heads: [d_head*n_head, n_pos] -> [d_head, n_head, n_pos]
            Qcur = ggml_reshape_3d(ctx0, Qcur, d_head, n_head, n_pos);
            Kcur = ggml_reshape_3d(ctx0, Kcur, d_head, n_head, n_pos);
            Vcur = ggml_reshape_3d(ctx0, Vcur, d_head, n_head, n_pos);

            cb(Qcur, "Qcur", il);
            cb(Kcur, "Kcur", il);
            cb(Vcur, "Vcur", il);

            // full (non-causal) attention, no mask, followed by the output projection
            cur = build_attn(layer.o_w, layer.o_b,
                Qcur, Kcur, Vcur, nullptr, kq_scale, il);
            cb(cur, "attn_out", il);
        }

        // re-add the layer input, e.g., residual
        cur = ggml_add(ctx0, cur, inpL);

        inpL = cur; // inpL = residual, cur = hidden_states

        cb(cur, "ffn_inp", il);

        // layernorm2
        cur = build_norm(cur, layer.ln_2_w, layer.ln_2_b, NORM_TYPE_NORMAL, eps, il);
        cb(cur, "ffn_inp_normed", il);

        // ffn
        cur = build_ffn(cur,
            layer.ff_up_w, layer.ff_up_b,
            layer.ff_gate_w, layer.ff_gate_b,
            layer.ff_down_w, layer.ff_down_b,
            hparams.ffn_op, il);

        cb(cur, "ffn_out", il);

        // residual 2
        cur = ggml_add(ctx0, inpL, cur);
        cb(cur, "layer_out", il);

        inpL = cur;
    }

    // post-layernorm
    if (model.post_ln_w) {
        inpL = build_norm(inpL, model.post_ln_w, model.post_ln_b, NORM_TYPE_NORMAL, eps, -1);
    }

    ggml_tensor * embeddings = inpL;

    // process vision feature layers (used by granite)
    {
        // final layer is a vision feature layer
        if (vision_feature_layer.find(max_feature_layer) != vision_feature_layer.end()) {
            embedding_stack.push_back(inpL);
        }

        // If feature layers are explicitly set, stack them (if we have multiple)
        // by concatenating along the embedding dimension (dim 0)
        if (!embedding_stack.empty()) {
            embeddings = embedding_stack[0];
            for (size_t i = 1; i < embedding_stack.size(); i++) {
                embeddings = ggml_concat(ctx0, embeddings, embedding_stack[i], 0);
            }
        }
    }

    // llava projector (also used by granite)
    if (hparams.has_llava_projector) {
        embeddings = ggml_reshape_2d(ctx0, embeddings, embeddings->ne[0], embeddings->ne[1]);

        // row indices selecting which token embeddings to keep (e.g. dropping CLS);
        // filled in at eval time
        ggml_tensor * patches = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_patches);
        ggml_set_name(patches, "patches");
        ggml_set_input(patches);

        // shape [1, 576, 1024]
        // ne is whcn, ne = [1024, 576, 1, 1]
        embeddings = ggml_get_rows(ctx0, embeddings, patches);

        // print_tensor_info(embeddings, "embeddings");

        // llava projector
        if (proj_type == PROJECTOR_TYPE_MLP) {
            // two-layer MLP: linear -> GELU -> (optional) linear
            embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings);
            embeddings = ggml_add(ctx0, embeddings, model.mm_0_b);

            embeddings = ggml_gelu(ctx0, embeddings);
            // second layer is optional (single-layer projector variants exist)
            if (model.mm_2_w) {
                embeddings = ggml_mul_mat(ctx0, model.mm_2_w, embeddings);
                embeddings = ggml_add(ctx0, embeddings, model.mm_2_b);
            }
        }
        else if (proj_type == PROJECTOR_TYPE_MLP_NORM) {
            // MLP projector with LayerNorm after each linear layer:
            // linear -> LN -> GELU -> linear -> LN
            embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings);
            embeddings = ggml_add(ctx0, embeddings, model.mm_0_b);
            // ggml_tensor_printf(embeddings, "mm_0_w",0,true,false);
            // First LayerNorm
            embeddings = ggml_norm(ctx0, embeddings, eps);
            embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.mm_1_w),
                model.mm_1_b);

            // GELU activation
            embeddings = ggml_gelu(ctx0, embeddings);

            // Second linear layer
            embeddings = ggml_mul_mat(ctx0, model.mm_3_w, embeddings);
            embeddings = ggml_add(ctx0, embeddings, model.mm_3_b);

            // Second LayerNorm
            embeddings = ggml_norm(ctx0, embeddings, eps);
            embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.mm_4_w),
                model.mm_4_b);
        }
        else if (proj_type == PROJECTOR_TYPE_LDP) {
            // MobileVLM projector
            // assumes a 24x24 patch grid (576 patches) — TODO confirm for other resolutions
            int n_patch = 24;
            ggml_tensor * mlp_1 = ggml_mul_mat(ctx0, model.mm_model_mlp_1_w, embeddings);
            mlp_1 = ggml_add(ctx0, mlp_1, model.mm_model_mlp_1_b);
            mlp_1 = ggml_gelu(ctx0, mlp_1);
            ggml_tensor * mlp_3 = ggml_mul_mat(ctx0, model.mm_model_mlp_3_w, mlp_1);
            mlp_3 = ggml_add(ctx0, mlp_3, model.mm_model_mlp_3_b);
            // mlp_3 shape = [1, 576, 2048], ne = [2048, 576, 1, 1]

            // block 1: depthwise conv + LN + squeeze-excite-style gating + pointwise conv,
            // with a residual connection around the whole block
            ggml_tensor * block_1 = nullptr;
            {
                // transpose from [1, 576, 2048] --> [1, 2048, 576] --> [1, 2048, 24, 24]
                mlp_3 = ggml_permute(ctx0, mlp_3, 1, 0, 2, 3);
                mlp_3 = ggml_cont_4d(ctx0, mlp_3, n_patch, n_patch, mlp_3->ne[1], mlp_3->ne[2]);
                // stride = 1, padding = 1, bias is nullptr
                block_1 = ggml_conv_2d_dw(ctx0, model.mm_model_block_1_block_0_0_w, mlp_3, 1, 1, 1, 1, 1, 1);

                // layer norm
                // // block_1 shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1]
                // move channels into dim 0 so the norm runs over the channel axis
                block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 2, 0, 3));
                // block_1 shape = [1, 24, 24, 2048], ne = [2048, 24, 24, 1]
                block_1 = ggml_norm(ctx0, block_1, eps);
                block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_1_block_0_1_w), model.mm_model_block_1_block_0_1_b);
                block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 2, 0, 1, 3));

                // block_1 shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1]
                // hardswish
                ggml_tensor * block_1_hw = ggml_hardswish(ctx0, block_1);

                // global average pool: kernel size == full spatial extent
                block_1 = ggml_pool_2d(ctx0, block_1_hw, GGML_OP_POOL_AVG, block_1_hw->ne[0], block_1_hw->ne[1], block_1_hw->ne[0], block_1_hw->ne[1], 0, 0);
                // block_1 shape = [1, 2048, 1, 1], ne = [1, 1, 2048, 1]
                // pointwise conv
                block_1 = ggml_reshape_2d(ctx0, block_1, block_1->ne[0]*block_1->ne[1]*block_1->ne[2], block_1->ne[3]);
                block_1 = ggml_mul_mat(ctx0, model.mm_model_block_1_block_1_fc1_w, block_1);
                block_1 = ggml_add(ctx0, block_1, model.mm_model_block_1_block_1_fc1_b);
                block_1 = ggml_relu(ctx0, block_1);
                block_1 = ggml_mul_mat(ctx0, model.mm_model_block_1_block_1_fc2_w, block_1);
                block_1 = ggml_add(ctx0, block_1, model.mm_model_block_1_block_1_fc2_b);
                block_1 = ggml_hardsigmoid(ctx0, block_1);
                // block_1_hw shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1], block_1 shape = [1, 2048], ne = [2048, 1, 1, 1]
                // broadcast the per-channel gate over the spatial dims and apply it
                block_1 = ggml_reshape_4d(ctx0, block_1, 1, 1, block_1->ne[0], block_1->ne[1]);
                block_1 = ggml_mul(ctx0, block_1_hw, block_1);

                int w = block_1->ne[0], h = block_1->ne[1];
                block_1 = ggml_reshape_3d(ctx0, block_1, w*h, block_1->ne[2], block_1->ne[3]);
                block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 0, 2, 3));

                // block_1 shape = [1, 24*24, 2048], ne = [24*24, 2048, 1]
                block_1 = ggml_mul_mat(ctx0, model.mm_model_block_1_block_2_0_w, block_1);
                block_1 = ggml_reshape_4d(ctx0, block_1, block_1->ne[0], w, h, block_1->ne[3]);

                // block_1 shape = [1, 24, 24, 2048], ne = [2048, 24, 24, 1]
                block_1 = ggml_norm(ctx0, block_1, eps);
                block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_1_block_2_1_w), model.mm_model_block_1_block_2_1_b);
                block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 2, 0, 1, 3));
                // block1 shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1]
                // residual
                block_1 = ggml_add(ctx0, mlp_3, block_1);
            }

            // block_2: same structure as block 1 but with a stride-2 depthwise conv
            // (downsamples 24x24 -> 12x12) and no residual; ends flattened to tokens
            {
                // stride = 2
                block_1 = ggml_conv_2d_dw(ctx0, model.mm_model_block_2_block_0_0_w, block_1, 2, 2, 1, 1, 1, 1);

                // block_1 shape = [1, 2048, 12, 12], ne = [12, 12, 2048, 1]
                // layer norm
                block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 2, 0, 3));
                // block_1 shape = [1, 12, 12, 2048], ne = [2048, 12, 12, 1]
                block_1 = ggml_norm(ctx0, block_1, eps);
                block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_2_block_0_1_w), model.mm_model_block_2_block_0_1_b);
                block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 2, 0, 1, 3));
                // block_1 shape = [1, 2048, 12, 12], ne = [12, 12, 2048, 1]
                // hardswish
                ggml_tensor * block_1_hw = ggml_hardswish(ctx0, block_1);

                // not sure the parameters is right for globalAvgPooling
                block_1 = ggml_pool_2d(ctx0, block_1_hw, GGML_OP_POOL_AVG, block_1_hw->ne[0], block_1_hw->ne[1], block_1_hw->ne[0], block_1_hw->ne[1], 0, 0);
                // block_1 shape = [1, 2048, 1, 1], ne = [1, 1, 2048, 1]
                // pointwise conv
                block_1 = ggml_reshape_2d(ctx0, block_1, block_1->ne[0]*block_1->ne[1]*block_1->ne[2], block_1->ne[3]);
                block_1 = ggml_mul_mat(ctx0, model.mm_model_block_2_block_1_fc1_w, block_1);
                block_1 = ggml_add(ctx0, block_1, model.mm_model_block_2_block_1_fc1_b);
                block_1 = ggml_relu(ctx0, block_1);
                block_1 = ggml_mul_mat(ctx0, model.mm_model_block_2_block_1_fc2_w, block_1);
                block_1 = ggml_add(ctx0, block_1, model.mm_model_block_2_block_1_fc2_b);
                block_1 = ggml_hardsigmoid(ctx0, block_1);

                // block_1_hw shape = [1, 2048, 12, 12], ne = [12, 12, 2048, 1], block_1 shape = [1, 2048, 1, 1], ne = [1, 1, 2048, 1]
                block_1 = ggml_reshape_4d(ctx0, block_1, 1, 1, block_1->ne[0], block_1->ne[1]);
                block_1 = ggml_mul(ctx0, block_1_hw, block_1);

                int w = block_1->ne[0], h = block_1->ne[1];
                block_1 = ggml_reshape_3d(ctx0, block_1, w*h, block_1->ne[2], block_1->ne[3]);
                block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 0, 2, 3));
                // block_1 shape = [1, 24*24, 2048], ne = [24*24, 2048, 1]
                block_1 = ggml_mul_mat(ctx0, model.mm_model_block_2_block_2_0_w, block_1);
                block_1 = ggml_reshape_4d(ctx0, block_1, block_1->ne[0], w, h, block_1->ne[3]);


                // block_1 shape = [1, 12, 12, 2048], ne = [2048, 12, 12, 1]
                block_1 = ggml_norm(ctx0, block_1, eps);
                block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_2_block_2_1_w), model.mm_model_block_2_block_2_1_b);
                // flatten the 12x12 spatial grid into 144 output tokens
                block_1 = ggml_reshape_3d(ctx0, block_1, block_1->ne[0], block_1->ne[1] * block_1->ne[2], block_1->ne[3]);
                // block_1 shape = [1, 144, 2048], ne = [2048, 144, 1]
            }
            embeddings = block_1;
        }
        else if (proj_type == PROJECTOR_TYPE_LDPV2)
        {
            // MobileVLM v2 projector: MLP -> 2x2 avg-pool downsample -> PEG
            // (positional encoding generator, a depthwise conv with residual)
            // assumes a 24x24 patch grid — TODO confirm for other resolutions
            int n_patch = 24;
            ggml_tensor * mlp_0 = ggml_mul_mat(ctx0, model.mm_model_mlp_0_w, embeddings);
            mlp_0 = ggml_add(ctx0, mlp_0, model.mm_model_mlp_0_b);
            mlp_0 = ggml_gelu(ctx0, mlp_0);
            ggml_tensor * mlp_2 = ggml_mul_mat(ctx0, model.mm_model_mlp_2_w, mlp_0);
            mlp_2 = ggml_add(ctx0, mlp_2, model.mm_model_mlp_2_b);
            // mlp_2 ne = [2048, 576, 1, 1]
            // // AVG Pool Layer 2*2, strides = 2
            mlp_2 = ggml_permute(ctx0, mlp_2, 1, 0, 2, 3);
            // mlp_2 ne = [576, 2048, 1, 1]
            mlp_2 = ggml_cont_4d(ctx0, mlp_2, n_patch, n_patch, mlp_2->ne[1], mlp_2->ne[2]);
            // mlp_2 ne [24, 24, 2048, 1]
            mlp_2 = ggml_pool_2d(ctx0, mlp_2, GGML_OP_POOL_AVG, 2, 2, 2, 2, 0, 0);
            // weight ne = [3, 3, 2048, 1]
            ggml_tensor * peg_0 = ggml_conv_2d_dw(ctx0, model.mm_model_peg_0_w, mlp_2, 1, 1, 1, 1, 1, 1);
            peg_0 = ggml_cont(ctx0, ggml_permute(ctx0, peg_0, 1, 2, 0, 3));
            peg_0 = ggml_add(ctx0, peg_0, model.mm_model_peg_0_b);
            mlp_2 = ggml_cont(ctx0, ggml_permute(ctx0, mlp_2, 1, 2, 0, 3));
            // residual: add the pooled features back onto the conv output
            peg_0 = ggml_add(ctx0, peg_0, mlp_2);
            // flatten spatial dims into a single token dimension
            peg_0 = ggml_reshape_3d(ctx0, peg_0, peg_0->ne[0], peg_0->ne[1] * peg_0->ne[2], peg_0->ne[3]);
            embeddings = peg_0;
        }
        else {
            GGML_ABORT("fatal error");
        }
    }

    // glm projector
    else if (proj_type == PROJECTOR_TYPE_GLM_EDGE) {
        // tokens form a square grid (asserted above), so side length = sqrt(count)
        size_t gridsz = (size_t)sqrt(embeddings->ne[1]);
        embeddings = ggml_permute(ctx0,embeddings,1,0,2,3);
        embeddings = ggml_cont_3d(ctx0, embeddings, gridsz, gridsz, embeddings->ne[1]);
        // stride-2 conv downsamples the grid by 2x in each dimension
        embeddings = ggml_conv_2d(ctx0, model.mm_model_adapter_conv_w, embeddings, 2, 2, 0, 0, 1, 1);
        embeddings = ggml_reshape_3d(ctx0, embeddings,embeddings->ne[0]*embeddings->ne[1] , embeddings->ne[2], batch_size);
        embeddings = ggml_cont(ctx0, ggml_permute(ctx0,embeddings, 1, 0, 2, 3));
        embeddings = ggml_add(ctx0, embeddings, model.mm_model_adapter_conv_b);
        // GLU
        {
            // linear -> LN -> GELU, then a SwiGLU-style gated pair of linears
            embeddings = ggml_mul_mat(ctx0, model.mm_model_mlp_0_w, embeddings);
            embeddings = ggml_norm(ctx0, embeddings, eps);
            embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.mm_model_ln_q_w), model.mm_model_ln_q_b);
            embeddings = ggml_gelu_inplace(ctx0, embeddings);
            ggml_tensor * x = embeddings;
            embeddings = ggml_mul_mat(ctx0, model.mm_model_mlp_2_w, embeddings);
            x = ggml_mul_mat(ctx0, model.mm_model_mlp_1_w,x);
            embeddings = ggml_swiglu_split(ctx0, embeddings, x);
            embeddings = ggml_mul_mat(ctx0, model.mm_model_mlp_3_w, embeddings);
        }
        // arrangement of BOI/EOI token embeddings
        // note: these embeddings are not present in text model, hence we cannot process them as text tokens
        // see: https://huggingface.co/THUDM/glm-edge-v-2b/blob/main/siglip.py#L53
        {
            embeddings = ggml_concat(ctx0, model.mm_boi, embeddings, 1); // BOI
            embeddings = ggml_concat(ctx0, embeddings, model.mm_eoi, 1); // EOI
        }
    }

    else {
        GGML_ABORT("llava: unknown projector type");
    }

    // build the graph
    ggml_build_forward_expand(gf, embeddings);

    return gf;
}