1#include "models.h"
2
3// this graph is used by llava, granite and glm
4// due to having embedding_stack (used by granite), we cannot reuse build_vit
// Build the ggml compute graph for a ViT-style vision encoder followed by one of
// several multimodal projectors (MLP, MLP+LayerNorm, MobileVLM LDP, MobileVLM LDPv2,
// GLM-Edge adapter). Shared by llava, granite-vision and glm-edge models.
// The embedding_stack mechanism (collecting hidden states from multiple encoder
// layers, used by granite) is the reason the generic build_vit() cannot be reused.
// Returns the graph `gf` with the final projected embeddings expanded into it.
ggml_cgraph * clip_graph_llava::build() {
    const int batch_size = 1; // graph is built for a single image at a time
    // one position per image patch, plus one for the CLS token when present
    const int n_pos = n_patches + (model.class_embedding ? 1 : 0);

    GGML_ASSERT(n_patches_x == n_patches_y && "only square images supported");

    // Calculate the deepest feature layer based on hparams and projector type
    int max_feature_layer = n_layer;
    {
        // Get the index of the second to last layer; this is the default for models that have a llava projector
        int il_last = hparams.n_layer - 1;
        int deepest_feature_layer = -1;

        // minicpmv and glm-edge use the output of the final layer instead of the
        // second-to-last one
        if (proj_type == PROJECTOR_TYPE_MINICPMV || proj_type == PROJECTOR_TYPE_GLM_EDGE) {
            il_last += 1;
        }

        // If we set explicit vision feature layers, only go up to the deepest one
        // NOTE: only used by granite-vision models for now
        for (const auto & feature_layer : hparams.vision_feature_layer) {
            if (feature_layer > deepest_feature_layer) {
                deepest_feature_layer = feature_layer;
            }
        }
        // no explicit feature layers -> run up to the projector-type default
        max_feature_layer = deepest_feature_layer < 0 ? il_last : deepest_feature_layer;
    }

    // patch embeddings (input projection of the image)
    ggml_tensor * inp = build_inp();

    // concat class_embeddings and patch_embeddings
    if (model.class_embedding) {
        inp = ggml_concat(ctx0, inp, model.class_embedding, 1);
    }

    // learned absolute position embeddings; the index tensor is filled in at
    // eval time (marked as a graph input)
    ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos);
    ggml_set_name(positions, "positions");
    ggml_set_input(positions);

    inp = ggml_add(ctx0, inp, ggml_get_rows(ctx0, model.position_embeddings, positions));

    ggml_tensor * inpL = inp;

    // pre-layernorm
    if (model.pre_ln_w) {
        inpL = build_norm(inpL, model.pre_ln_w, model.pre_ln_b, NORM_TYPE_NORMAL, eps, -1);
        cb(inpL, "pre_ln", -1);
    }

    // hidden states captured at the requested feature layers (granite stacks
    // several of them; empty for all other models)
    std::vector<ggml_tensor *> embedding_stack;
    const auto & vision_feature_layer = hparams.vision_feature_layer;

    // loop over layers
    for (int il = 0; il < max_feature_layer; il++) {
        auto & layer = model.layers[il];
        ggml_tensor * cur = inpL; // inpL = residual, cur = hidden_states

        // If this is an embedding feature layer, save the output.
        // NOTE: 0 index here refers to the input to the encoder.
        if (vision_feature_layer.find(il) != vision_feature_layer.end()) {
            embedding_stack.push_back(cur);
        }

        // layernorm1
        cur = build_norm(cur, layer.ln_1_w, layer.ln_1_b, NORM_TYPE_NORMAL, eps, il);
        cb(cur, "layer_inp_normed", il);

        // self-attention
        {
            // Q/K/V projections; the biases are optional (not all models have them)
            ggml_tensor * Qcur = ggml_mul_mat(ctx0, layer.q_w, cur);
            if (layer.q_b) {
                Qcur = ggml_add(ctx0, Qcur, layer.q_b);
            }

            ggml_tensor * Kcur = ggml_mul_mat(ctx0, layer.k_w, cur);
            if (layer.k_b) {
                Kcur = ggml_add(ctx0, Kcur, layer.k_b);
            }

            ggml_tensor * Vcur = ggml_mul_mat(ctx0, layer.v_w, cur);
            if (layer.v_b) {
                Vcur = ggml_add(ctx0, Vcur, layer.v_b);
            }

            // split heads: [d_head*n_head, n_pos] -> [d_head, n_head, n_pos]
            Qcur = ggml_reshape_3d(ctx0, Qcur, d_head, n_head, n_pos);
            Kcur = ggml_reshape_3d(ctx0, Kcur, d_head, n_head, n_pos);
            Vcur = ggml_reshape_3d(ctx0, Vcur, d_head, n_head, n_pos);

            cb(Qcur, "Qcur", il);
            cb(Kcur, "Kcur", il);
            cb(Vcur, "Vcur", il);

            // full (non-causal) attention, no mask, followed by the output projection
            cur = build_attn(layer.o_w, layer.o_b,
                Qcur, Kcur, Vcur, nullptr, kq_scale, il);
            cb(cur, "attn_out", il);
        }

        // re-add the layer input, e.g., residual
        cur = ggml_add(ctx0, cur, inpL);

        inpL = cur; // inpL = residual, cur = hidden_states

        cb(cur, "ffn_inp", il);

        // layernorm2
        cur = build_norm(cur, layer.ln_2_w, layer.ln_2_b, NORM_TYPE_NORMAL, eps, il);
        cb(cur, "ffn_inp_normed", il);

        // ffn
        cur = build_ffn(cur,
            layer.ff_up_w, layer.ff_up_b,
            layer.ff_gate_w, layer.ff_gate_b,
            layer.ff_down_w, layer.ff_down_b,
            hparams.ffn_op, il);

        cb(cur, "ffn_out", il);

        // residual 2
        cur = ggml_add(ctx0, inpL, cur);
        cb(cur, "layer_out", il);

        inpL = cur;
    }

    // post-layernorm
    if (model.post_ln_w) {
        inpL = build_norm(inpL, model.post_ln_w, model.post_ln_b, NORM_TYPE_NORMAL, eps, -1);
    }

    ggml_tensor * embeddings = inpL;

    // process vision feature layers (used by granite)
    {
        // final layer is a vision feature layer
        if (vision_feature_layer.find(max_feature_layer) != vision_feature_layer.end()) {
            embedding_stack.push_back(inpL);
        }

        // If feature layers are explicitly set, stack them (if we have multiple)
        // by concatenating along the embedding dimension (dim 0)
        if (!embedding_stack.empty()) {
            embeddings = embedding_stack[0];
            for (size_t i = 1; i < embedding_stack.size(); i++) {
                embeddings = ggml_concat(ctx0, embeddings, embedding_stack[i], 0);
            }
        }
    }

    // llava projector (also used by granite)
    if (hparams.has_llava_projector) {
        embeddings = ggml_reshape_2d(ctx0, embeddings, embeddings->ne[0], embeddings->ne[1]);

        // row indices selecting which token embeddings to keep (e.g. dropping CLS);
        // filled in at eval time
        ggml_tensor * patches = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_patches);
        ggml_set_name(patches, "patches");
        ggml_set_input(patches);

        // shape [1, 576, 1024]
        // ne is whcn, ne = [1024, 576, 1, 1]
        embeddings = ggml_get_rows(ctx0, embeddings, patches);

        // print_tensor_info(embeddings, "embeddings");

        // llava projector
        if (proj_type == PROJECTOR_TYPE_MLP) {
            // two-layer MLP: linear -> GELU -> (optional) linear
            embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings);
            embeddings = ggml_add(ctx0, embeddings, model.mm_0_b);

            embeddings = ggml_gelu(ctx0, embeddings);
            // second layer is optional (single-layer projector variants exist)
            if (model.mm_2_w) {
                embeddings = ggml_mul_mat(ctx0, model.mm_2_w, embeddings);
                embeddings = ggml_add(ctx0, embeddings, model.mm_2_b);
            }
        }
        else if (proj_type == PROJECTOR_TYPE_MLP_NORM) {
            // MLP projector with LayerNorm after each linear layer:
            // linear -> LN -> GELU -> linear -> LN
            embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings);
            embeddings = ggml_add(ctx0, embeddings, model.mm_0_b);
            // ggml_tensor_printf(embeddings, "mm_0_w",0,true,false);
            // First LayerNorm
            embeddings = ggml_norm(ctx0, embeddings, eps);
            embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.mm_1_w),
                model.mm_1_b);

            // GELU activation
            embeddings = ggml_gelu(ctx0, embeddings);

            // Second linear layer
            embeddings = ggml_mul_mat(ctx0, model.mm_3_w, embeddings);
            embeddings = ggml_add(ctx0, embeddings, model.mm_3_b);

            // Second LayerNorm
            embeddings = ggml_norm(ctx0, embeddings, eps);
            embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.mm_4_w),
                model.mm_4_b);
        }
        else if (proj_type == PROJECTOR_TYPE_LDP) {
            // MobileVLM projector
            // assumes a 24x24 patch grid (576 patches) — TODO confirm for other resolutions
            int n_patch = 24;
            ggml_tensor * mlp_1 = ggml_mul_mat(ctx0, model.mm_model_mlp_1_w, embeddings);
            mlp_1 = ggml_add(ctx0, mlp_1, model.mm_model_mlp_1_b);
            mlp_1 = ggml_gelu(ctx0, mlp_1);
            ggml_tensor * mlp_3 = ggml_mul_mat(ctx0, model.mm_model_mlp_3_w, mlp_1);
            mlp_3 = ggml_add(ctx0, mlp_3, model.mm_model_mlp_3_b);
            // mlp_3 shape = [1, 576, 2048], ne = [2048, 576, 1, 1]

            // block 1: depthwise conv + LN + squeeze-excite-style gating + pointwise conv,
            // with a residual connection around the whole block
            ggml_tensor * block_1 = nullptr;
            {
                // transpose from [1, 576, 2048] --> [1, 2048, 576] --> [1, 2048, 24, 24]
                mlp_3 = ggml_permute(ctx0, mlp_3, 1, 0, 2, 3);
                mlp_3 = ggml_cont_4d(ctx0, mlp_3, n_patch, n_patch, mlp_3->ne[1], mlp_3->ne[2]);
                // stride = 1, padding = 1, bias is nullptr
                block_1 = ggml_conv_2d_dw(ctx0, model.mm_model_block_1_block_0_0_w, mlp_3, 1, 1, 1, 1, 1, 1);

                // layer norm
                // // block_1 shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1]
                // move channels into dim 0 so the norm runs over the channel axis
                block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 2, 0, 3));
                // block_1 shape = [1, 24, 24, 2048], ne = [2048, 24, 24, 1]
                block_1 = ggml_norm(ctx0, block_1, eps);
                block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_1_block_0_1_w), model.mm_model_block_1_block_0_1_b);
                block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 2, 0, 1, 3));

                // block_1 shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1]
                // hardswish
                ggml_tensor * block_1_hw = ggml_hardswish(ctx0, block_1);

                // global average pool: kernel size == full spatial extent
                block_1 = ggml_pool_2d(ctx0, block_1_hw, GGML_OP_POOL_AVG, block_1_hw->ne[0], block_1_hw->ne[1], block_1_hw->ne[0], block_1_hw->ne[1], 0, 0);
                // block_1 shape = [1, 2048, 1, 1], ne = [1, 1, 2048, 1]
                // pointwise conv
                block_1 = ggml_reshape_2d(ctx0, block_1, block_1->ne[0]*block_1->ne[1]*block_1->ne[2], block_1->ne[3]);
                block_1 = ggml_mul_mat(ctx0, model.mm_model_block_1_block_1_fc1_w, block_1);
                block_1 = ggml_add(ctx0, block_1, model.mm_model_block_1_block_1_fc1_b);
                block_1 = ggml_relu(ctx0, block_1);
                block_1 = ggml_mul_mat(ctx0, model.mm_model_block_1_block_1_fc2_w, block_1);
                block_1 = ggml_add(ctx0, block_1, model.mm_model_block_1_block_1_fc2_b);
                block_1 = ggml_hardsigmoid(ctx0, block_1);
                // block_1_hw shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1], block_1 shape = [1, 2048], ne = [2048, 1, 1, 1]
                // broadcast the per-channel gate over the spatial dims and apply it
                block_1 = ggml_reshape_4d(ctx0, block_1, 1, 1, block_1->ne[0], block_1->ne[1]);
                block_1 = ggml_mul(ctx0, block_1_hw, block_1);

                int w = block_1->ne[0], h = block_1->ne[1];
                block_1 = ggml_reshape_3d(ctx0, block_1, w*h, block_1->ne[2], block_1->ne[3]);
                block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 0, 2, 3));

                // block_1 shape = [1, 24*24, 2048], ne = [24*24, 2048, 1]
                block_1 = ggml_mul_mat(ctx0, model.mm_model_block_1_block_2_0_w, block_1);
                block_1 = ggml_reshape_4d(ctx0, block_1, block_1->ne[0], w, h, block_1->ne[3]);

                // block_1 shape = [1, 24, 24, 2048], ne = [2048, 24, 24, 1]
                block_1 = ggml_norm(ctx0, block_1, eps);
                block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_1_block_2_1_w), model.mm_model_block_1_block_2_1_b);
                block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 2, 0, 1, 3));
                // block1 shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1]
                // residual
                block_1 = ggml_add(ctx0, mlp_3, block_1);
            }

            // block_2: same structure as block 1 but with a stride-2 depthwise conv
            // (downsamples 24x24 -> 12x12) and no residual; ends flattened to tokens
            {
                // stride = 2
                block_1 = ggml_conv_2d_dw(ctx0, model.mm_model_block_2_block_0_0_w, block_1, 2, 2, 1, 1, 1, 1);

                // block_1 shape = [1, 2048, 12, 12], ne = [12, 12, 2048, 1]
                // layer norm
                block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 2, 0, 3));
                // block_1 shape = [1, 12, 12, 2048], ne = [2048, 12, 12, 1]
                block_1 = ggml_norm(ctx0, block_1, eps);
                block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_2_block_0_1_w), model.mm_model_block_2_block_0_1_b);
                block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 2, 0, 1, 3));
                // block_1 shape = [1, 2048, 12, 12], ne = [12, 12, 2048, 1]
                // hardswish
                ggml_tensor * block_1_hw = ggml_hardswish(ctx0, block_1);

                // not sure the parameters is right for globalAvgPooling
                block_1 = ggml_pool_2d(ctx0, block_1_hw, GGML_OP_POOL_AVG, block_1_hw->ne[0], block_1_hw->ne[1], block_1_hw->ne[0], block_1_hw->ne[1], 0, 0);
                // block_1 shape = [1, 2048, 1, 1], ne = [1, 1, 2048, 1]
                // pointwise conv
                block_1 = ggml_reshape_2d(ctx0, block_1, block_1->ne[0]*block_1->ne[1]*block_1->ne[2], block_1->ne[3]);
                block_1 = ggml_mul_mat(ctx0, model.mm_model_block_2_block_1_fc1_w, block_1);
                block_1 = ggml_add(ctx0, block_1, model.mm_model_block_2_block_1_fc1_b);
                block_1 = ggml_relu(ctx0, block_1);
                block_1 = ggml_mul_mat(ctx0, model.mm_model_block_2_block_1_fc2_w, block_1);
                block_1 = ggml_add(ctx0, block_1, model.mm_model_block_2_block_1_fc2_b);
                block_1 = ggml_hardsigmoid(ctx0, block_1);

                // block_1_hw shape = [1, 2048, 12, 12], ne = [12, 12, 2048, 1], block_1 shape = [1, 2048, 1, 1], ne = [1, 1, 2048, 1]
                block_1 = ggml_reshape_4d(ctx0, block_1, 1, 1, block_1->ne[0], block_1->ne[1]);
                block_1 = ggml_mul(ctx0, block_1_hw, block_1);

                int w = block_1->ne[0], h = block_1->ne[1];
                block_1 = ggml_reshape_3d(ctx0, block_1, w*h, block_1->ne[2], block_1->ne[3]);
                block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 0, 2, 3));
                // block_1 shape = [1, 24*24, 2048], ne = [24*24, 2048, 1]
                block_1 = ggml_mul_mat(ctx0, model.mm_model_block_2_block_2_0_w, block_1);
                block_1 = ggml_reshape_4d(ctx0, block_1, block_1->ne[0], w, h, block_1->ne[3]);


                // block_1 shape = [1, 12, 12, 2048], ne = [2048, 12, 12, 1]
                block_1 = ggml_norm(ctx0, block_1, eps);
                block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_2_block_2_1_w), model.mm_model_block_2_block_2_1_b);
                // flatten the 12x12 spatial grid into 144 output tokens
                block_1 = ggml_reshape_3d(ctx0, block_1, block_1->ne[0], block_1->ne[1] * block_1->ne[2], block_1->ne[3]);
                // block_1 shape = [1, 144, 2048], ne = [2048, 144, 1]
            }
            embeddings = block_1;
        }
        else if (proj_type == PROJECTOR_TYPE_LDPV2)
        {
            // MobileVLM v2 projector: MLP -> 2x2 avg-pool downsample -> PEG
            // (positional encoding generator, a depthwise conv with residual)
            // assumes a 24x24 patch grid — TODO confirm for other resolutions
            int n_patch = 24;
            ggml_tensor * mlp_0 = ggml_mul_mat(ctx0, model.mm_model_mlp_0_w, embeddings);
            mlp_0 = ggml_add(ctx0, mlp_0, model.mm_model_mlp_0_b);
            mlp_0 = ggml_gelu(ctx0, mlp_0);
            ggml_tensor * mlp_2 = ggml_mul_mat(ctx0, model.mm_model_mlp_2_w, mlp_0);
            mlp_2 = ggml_add(ctx0, mlp_2, model.mm_model_mlp_2_b);
            // mlp_2 ne = [2048, 576, 1, 1]
            // // AVG Pool Layer 2*2, strides = 2
            mlp_2 = ggml_permute(ctx0, mlp_2, 1, 0, 2, 3);
            // mlp_2 ne = [576, 2048, 1, 1]
            mlp_2 = ggml_cont_4d(ctx0, mlp_2, n_patch, n_patch, mlp_2->ne[1], mlp_2->ne[2]);
            // mlp_2 ne [24, 24, 2048, 1]
            mlp_2 = ggml_pool_2d(ctx0, mlp_2, GGML_OP_POOL_AVG, 2, 2, 2, 2, 0, 0);
            // weight ne = [3, 3, 2048, 1]
            ggml_tensor * peg_0 = ggml_conv_2d_dw(ctx0, model.mm_model_peg_0_w, mlp_2, 1, 1, 1, 1, 1, 1);
            peg_0 = ggml_cont(ctx0, ggml_permute(ctx0, peg_0, 1, 2, 0, 3));
            peg_0 = ggml_add(ctx0, peg_0, model.mm_model_peg_0_b);
            mlp_2 = ggml_cont(ctx0, ggml_permute(ctx0, mlp_2, 1, 2, 0, 3));
            // residual: add the pooled features back onto the conv output
            peg_0 = ggml_add(ctx0, peg_0, mlp_2);
            // flatten spatial dims into a single token dimension
            peg_0 = ggml_reshape_3d(ctx0, peg_0, peg_0->ne[0], peg_0->ne[1] * peg_0->ne[2], peg_0->ne[3]);
            embeddings = peg_0;
        }
        else {
            GGML_ABORT("fatal error");
        }
    }

    // glm projector
    else if (proj_type == PROJECTOR_TYPE_GLM_EDGE) {
        // tokens form a square grid (asserted above), so side length = sqrt(count)
        size_t gridsz = (size_t)sqrt(embeddings->ne[1]);
        embeddings = ggml_permute(ctx0,embeddings,1,0,2,3);
        embeddings = ggml_cont_3d(ctx0, embeddings, gridsz, gridsz, embeddings->ne[1]);
        // stride-2 conv downsamples the grid by 2x in each dimension
        embeddings = ggml_conv_2d(ctx0, model.mm_model_adapter_conv_w, embeddings, 2, 2, 0, 0, 1, 1);
        embeddings = ggml_reshape_3d(ctx0, embeddings,embeddings->ne[0]*embeddings->ne[1] , embeddings->ne[2], batch_size);
        embeddings = ggml_cont(ctx0, ggml_permute(ctx0,embeddings, 1, 0, 2, 3));
        embeddings = ggml_add(ctx0, embeddings, model.mm_model_adapter_conv_b);
        // GLU
        {
            // linear -> LN -> GELU, then a SwiGLU-style gated pair of linears
            embeddings = ggml_mul_mat(ctx0, model.mm_model_mlp_0_w, embeddings);
            embeddings = ggml_norm(ctx0, embeddings, eps);
            embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.mm_model_ln_q_w), model.mm_model_ln_q_b);
            embeddings = ggml_gelu_inplace(ctx0, embeddings);
            ggml_tensor * x = embeddings;
            embeddings = ggml_mul_mat(ctx0, model.mm_model_mlp_2_w, embeddings);
            x = ggml_mul_mat(ctx0, model.mm_model_mlp_1_w,x);
            embeddings = ggml_swiglu_split(ctx0, embeddings, x);
            embeddings = ggml_mul_mat(ctx0, model.mm_model_mlp_3_w, embeddings);
        }
        // arrangement of BOI/EOI token embeddings
        // note: these embeddings are not present in text model, hence we cannot process them as text tokens
        // see: https://huggingface.co/THUDM/glm-edge-v-2b/blob/main/siglip.py#L53
        {
            embeddings = ggml_concat(ctx0, model.mm_boi, embeddings, 1); // BOI
            embeddings = ggml_concat(ctx0, embeddings, model.mm_eoi, 1); // EOI
        }
    }

    else {
        GGML_ABORT("llava: unknown projector type");
    }

    // build the graph
    ggml_build_forward_expand(gf, embeddings);

    return gf;
}