1#include "models.h"
  2
  3ggml_cgraph * clip_graph_minicpmv::build() {
  4    GGML_ASSERT(model.class_embedding == nullptr);
  5    const int n_pos       = n_patches;
  6    const int n_embd_proj = n_mmproj_embd;
  7
  8    // position embeddings for the projector (not for ViT)
  9    // see: https://huggingface.co/openbmb/MiniCPM-o-2_6/blob/main/resampler.py#L70
 10    // base frequency omega
 11    ggml_tensor * omega = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, n_embd_proj / 4);
 12    ggml_set_name(omega, "omega");
 13    ggml_set_input(omega);
 14
 15    // 2D input positions (using float for sinusoidal embeddings)
 16    ggml_tensor * pos_h = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, 1, n_pos);
 17    ggml_set_name(pos_h, "pos_h");
 18    ggml_set_input(pos_h);
 19    ggml_tensor * pos_w = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, 1, n_pos);
 20    ggml_set_name(pos_w, "pos_w");
 21    ggml_set_input(pos_w);
 22
 23    // for selecting learned pos embd, used by ViT
 24    struct ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos);
 25    ggml_set_name(positions, "positions");
 26    ggml_set_input(positions);
 27
 28    ggml_tensor * learned_pos_embd = ggml_get_rows(ctx0, model.position_embeddings, positions);
 29
 30    ggml_tensor * inp = build_inp();
 31    ggml_tensor * embeddings = build_vit(
 32                            inp, n_pos,
 33                            NORM_TYPE_NORMAL,
 34                            hparams.ffn_op,
 35                            learned_pos_embd,
 36                            nullptr);
 37
 38    // resampler projector (it is just another transformer)
 39
 40    ggml_tensor * q = model.mm_model_query;
 41    ggml_tensor * v = ggml_mul_mat(ctx0, model.mm_model_kv_proj, embeddings);
 42
 43    // norm
 44    q = build_norm(q, model.mm_model_ln_q_w,  model.mm_model_ln_q_b,  NORM_TYPE_NORMAL, eps, -1);
 45    v = build_norm(v, model.mm_model_ln_kv_w, model.mm_model_ln_kv_b, NORM_TYPE_NORMAL, eps, -1);
 46
 47    // calculate sinusoidal pos embd
 48    ggml_tensor * pos_embed = nullptr;
 49    {
 50        // outer product
 51        ggml_tensor * omega_b = ggml_repeat_4d(ctx0, omega, omega->ne[0], n_pos, 1, 1); // n_pos rows
 52        ggml_tensor * theta_x = ggml_mul(ctx0, omega_b, pos_w);
 53        ggml_tensor * theta_y = ggml_mul(ctx0, omega_b, pos_h);
 54        // sin and cos
 55        ggml_tensor * pos_embd_x = ggml_concat(
 56            ctx0,
 57            ggml_sin(ctx0, theta_x),
 58            ggml_cos(ctx0, theta_x),
 59            0 // concat on first dim
 60        );
 61        ggml_tensor * pos_embd_y = ggml_concat(
 62            ctx0,
 63            ggml_sin(ctx0, theta_y),
 64            ggml_cos(ctx0, theta_y),
 65            0 // concat on first dim
 66        );
 67        pos_embed = ggml_concat(ctx0, pos_embd_x, pos_embd_y, 0);
 68    }
 69
 70    // k = v + pos_embed
 71    ggml_tensor * k = ggml_add(ctx0, v, pos_embed);
 72
 73    // attention
 74    {
 75        const int d_head = 128;
 76        int n_head = n_embd_proj/d_head;
 77        // Use actual config value if available, otherwise fall back to hardcoded values
 78        int num_query = hparams.minicpmv_query_num;
 79        ggml_tensor * Q = ggml_add(ctx0,
 80            ggml_mul_mat(ctx0, model.mm_model_attn_q_w, q),
 81            model.mm_model_attn_q_b);
 82        ggml_tensor * K = ggml_add(ctx0,
 83            ggml_mul_mat(ctx0, model.mm_model_attn_k_w, k),
 84            model.mm_model_attn_k_b);
 85        ggml_tensor * V = ggml_add(ctx0,
 86            ggml_mul_mat(ctx0, model.mm_model_attn_v_w, v),
 87            model.mm_model_attn_v_b);
 88
 89        Q = ggml_reshape_3d(ctx0, Q, d_head, n_head, num_query);
 90        K = ggml_reshape_3d(ctx0, K, d_head, n_head, n_pos);
 91        V = ggml_reshape_3d(ctx0, V, d_head, n_head, n_pos);
 92
 93        cb(Q, "resampler_Q", -1);
 94        cb(K, "resampler_K", -1);
 95        cb(V, "resampler_V", -1);
 96
 97        float resampler_kq_scale = 1.0f/ sqrtf(float(d_head));
 98        embeddings = build_attn(
 99            model.mm_model_attn_o_w,
100            model.mm_model_attn_o_b,
101            Q, K, V, nullptr, resampler_kq_scale, -1);
102        cb(embeddings, "resampler_attn_out", -1);
103    }
104    // layernorm
105    embeddings = build_norm(embeddings, model.mm_model_ln_post_w, model.mm_model_ln_post_b, NORM_TYPE_NORMAL, eps, -1);
106
107    // projection
108    embeddings = ggml_mul_mat(ctx0, model.mm_model_proj, embeddings);
109
110    // build the graph
111    ggml_build_forward_expand(gf, embeddings);
112
113    return gf;
114}