1#include "models.h"
2
3ggml_cgraph * clip_graph_minicpmv::build() {
4 GGML_ASSERT(model.class_embedding == nullptr);
5 const int n_pos = n_patches;
6 const int n_embd_proj = n_mmproj_embd;
7
8 // position embeddings for the projector (not for ViT)
9 // see: https://huggingface.co/openbmb/MiniCPM-o-2_6/blob/main/resampler.py#L70
10 // base frequency omega
11 ggml_tensor * omega = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, n_embd_proj / 4);
12 ggml_set_name(omega, "omega");
13 ggml_set_input(omega);
14
15 // 2D input positions (using float for sinusoidal embeddings)
16 ggml_tensor * pos_h = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, 1, n_pos);
17 ggml_set_name(pos_h, "pos_h");
18 ggml_set_input(pos_h);
19 ggml_tensor * pos_w = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, 1, n_pos);
20 ggml_set_name(pos_w, "pos_w");
21 ggml_set_input(pos_w);
22
23 // for selecting learned pos embd, used by ViT
24 struct ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos);
25 ggml_set_name(positions, "positions");
26 ggml_set_input(positions);
27
28 ggml_tensor * learned_pos_embd = ggml_get_rows(ctx0, model.position_embeddings, positions);
29
30 ggml_tensor * inp = build_inp();
31 ggml_tensor * embeddings = build_vit(
32 inp, n_pos,
33 NORM_TYPE_NORMAL,
34 hparams.ffn_op,
35 learned_pos_embd,
36 nullptr);
37
38 // resampler projector (it is just another transformer)
39
40 ggml_tensor * q = model.mm_model_query;
41 ggml_tensor * v = ggml_mul_mat(ctx0, model.mm_model_kv_proj, embeddings);
42
43 // norm
44 q = build_norm(q, model.mm_model_ln_q_w, model.mm_model_ln_q_b, NORM_TYPE_NORMAL, eps, -1);
45 v = build_norm(v, model.mm_model_ln_kv_w, model.mm_model_ln_kv_b, NORM_TYPE_NORMAL, eps, -1);
46
47 // calculate sinusoidal pos embd
48 ggml_tensor * pos_embed = nullptr;
49 {
50 // outer product
51 ggml_tensor * omega_b = ggml_repeat_4d(ctx0, omega, omega->ne[0], n_pos, 1, 1); // n_pos rows
52 ggml_tensor * theta_x = ggml_mul(ctx0, omega_b, pos_w);
53 ggml_tensor * theta_y = ggml_mul(ctx0, omega_b, pos_h);
54 // sin and cos
55 ggml_tensor * pos_embd_x = ggml_concat(
56 ctx0,
57 ggml_sin(ctx0, theta_x),
58 ggml_cos(ctx0, theta_x),
59 0 // concat on first dim
60 );
61 ggml_tensor * pos_embd_y = ggml_concat(
62 ctx0,
63 ggml_sin(ctx0, theta_y),
64 ggml_cos(ctx0, theta_y),
65 0 // concat on first dim
66 );
67 pos_embed = ggml_concat(ctx0, pos_embd_x, pos_embd_y, 0);
68 }
69
70 // k = v + pos_embed
71 ggml_tensor * k = ggml_add(ctx0, v, pos_embed);
72
73 // attention
74 {
75 const int d_head = 128;
76 int n_head = n_embd_proj/d_head;
77 // Use actual config value if available, otherwise fall back to hardcoded values
78 int num_query = hparams.minicpmv_query_num;
79 ggml_tensor * Q = ggml_add(ctx0,
80 ggml_mul_mat(ctx0, model.mm_model_attn_q_w, q),
81 model.mm_model_attn_q_b);
82 ggml_tensor * K = ggml_add(ctx0,
83 ggml_mul_mat(ctx0, model.mm_model_attn_k_w, k),
84 model.mm_model_attn_k_b);
85 ggml_tensor * V = ggml_add(ctx0,
86 ggml_mul_mat(ctx0, model.mm_model_attn_v_w, v),
87 model.mm_model_attn_v_b);
88
89 Q = ggml_reshape_3d(ctx0, Q, d_head, n_head, num_query);
90 K = ggml_reshape_3d(ctx0, K, d_head, n_head, n_pos);
91 V = ggml_reshape_3d(ctx0, V, d_head, n_head, n_pos);
92
93 cb(Q, "resampler_Q", -1);
94 cb(K, "resampler_K", -1);
95 cb(V, "resampler_V", -1);
96
97 float resampler_kq_scale = 1.0f/ sqrtf(float(d_head));
98 embeddings = build_attn(
99 model.mm_model_attn_o_w,
100 model.mm_model_attn_o_b,
101 Q, K, V, nullptr, resampler_kq_scale, -1);
102 cb(embeddings, "resampler_attn_out", -1);
103 }
104 // layernorm
105 embeddings = build_norm(embeddings, model.mm_model_ln_post_w, model.mm_model_ln_post_b, NORM_TYPE_NORMAL, eps, -1);
106
107 // projection
108 embeddings = ggml_mul_mat(ctx0, model.mm_model_proj, embeddings);
109
110 // build the graph
111 ggml_build_forward_expand(gf, embeddings);
112
113 return gf;
114}