1#include "models.h"
  2
  3llm_build_minicpm3::llm_build_minicpm3(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
  4    //TODO: if the model varies, these parameters need to be read from the model
  5    const int64_t n_embd_base = 256;
  6    const float scale_embd  = 12.0f;
  7    const float scale_depth = 1.4f;
  8    const float kq_scale = 1.0f / sqrtf(float(hparams.n_embd_head_k));
  9
 10    const uint32_t n_embd_head_qk_rope = hparams.n_rot;
 11    const uint32_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot;
 12
 13    const uint32_t kv_lora_rank = hparams.n_lora_kv;
 14
 15    ggml_tensor * cur;
 16    ggml_tensor * inpL;
 17
 18    inpL = build_inp_embd(model.tok_embd);
 19
 20    // scale the input embeddings
 21    inpL = ggml_scale(ctx0, inpL, scale_embd);
 22    cb(inpL, "inp_scaled", -1);
 23
 24    // inp_pos - contains the positions
 25    ggml_tensor * inp_pos = build_inp_pos();
 26
 27    auto * inp_attn = build_attn_inp_kv();
 28
 29    ggml_tensor * inp_out_ids = build_inp_out_ids();
 30
 31    for (int il = 0; il < n_layer; ++il) {
 32        ggml_tensor * inpSA = inpL;
 33
 34        ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
 35
 36        // norm
 37        cur = build_norm(inpL,
 38                model.layers[il].attn_norm, NULL,
 39                LLM_NORM_RMS, il);
 40        cb(cur, "attn_norm", il);
 41
 42        // self_attention
 43        {
 44            ggml_tensor * q = NULL;
 45            // {n_embd, q_lora_rank} * {n_embd, n_tokens} -> {q_lora_rank, n_tokens}
 46            q = ggml_mul_mat(ctx0, model.layers[il].wq_a, cur);
 47            cb(q, "q", il);
 48
 49            q = build_norm(q,
 50                    model.layers[il].attn_q_a_norm, NULL,
 51                    LLM_NORM_RMS, il);
 52            cb(q, "q", il);
 53
 54            // {q_lora_rank, n_head * hparams.n_embd_head_k} * {q_lora_rank, n_tokens} -> {n_head * hparams.n_embd_head_k, n_tokens}
 55            q = ggml_mul_mat(ctx0, model.layers[il].wq_b, q);
 56            cb(q, "q", il);
 57
 58            // split into {n_head * n_embd_head_qk_nope, n_tokens}
 59            ggml_tensor * q_nope = ggml_view_3d(ctx0, q, n_embd_head_qk_nope, n_head, n_tokens,
 60                    ggml_row_size(q->type, hparams.n_embd_head_k),
 61                    ggml_row_size(q->type, hparams.n_embd_head_k * n_head),
 62                    0);
 63            cb(q_nope, "q_nope", il);
 64
 65            // and {n_head * n_embd_head_qk_rope, n_tokens}
 66            ggml_tensor * q_pe = ggml_view_3d(ctx0, q, n_embd_head_qk_rope, n_head, n_tokens,
 67                    ggml_row_size(q->type, hparams.n_embd_head_k),
 68                    ggml_row_size(q->type, hparams.n_embd_head_k * n_head),
 69                    ggml_row_size(q->type, n_embd_head_qk_nope));
 70            cb(q_pe, "q_pe", il);
 71
 72            // {n_embd, kv_lora_rank + n_embd_head_qk_rope} * {n_embd, n_tokens} -> {kv_lora_rank + n_embd_head_qk_rope, n_tokens}
 73            ggml_tensor * kv_pe_compresseed = ggml_mul_mat(ctx0, model.layers[il].wkv_a_mqa, cur);
 74            cb(kv_pe_compresseed, "kv_pe_compresseed", il);
 75
 76            // split into {kv_lora_rank, n_tokens}
 77            ggml_tensor * kv_compressed = ggml_view_2d(ctx0, kv_pe_compresseed, kv_lora_rank, n_tokens,
 78                    kv_pe_compresseed->nb[1],
 79                    0);
 80            cb(kv_compressed, "kv_compressed", il);
 81
 82            // and {n_embd_head_qk_rope, n_tokens}
 83            ggml_tensor * k_pe = ggml_view_3d(ctx0, kv_pe_compresseed, n_embd_head_qk_rope, 1, n_tokens,
 84                    kv_pe_compresseed->nb[1],
 85                    kv_pe_compresseed->nb[1],
 86                    ggml_row_size(kv_pe_compresseed->type, kv_lora_rank));
 87            cb(k_pe, "k_pe", il);
 88
 89            kv_compressed = build_norm(kv_compressed,
 90                    model.layers[il].attn_kv_a_norm, NULL,
 91                    LLM_NORM_RMS, il);
 92            cb(kv_compressed, "kv_compressed", il);
 93
 94            // {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)} * {kv_lora_rank, n_tokens} -> {n_head * (n_embd_head_qk_nope + n_embd_head_v), n_tokens}
 95            ggml_tensor * kv = ggml_mul_mat(ctx0, model.layers[il].wkv_b, kv_compressed);
 96            cb(kv, "kv", il);
 97
 98            // split into {n_head * n_embd_head_qk_nope, n_tokens}
 99            ggml_tensor * k_nope = ggml_view_3d(ctx0, kv, n_embd_head_qk_nope, n_head, n_tokens,
100                    ggml_row_size(kv->type, n_embd_head_qk_nope + hparams.n_embd_head_v),
101                    ggml_row_size(kv->type, n_head * (n_embd_head_qk_nope + hparams.n_embd_head_v)),
102                    0);
103            cb(k_nope, "k_nope", il);
104
105            // and {n_head * n_embd_head_v, n_tokens}
106            ggml_tensor * v_states = ggml_view_3d(ctx0, kv, hparams.n_embd_head_v, n_head, n_tokens,
107                    ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)),
108                    ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)*n_head),
109                    ggml_row_size(kv->type, (n_embd_head_qk_nope)));
110            cb(v_states, "v_states", il);
111
112            v_states = ggml_cont(ctx0, v_states);
113            cb(v_states, "v_states", il);
114
115            q_pe = ggml_rope_ext(
116                    ctx0, q_pe, inp_pos, rope_factors,
117                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
118                    ext_factor, attn_factor, beta_fast, beta_slow
119                    );
120            cb(q_pe, "q_pe", il);
121
122            // shared RoPE key
123            k_pe = ggml_rope_ext(
124                    ctx0, k_pe, inp_pos, rope_factors,
125                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
126                    ext_factor, attn_factor, beta_fast, beta_slow
127                    );
128            cb(k_pe, "k_pe", il);
129
130            ggml_tensor * q_states = ggml_concat(ctx0, q_nope, q_pe, 0);
131            cb(q_states, "q_states", il);
132
133            ggml_tensor * k_states = ggml_concat(ctx0, k_nope, ggml_repeat(ctx0, k_pe, q_pe), 0);
134            cb(k_states, "k_states", il);
135
136            cur = build_attn(inp_attn,
137                    model.layers[il].wo, NULL,
138                    q_states, k_states, v_states, nullptr, nullptr, nullptr, kq_scale, il);
139        }
140        if (il == n_layer - 1 && inp_out_ids) {
141            cur   = ggml_get_rows(ctx0,   cur, inp_out_ids);
142            inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
143        }
144        // scale_res - scale the hidden states for residual connection
145        const float scale_res = scale_depth/sqrtf(float(n_layer)); // TODO: is this correct?
146        cur = ggml_scale(ctx0, cur, scale_res);
147        cb(cur, "hidden_scaled", il);
148
149        ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
150        cb(ffn_inp, "ffn_inp", il);
151
152        // feed-forward network
153        {
154            cur = build_norm(ffn_inp,
155                    model.layers[il].ffn_norm, NULL,
156                    LLM_NORM_RMS, il);
157            cb(cur, "ffn_norm", il);
158
159            cur = build_ffn(cur,
160                    model.layers[il].ffn_up,   NULL, NULL,
161                    model.layers[il].ffn_gate, NULL, NULL,
162                    model.layers[il].ffn_down, NULL, NULL,
163                    NULL,
164                    LLM_FFN_SILU, LLM_FFN_PAR, il);
165            cb(cur, "ffn_out", il);
166        }
167        // scale the hidden states for residual connection
168        cur = ggml_scale(ctx0, cur, scale_res);
169        cb(cur, "hidden_scaled_ffn", il);
170
171        cur = ggml_add(ctx0, cur, ffn_inp);
172
173        cur = build_cvec(cur, il);
174        cb(cur, "l_out", il);
175
176        // input for next layer
177        inpL = cur;
178    }
179    cur = inpL;
180
181    cur = build_norm(cur,
182            model.output_norm, NULL,
183            LLM_NORM_RMS, -1);
184
185    cb(cur, "result_norm", -1);
186    res->t_embd = cur;
187
188    // lm_head scaling
189    const float scale_lmhead = float(n_embd_base)/float(n_embd);
190    cur = ggml_scale(ctx0, cur, scale_lmhead);
191    cb(cur, "lmhead_scaling", -1);
192
193    // lm_head
194    cur = build_lora_mm(model.output, cur);
195
196    cb(cur, "result_output", -1);
197    res->t_logits = cur;
198
199    ggml_build_forward_expand(gf, cur);
200}