1#include "models.h"
2
3ggml_cgraph * clip_graph_whisper_enc::build() {
4 const int n_frames = img.nx;
5 const int n_pos = n_frames / 2;
6 GGML_ASSERT(model.position_embeddings->ne[1] >= n_pos);
7
8 ggml_tensor * inp = build_inp_raw(1);
9
10 // conv1d block
11 {
12 // convolution + gelu
13 ggml_tensor * cur = ggml_conv_1d_ph(ctx0, model.conv1d_1_w, inp, 1, 1);
14 cur = ggml_add(ctx0, cur, model.conv1d_1_b);
15
16 cur = ggml_gelu_erf(ctx0, cur);
17
18 cur = ggml_conv_1d_ph(ctx0, model.conv1d_2_w, cur, 2, 1);
19 cur = ggml_add(ctx0, cur, model.conv1d_2_b);
20
21 cur = ggml_gelu_erf(ctx0, cur);
22 // transpose
23 inp = ggml_cont(ctx0, ggml_transpose(ctx0, cur));
24 cb(inp, "after_conv1d", -1);
25 }
26
27 // sanity check (only check one layer, but it should be the same for all)
28 GGML_ASSERT(model.layers[0].ln_1_w && model.layers[0].ln_1_b);
29 GGML_ASSERT(model.layers[0].ln_2_w && model.layers[0].ln_2_b);
30 GGML_ASSERT(model.layers[0].q_b);
31 GGML_ASSERT(model.layers[0].v_b);
32 GGML_ASSERT(!model.layers[0].k_b); // no bias for k
33
34 ggml_tensor * pos_embd_selected = ggml_view_2d(
35 ctx0, model.position_embeddings,
36 model.position_embeddings->ne[0], n_pos,
37 model.position_embeddings->nb[1], 0
38 );
39 ggml_tensor * cur = build_vit(
40 inp, n_pos,
41 NORM_TYPE_NORMAL,
42 hparams.ffn_op,
43 pos_embd_selected,
44 nullptr);
45
46 cb(cur, "after_transformer", -1);
47
48 if (model.audio_has_stack_frames()) {
49 // StackAudioFrames
50 // https://huggingface.co/fixie-ai/ultravox-v0_5-llama-3_2-1b/blob/main/ultravox_model.py
51 cur = build_stack(cur, hparams.proj_stack_factor, n_embd);
52 cb(cur, "after_stacked", -1);
53 }
54
55 if (proj_type == PROJECTOR_TYPE_ULTRAVOX) {
56 // UltravoxProjector
57 // pre-norm
58 cur = ggml_rms_norm(ctx0, cur, 1e-6);
59 cur = ggml_mul(ctx0, cur, model.mm_norm_pre_w);
60
61 // ffn in
62 cur = ggml_mul_mat(ctx0, model.mm_1_w, cur);
63
64 // swiglu
65 // see SwiGLU in ultravox_model.py, the second half passed through is silu, not the first half
66 cur = ggml_swiglu_swapped(ctx0, cur);
67
68 // mid-norm
69 cur = ggml_rms_norm(ctx0, cur, 1e-6);
70 cur = ggml_mul(ctx0, cur, model.mm_norm_mid_w);
71
72 // ffn out
73 cur = ggml_mul_mat(ctx0, model.mm_2_w, cur);
74
75 } else if (proj_type == PROJECTOR_TYPE_QWEN2A) {
76 // projector
77 cur = ggml_mul_mat(ctx0, model.mm_fc_w, cur);
78 cur = ggml_add(ctx0, cur, model.mm_fc_b);
79
80 } else if (proj_type == PROJECTOR_TYPE_VOXTRAL) {
81 // projector
82 cur = build_ffn(cur,
83 model.mm_1_w, model.mm_1_b,
84 nullptr, nullptr,
85 model.mm_2_w, model.mm_2_b,
86 FFN_GELU_ERF,
87 -1);
88
89 } else if (proj_type == PROJECTOR_TYPE_MUSIC_FLAMINGO) {
90 // projector
91 cur = build_ffn(cur,
92 model.mm_1_w, model.mm_1_b,
93 nullptr, nullptr,
94 model.mm_2_w, model.mm_2_b,
95 FFN_GELU_ERF,
96 -1);
97
98 } else if (proj_type == PROJECTOR_TYPE_GLMA) {
99 cur = ggml_norm(ctx0, cur, hparams.eps);
100 cur = ggml_mul(ctx0, cur, model.mm_norm_pre_w);
101 cur = ggml_add(ctx0, cur, model.mm_norm_pre_b);
102 cur = build_stack(cur, hparams.proj_stack_factor, n_embd);
103 cur = build_ffn(cur, model.mm_1_w, model.mm_1_b, nullptr, nullptr, model.mm_2_w, model.mm_2_b, hparams.ffn_op, 0);
104 cur = ggml_concat(ctx0, model.mm_boi, cur, 1);
105 cur = ggml_concat(ctx0, cur, model.mm_eoi, 1);
106 } else {
107 GGML_ABORT("%s: unknown projector type", __func__);
108 }
109
110 cb(cur, "projected", -1);
111
112 ggml_build_forward_expand(gf, cur);
113
114 return gf;
115}