1#include "models.h"
  2
  3ggml_cgraph * clip_graph_whisper_enc::build() {
  4    const int n_frames = img.nx;
  5    const int n_pos    = n_frames / 2;
  6    GGML_ASSERT(model.position_embeddings->ne[1] >= n_pos);
  7
  8    ggml_tensor * inp = build_inp_raw(1);
  9
 10    // conv1d block
 11    {
 12        // convolution + gelu
 13        ggml_tensor * cur = ggml_conv_1d_ph(ctx0, model.conv1d_1_w, inp, 1, 1);
 14        cur = ggml_add(ctx0, cur, model.conv1d_1_b);
 15
 16        cur = ggml_gelu_erf(ctx0, cur);
 17
 18        cur = ggml_conv_1d_ph(ctx0, model.conv1d_2_w, cur, 2, 1);
 19        cur = ggml_add(ctx0, cur, model.conv1d_2_b);
 20
 21        cur = ggml_gelu_erf(ctx0, cur);
 22        // transpose
 23        inp = ggml_cont(ctx0, ggml_transpose(ctx0, cur));
 24        cb(inp, "after_conv1d", -1);
 25    }
 26
 27    // sanity check (only check one layer, but it should be the same for all)
 28    GGML_ASSERT(model.layers[0].ln_1_w && model.layers[0].ln_1_b);
 29    GGML_ASSERT(model.layers[0].ln_2_w && model.layers[0].ln_2_b);
 30    GGML_ASSERT(model.layers[0].q_b);
 31    GGML_ASSERT(model.layers[0].v_b);
 32    GGML_ASSERT(!model.layers[0].k_b); // no bias for k
 33
 34    ggml_tensor * pos_embd_selected = ggml_view_2d(
 35        ctx0, model.position_embeddings,
 36        model.position_embeddings->ne[0], n_pos,
 37        model.position_embeddings->nb[1], 0
 38    );
 39    ggml_tensor * cur = build_vit(
 40                            inp, n_pos,
 41                            NORM_TYPE_NORMAL,
 42                            hparams.ffn_op,
 43                            pos_embd_selected,
 44                            nullptr);
 45
 46    cb(cur, "after_transformer", -1);
 47
 48    if (model.audio_has_stack_frames()) {
 49        // StackAudioFrames
 50        // https://huggingface.co/fixie-ai/ultravox-v0_5-llama-3_2-1b/blob/main/ultravox_model.py
 51        cur = build_stack(cur, hparams.proj_stack_factor, n_embd);
 52        cb(cur, "after_stacked", -1);
 53    }
 54
 55    if (proj_type == PROJECTOR_TYPE_ULTRAVOX) {
 56        // UltravoxProjector
 57        // pre-norm
 58        cur = ggml_rms_norm(ctx0, cur, 1e-6);
 59        cur = ggml_mul(ctx0, cur, model.mm_norm_pre_w);
 60
 61        // ffn in
 62        cur = ggml_mul_mat(ctx0, model.mm_1_w, cur);
 63
 64        // swiglu
 65        // see SwiGLU in ultravox_model.py, the second half passed through is silu, not the first half
 66        cur = ggml_swiglu_swapped(ctx0, cur);
 67
 68        // mid-norm
 69        cur = ggml_rms_norm(ctx0, cur, 1e-6);
 70        cur = ggml_mul(ctx0, cur, model.mm_norm_mid_w);
 71
 72        // ffn out
 73        cur = ggml_mul_mat(ctx0, model.mm_2_w, cur);
 74
 75    } else if (proj_type == PROJECTOR_TYPE_QWEN2A) {
 76        // projector
 77        cur = ggml_mul_mat(ctx0, model.mm_fc_w, cur);
 78        cur = ggml_add(ctx0, cur, model.mm_fc_b);
 79
 80    } else if (proj_type == PROJECTOR_TYPE_VOXTRAL) {
 81        // projector
 82        cur = build_ffn(cur,
 83            model.mm_1_w, model.mm_1_b,
 84            nullptr, nullptr,
 85            model.mm_2_w, model.mm_2_b,
 86            FFN_GELU_ERF,
 87            -1);
 88
 89    } else if (proj_type == PROJECTOR_TYPE_MUSIC_FLAMINGO) {
 90        // projector
 91        cur = build_ffn(cur,
 92            model.mm_1_w, model.mm_1_b,
 93            nullptr, nullptr,
 94            model.mm_2_w, model.mm_2_b,
 95            FFN_GELU_ERF,
 96            -1);
 97
 98    } else if (proj_type == PROJECTOR_TYPE_GLMA) {
 99            cur = ggml_norm(ctx0, cur, hparams.eps);
100            cur = ggml_mul(ctx0, cur, model.mm_norm_pre_w);
101            cur = ggml_add(ctx0, cur, model.mm_norm_pre_b);
102            cur = build_stack(cur, hparams.proj_stack_factor, n_embd);
103            cur = build_ffn(cur, model.mm_1_w, model.mm_1_b, nullptr, nullptr, model.mm_2_w, model.mm_2_b, hparams.ffn_op, 0);
104            cur = ggml_concat(ctx0, model.mm_boi, cur, 1);
105            cur = ggml_concat(ctx0, cur, model.mm_eoi, 1);
106    } else {
107        GGML_ABORT("%s: unknown projector type", __func__);
108    }
109
110    cb(cur, "projected", -1);
111
112    ggml_build_forward_expand(gf, cur);
113
114    return gf;
115}