diff options
| author | Mitja Felicijan <mitja.felicijan@gmail.com> | 2026-02-12 20:57:17 +0100 |
|---|---|---|
| committer | Mitja Felicijan <mitja.felicijan@gmail.com> | 2026-02-12 20:57:17 +0100 |
| commit | b333b06772c89d96aacb5490d6a219fba7c09cc6 (patch) | |
| tree | 211df60083a5946baa2ed61d33d8121b7e251b06 /llama.cpp/src/models/models.h | |
| download | llmnpc-b333b06772c89d96aacb5490d6a219fba7c09cc6.tar.gz | |
Engage!
Diffstat (limited to 'llama.cpp/src/models/models.h')
| -rw-r--r-- | llama.cpp/src/models/models.h | 723 |
1 file changed, 723 insertions, 0 deletions
diff --git a/llama.cpp/src/models/models.h b/llama.cpp/src/models/models.h new file mode 100644 index 0000000..3c66d32 --- /dev/null +++ b/llama.cpp/src/models/models.h | |||
| @@ -0,0 +1,723 @@ | |||
#pragma once

#include "../llama-model.h"
#include "../llama-graph.h"

// TODO: remove in follow-up PR - move to .cpp files
#include "../llama-memory-recurrent.h"

#include <cmath>
#include <cstdint>   // int64_t (member fields below)
#include <utility>   // std::pair (linear-attention builder return types)
| 10 | struct llm_graph_context_mamba : public llm_graph_context { | ||
| 11 | llm_graph_context_mamba(const llm_graph_params & params); | ||
| 12 | |||
| 13 | virtual ~llm_graph_context_mamba() = default; | ||
| 14 | |||
| 15 | ggml_tensor * build_mamba_layer(llm_graph_input_rs * inp, ggml_tensor * cur, const llama_model & model, const llama_ubatch & ubatch, int il); | ||
| 16 | ggml_tensor * build_mamba2_layer(llm_graph_input_rs * inp, ggml_tensor * cur, const llama_model & model, const llama_ubatch & ubatch, int il) const; | ||
| 17 | |||
| 18 | }; | ||
| 19 | |||
| 20 | // Base class for RWKV-related models | ||
| 21 | struct llm_build_rwkv6_base : public llm_graph_context { | ||
| 22 | const llama_model & model; | ||
| 23 | |||
| 24 | llm_build_rwkv6_base(const llama_model & model, const llm_graph_params & params); | ||
| 25 | |||
| 26 | virtual ~llm_build_rwkv6_base() = default; | ||
| 27 | |||
| 28 | ggml_tensor * build_rwkv6_channel_mix(const llama_layer * layer, | ||
| 29 | ggml_tensor * cur, | ||
| 30 | ggml_tensor * x_prev, | ||
| 31 | llm_arch arch) const; | ||
| 32 | |||
| 33 | ggml_tensor * build_rwkv6_time_mix(llm_graph_input_rs * inp, | ||
| 34 | ggml_tensor * cur, | ||
| 35 | ggml_tensor * x_prev, | ||
| 36 | const llama_ubatch & ubatch, | ||
| 37 | int il) const; | ||
| 38 | }; | ||
| 39 | |||
| 40 | // Base class for RWKV7-related models | ||
| 41 | struct llm_build_rwkv7_base : public llm_graph_context { | ||
| 42 | const llama_model & model; | ||
| 43 | |||
| 44 | llm_build_rwkv7_base(const llama_model & model, const llm_graph_params & params); | ||
| 45 | |||
| 46 | virtual ~llm_build_rwkv7_base() = default; | ||
| 47 | |||
| 48 | // RWKV7-specific graph building methods | ||
| 49 | ggml_tensor * build_rwkv7_channel_mix(const llama_layer * layer, | ||
| 50 | ggml_tensor * cur, | ||
| 51 | ggml_tensor * x_prev, | ||
| 52 | llm_arch arch) const; | ||
| 53 | ggml_tensor * build_rwkv7_time_mix(llm_graph_input_rs * inp, | ||
| 54 | ggml_tensor * cur, | ||
| 55 | ggml_tensor * x_prev, | ||
| 56 | ggml_tensor *& first_layer_value, | ||
| 57 | const llama_ubatch & ubatch, | ||
| 58 | int il) const; | ||
| 59 | }; | ||
| 60 | |||
| 61 | struct llm_build_afmoe : public llm_graph_context { | ||
| 62 | llm_build_afmoe(const llama_model & model, const llm_graph_params & params); | ||
| 63 | }; | ||
| 64 | |||
| 65 | struct llm_build_apertus : public llm_graph_context { | ||
| 66 | llm_build_apertus(const llama_model & model, const llm_graph_params & params); | ||
| 67 | }; | ||
| 68 | |||
| 69 | struct llm_build_arcee : public llm_graph_context { | ||
| 70 | llm_build_arcee(const llama_model & model, const llm_graph_params & params); | ||
| 71 | }; | ||
| 72 | |||
| 73 | struct llm_build_arctic : public llm_graph_context { | ||
| 74 | llm_build_arctic(const llama_model & model, const llm_graph_params & params); | ||
| 75 | }; | ||
| 76 | |||
| 77 | struct llm_build_arwkv7 : public llm_build_rwkv7_base { | ||
| 78 | llm_build_arwkv7(const llama_model & model, const llm_graph_params & params); | ||
| 79 | }; | ||
| 80 | |||
| 81 | struct llm_build_baichuan : public llm_graph_context { | ||
| 82 | llm_build_baichuan(const llama_model & model, const llm_graph_params & params); | ||
| 83 | }; | ||
| 84 | |||
| 85 | struct llm_build_bailingmoe2 : public llm_graph_context { | ||
| 86 | llm_build_bailingmoe2(const llama_model & model, const llm_graph_params & params); | ||
| 87 | }; | ||
| 88 | |||
| 89 | struct llm_build_bailingmoe : public llm_graph_context { | ||
| 90 | llm_build_bailingmoe(const llama_model & model, const llm_graph_params & params); | ||
| 91 | }; | ||
| 92 | |||
| 93 | struct llm_build_bert : public llm_graph_context { | ||
| 94 | llm_build_bert(const llama_model & model, const llm_graph_params & params); | ||
| 95 | }; | ||
| 96 | |||
| 97 | struct llm_build_bitnet : public llm_graph_context { | ||
| 98 | llm_build_bitnet(const llama_model & model, const llm_graph_params & params); | ||
| 99 | }; | ||
| 100 | |||
| 101 | struct llm_build_bloom : public llm_graph_context { | ||
| 102 | llm_build_bloom(const llama_model & model, const llm_graph_params & params); | ||
| 103 | }; | ||
| 104 | |||
| 105 | struct llm_build_chameleon : public llm_graph_context { | ||
| 106 | llm_build_chameleon(const llama_model & model, const llm_graph_params & params); | ||
| 107 | }; | ||
| 108 | |||
| 109 | struct llm_build_chatglm : public llm_graph_context { | ||
| 110 | llm_build_chatglm(const llama_model & model, const llm_graph_params & params); | ||
| 111 | }; | ||
| 112 | |||
| 113 | struct llm_build_codeshell : public llm_graph_context { | ||
| 114 | llm_build_codeshell(const llama_model & model, const llm_graph_params & params); | ||
| 115 | }; | ||
| 116 | |||
| 117 | struct llm_build_cogvlm : public llm_graph_context { | ||
| 118 | llm_build_cogvlm(const llama_model & model, const llm_graph_params & params); | ||
| 119 | }; | ||
| 120 | |||
| 121 | struct llm_build_cohere2_iswa : public llm_graph_context { | ||
| 122 | llm_build_cohere2_iswa(const llama_model & model, const llm_graph_params & params); | ||
| 123 | }; | ||
| 124 | |||
| 125 | struct llm_build_command_r : public llm_graph_context { | ||
| 126 | llm_build_command_r(const llama_model & model, const llm_graph_params & params); | ||
| 127 | }; | ||
| 128 | |||
| 129 | struct llm_build_dbrx : public llm_graph_context { | ||
| 130 | llm_build_dbrx(const llama_model & model, const llm_graph_params & params); | ||
| 131 | }; | ||
| 132 | |||
| 133 | struct llm_build_deci : public llm_graph_context { | ||
| 134 | llm_build_deci(const llama_model & model, const llm_graph_params & params); | ||
| 135 | }; | ||
| 136 | |||
| 137 | struct llm_build_deepseek2 : public llm_graph_context { | ||
| 138 | llm_build_deepseek2(const llama_model & model, const llm_graph_params & params); | ||
| 139 | }; | ||
| 140 | |||
| 141 | struct llm_build_deepseek : public llm_graph_context { | ||
| 142 | llm_build_deepseek(const llama_model & model, const llm_graph_params & params); | ||
| 143 | }; | ||
| 144 | |||
| 145 | struct llm_build_dots1 : public llm_graph_context { | ||
| 146 | llm_build_dots1(const llama_model & model, const llm_graph_params & params); | ||
| 147 | }; | ||
| 148 | |||
| 149 | struct llm_build_dream : public llm_graph_context { | ||
| 150 | llm_build_dream(const llama_model & model, const llm_graph_params & params); | ||
| 151 | }; | ||
| 152 | |||
| 153 | struct llm_build_ernie4_5 : public llm_graph_context { | ||
| 154 | llm_build_ernie4_5(const llama_model & model, const llm_graph_params & params); | ||
| 155 | }; | ||
| 156 | |||
| 157 | struct llm_build_ernie4_5_moe : public llm_graph_context { | ||
| 158 | llm_build_ernie4_5_moe(const llama_model & model, const llm_graph_params & params); | ||
| 159 | }; | ||
| 160 | |||
| 161 | template <bool iswa> | ||
| 162 | struct llm_build_exaone4 : public llm_graph_context { | ||
| 163 | llm_build_exaone4(const llama_model & model, const llm_graph_params & params); | ||
| 164 | }; | ||
| 165 | |||
| 166 | struct llm_build_exaone : public llm_graph_context { | ||
| 167 | llm_build_exaone(const llama_model & model, const llm_graph_params & params); | ||
| 168 | }; | ||
| 169 | |||
| 170 | struct llm_build_exaone_moe : public llm_graph_context { | ||
| 171 | llm_build_exaone_moe(const llama_model & model, const llm_graph_params & params); | ||
| 172 | }; | ||
| 173 | |||
| 174 | struct llm_build_falcon : public llm_graph_context { | ||
| 175 | llm_build_falcon(const llama_model & model, const llm_graph_params & params); | ||
| 176 | }; | ||
| 177 | |||
| 178 | struct llm_build_falcon_h1 : public llm_graph_context_mamba { | ||
| 179 | llm_build_falcon_h1(const llama_model & model, const llm_graph_params & params); | ||
| 180 | }; | ||
| 181 | |||
| 182 | struct llm_build_gemma2_iswa : public llm_graph_context { | ||
| 183 | llm_build_gemma2_iswa(const llama_model & model, const llm_graph_params & params); | ||
| 184 | }; | ||
| 185 | |||
| 186 | template <bool iswa> | ||
| 187 | struct llm_build_gemma3 : public llm_graph_context { | ||
| 188 | llm_build_gemma3(const llama_model & model, const llm_graph_params & params); | ||
| 189 | }; | ||
| 190 | |||
| 191 | struct llm_build_gemma3n_iswa : public llm_graph_context { | ||
| 192 | const llama_model & model; | ||
| 193 | |||
| 194 | const int64_t n_embd_head; | ||
| 195 | const int64_t n_embd_altup; | ||
| 196 | const int64_t n_altup; | ||
| 197 | const int i_altup_act; | ||
| 198 | const int n_layer_sparsity = 10; // number of layers using activation sparsity | ||
| 199 | const float f_sparsity_std_mul = 1.6448533535003662f; // std_multiplier = normal_dist.icdf(0.95) | ||
| 200 | |||
| 201 | llm_build_gemma3n_iswa(const llama_model & model, const llm_graph_params & params); | ||
| 202 | ggml_tensor * calc_magnitude(ggml_tensor * x); | ||
| 203 | ggml_tensor * view_2d_slice(ggml_tensor * x, int idx); | ||
| 204 | ggml_tensor * get_per_layer_inputs(); | ||
| 205 | ggml_tensor * project_per_layer_inputs(ggml_tensor * inputs_embeds, ggml_tensor * inp_per_layer); | ||
| 206 | ggml_tensor * gaussian_topk(ggml_tensor * x); | ||
| 207 | ggml_tensor * altup_compute_router_modalities(ggml_tensor * x, int il); | ||
| 208 | ggml_tensor * altup_predict(ggml_tensor * cur, int il); | ||
| 209 | ggml_tensor * laurel(ggml_tensor * cur, int il); | ||
| 210 | ggml_tensor * altup_correct(ggml_tensor * predictions, ggml_tensor * activated, int il); | ||
| 211 | }; | ||
| 212 | |||
| 213 | struct llm_build_gemma_embedding : public llm_graph_context { | ||
| 214 | llm_build_gemma_embedding(const llama_model & model, const llm_graph_params & params); | ||
| 215 | }; | ||
| 216 | |||
| 217 | struct llm_build_gemma : public llm_graph_context { | ||
| 218 | llm_build_gemma(const llama_model & model, const llm_graph_params & params); | ||
| 219 | }; | ||
| 220 | |||
| 221 | struct llm_build_glm4 : public llm_graph_context { | ||
| 222 | llm_build_glm4(const llama_model & model, const llm_graph_params & params); | ||
| 223 | }; | ||
| 224 | |||
| 225 | struct llm_build_glm4_moe : public llm_graph_context { | ||
| 226 | llm_build_glm4_moe(const llama_model & model, const llm_graph_params & params); | ||
| 227 | }; | ||
| 228 | |||
| 229 | struct llm_build_gpt2 : public llm_graph_context { | ||
| 230 | llm_build_gpt2(const llama_model & model, const llm_graph_params & params); | ||
| 231 | }; | ||
| 232 | |||
| 233 | struct llm_build_gptneox : public llm_graph_context { | ||
| 234 | llm_build_gptneox(const llama_model & model, const llm_graph_params & params); | ||
| 235 | }; | ||
| 236 | |||
| 237 | struct llm_build_granite : public llm_graph_context { | ||
| 238 | llm_build_granite(const llama_model & model, const llm_graph_params & params); | ||
| 239 | |||
| 240 | private: | ||
| 241 | ggml_tensor * build_attention_layer( | ||
| 242 | ggml_tensor * cur, | ||
| 243 | ggml_tensor * inp_pos, | ||
| 244 | llm_graph_input_attn_kv * inp_attn, | ||
| 245 | const llama_model & model, | ||
| 246 | const int64_t n_embd_head, | ||
| 247 | const int il); | ||
| 248 | |||
| 249 | ggml_tensor * build_layer_ffn( | ||
| 250 | ggml_tensor * cur, | ||
| 251 | ggml_tensor * inpSA, | ||
| 252 | const llama_model & model, | ||
| 253 | const int il); | ||
| 254 | }; | ||
| 255 | |||
| 256 | struct llm_build_granite_hybrid : public llm_graph_context_mamba { | ||
| 257 | llm_build_granite_hybrid(const llama_model & model, const llm_graph_params & params); | ||
| 258 | ggml_tensor * build_layer_ffn(ggml_tensor * cur, ggml_tensor * inpSA, const llama_model & model, const int il); | ||
| 259 | ggml_tensor * build_attention_layer(ggml_tensor * cur, ggml_tensor * inp_pos, llm_graph_input_attn_kv * inp_attn, | ||
| 260 | const llama_model & model,const int64_t n_embd_head, const int il); | ||
| 261 | }; | ||
| 262 | |||
| 263 | struct llm_build_grok : public llm_graph_context { | ||
| 264 | llm_build_grok(const llama_model & model, const llm_graph_params & params); | ||
| 265 | }; | ||
| 266 | |||
| 267 | struct llm_build_grovemoe : public llm_graph_context { | ||
| 268 | llm_build_grovemoe(const llama_model & model, const llm_graph_params & params); | ||
| 269 | }; | ||
| 270 | |||
| 271 | struct llm_build_hunyuan_dense : public llm_graph_context { | ||
| 272 | llm_build_hunyuan_dense(const llama_model & model, const llm_graph_params & params); | ||
| 273 | }; | ||
| 274 | |||
| 275 | struct llm_build_hunyuan_moe : public llm_graph_context { | ||
| 276 | llm_build_hunyuan_moe(const llama_model & model, const llm_graph_params & params); | ||
| 277 | }; | ||
| 278 | |||
| 279 | struct llm_build_internlm2 : public llm_graph_context { | ||
| 280 | llm_build_internlm2(const llama_model & model, const llm_graph_params & params); | ||
| 281 | }; | ||
| 282 | |||
| 283 | struct llm_build_jais : public llm_graph_context { | ||
| 284 | llm_build_jais(const llama_model & model, const llm_graph_params & params); | ||
| 285 | }; | ||
| 286 | |||
| 287 | struct llm_build_jamba : public llm_graph_context_mamba { | ||
| 288 | llm_build_jamba(const llama_model & model, const llm_graph_params & params); | ||
| 289 | }; | ||
| 290 | |||
| 291 | struct llm_build_kimi_linear : public llm_graph_context_mamba { | ||
| 292 | llm_build_kimi_linear(const llama_model & model, const llm_graph_params & params); | ||
| 293 | |||
| 294 | std::pair<ggml_tensor *, ggml_tensor *> build_kda_autoregressive( | ||
| 295 | ggml_tensor * q, | ||
| 296 | ggml_tensor * k, | ||
| 297 | ggml_tensor * v, | ||
| 298 | ggml_tensor * gk, | ||
| 299 | ggml_tensor * beta, | ||
| 300 | ggml_tensor * state, | ||
| 301 | int il); | ||
| 302 | |||
| 303 | std::pair<ggml_tensor *, ggml_tensor *> build_kda_chunking( | ||
| 304 | ggml_tensor * q, | ||
| 305 | ggml_tensor * k, | ||
| 306 | ggml_tensor * v, | ||
| 307 | ggml_tensor * gk, | ||
| 308 | ggml_tensor * beta, | ||
| 309 | ggml_tensor * state, | ||
| 310 | ggml_tensor * causal_mask, | ||
| 311 | ggml_tensor * identity, | ||
| 312 | ggml_tensor * diag_mask, | ||
| 313 | int il); | ||
| 314 | |||
| 315 | const llama_model & model; | ||
| 316 | }; | ||
| 317 | |||
| 318 | struct llm_build_lfm2 : public llm_graph_context { | ||
| 319 | const llama_model & model; | ||
| 320 | |||
| 321 | llm_build_lfm2(const llama_model & model, const llm_graph_params & params); | ||
| 322 | ggml_tensor * build_moe_feed_forward(ggml_tensor * cur, int il) const; | ||
| 323 | ggml_tensor * build_dense_feed_forward(ggml_tensor * cur, int il) const; | ||
| 324 | ggml_tensor * build_attn_block(ggml_tensor * cur, ggml_tensor * inp_pos, llm_graph_input_attn_kv * inp_attn, int il) const; | ||
| 325 | ggml_tensor * build_shortconv_block(ggml_tensor * cur, llm_graph_input_rs * inp_recr, int il); | ||
| 326 | |||
| 327 | }; | ||
| 328 | |||
| 329 | struct llm_build_llada : public llm_graph_context { | ||
| 330 | llm_build_llada(const llama_model & model, const llm_graph_params & params); | ||
| 331 | }; | ||
| 332 | |||
| 333 | struct llm_build_llada_moe : public llm_graph_context { | ||
| 334 | llm_build_llada_moe(const llama_model & model, const llm_graph_params & params); | ||
| 335 | }; | ||
| 336 | |||
| 337 | template <bool embed> | ||
| 338 | struct llm_build_llama : public llm_graph_context { | ||
| 339 | llm_build_llama(const llama_model & model, const llm_graph_params & params); | ||
| 340 | }; | ||
| 341 | |||
| 342 | struct llm_build_llama_iswa : public llm_graph_context { | ||
| 343 | llm_build_llama_iswa(const llama_model & model, const llm_graph_params & params); | ||
| 344 | }; | ||
| 345 | |||
| 346 | struct llm_build_maincoder : public llm_graph_context { | ||
| 347 | llm_build_maincoder(const llama_model & model, const llm_graph_params & params); | ||
| 348 | }; | ||
| 349 | |||
| 350 | struct llm_build_mamba : public llm_graph_context_mamba { | ||
| 351 | llm_build_mamba(const llama_model & model, const llm_graph_params & params); | ||
| 352 | }; | ||
| 353 | |||
| 354 | struct llm_build_mimo2_iswa : public llm_graph_context { | ||
| 355 | llm_build_mimo2_iswa(const llama_model & model, const llm_graph_params & params); | ||
| 356 | }; | ||
| 357 | |||
| 358 | struct llm_build_minicpm3 : public llm_graph_context { | ||
| 359 | llm_build_minicpm3(const llama_model & model, const llm_graph_params & params); | ||
| 360 | }; | ||
| 361 | |||
| 362 | struct llm_build_minimax_m2 : public llm_graph_context { | ||
| 363 | llm_build_minimax_m2(const llama_model & model, const llm_graph_params & params); | ||
| 364 | }; | ||
| 365 | |||
| 366 | struct llm_build_mistral3 : public llm_graph_context { | ||
| 367 | llm_build_mistral3(const llama_model & model, const llm_graph_params & params); | ||
| 368 | }; | ||
| 369 | |||
| 370 | struct llm_build_modern_bert : public llm_graph_context { | ||
| 371 | llm_build_modern_bert(const llama_model & model, const llm_graph_params & params); | ||
| 372 | }; | ||
| 373 | |||
| 374 | struct llm_build_mpt : public llm_graph_context { | ||
| 375 | llm_build_mpt(const llama_model & model, const llm_graph_params & params); | ||
| 376 | }; | ||
| 377 | |||
| 378 | struct llm_build_nemotron : public llm_graph_context { | ||
| 379 | llm_build_nemotron(const llama_model & model, const llm_graph_params & params); | ||
| 380 | }; | ||
| 381 | |||
| 382 | struct llm_build_nemotron_h : public llm_graph_context_mamba { | ||
| 383 | llm_build_nemotron_h(const llama_model & model, const llm_graph_params & params); | ||
| 384 | ggml_tensor * build_ffn_layer(ggml_tensor * cur, const llama_model & model, const int il); | ||
| 385 | ggml_tensor * build_attention_layer(ggml_tensor * cur, llm_graph_input_attn_kv * inp_attn, | ||
| 386 | const llama_model & model, const int64_t n_embd_head, const int il); | ||
| 387 | }; | ||
| 388 | |||
| 389 | struct llm_build_neo_bert : public llm_graph_context { | ||
| 390 | llm_build_neo_bert(const llama_model & model, const llm_graph_params & params); | ||
| 391 | }; | ||
| 392 | |||
| 393 | template <bool iswa> | ||
| 394 | struct llm_build_olmo2 : public llm_graph_context { | ||
| 395 | llm_build_olmo2(const llama_model & model, const llm_graph_params & params); | ||
| 396 | }; | ||
| 397 | |||
| 398 | struct llm_build_olmoe : public llm_graph_context { | ||
| 399 | llm_build_olmoe(const llama_model & model, const llm_graph_params & params); | ||
| 400 | }; | ||
| 401 | |||
| 402 | struct llm_build_olmo : public llm_graph_context { | ||
| 403 | llm_build_olmo(const llama_model & model, const llm_graph_params & params); | ||
| 404 | }; | ||
| 405 | |||
| 406 | struct llm_build_openai_moe_iswa : public llm_graph_context { | ||
| 407 | llm_build_openai_moe_iswa(const llama_model & model, const llm_graph_params & params); | ||
| 408 | }; | ||
| 409 | |||
| 410 | struct llm_build_openelm : public llm_graph_context { | ||
| 411 | llm_build_openelm(const llama_model & model, const llm_graph_params & params); | ||
| 412 | }; | ||
| 413 | |||
| 414 | struct llm_build_orion : public llm_graph_context { | ||
| 415 | llm_build_orion(const llama_model & model, const llm_graph_params & params); | ||
| 416 | }; | ||
| 417 | |||
| 418 | struct llm_build_pangu_embedded : public llm_graph_context { | ||
| 419 | llm_build_pangu_embedded(const llama_model & model, const llm_graph_params & params); | ||
| 420 | }; | ||
| 421 | |||
| 422 | struct llm_build_phi2 : public llm_graph_context { | ||
| 423 | llm_build_phi2(const llama_model & model, const llm_graph_params & params); | ||
| 424 | }; | ||
| 425 | |||
| 426 | template<bool iswa> | ||
| 427 | struct llm_build_phi3 : public llm_graph_context { | ||
| 428 | llm_build_phi3(const llama_model & model, const llm_graph_params & params); | ||
| 429 | }; | ||
| 430 | |||
| 431 | struct llm_build_plamo2 : public llm_graph_context_mamba { | ||
| 432 | llm_build_plamo2(const llama_model & model, const llm_graph_params & params); | ||
| 433 | private: | ||
| 434 | ggml_tensor * build_plamo2_mamba_layer(llm_graph_input_rs * inp, ggml_tensor * cur, const llama_model & model, const llama_ubatch & ubatch, int il); | ||
| 435 | ggml_tensor * build_plamo2_attn_layer(llm_graph_input_attn_kv * inp, ggml_tensor * inp_pos, ggml_tensor * cur, | ||
| 436 | const llama_model & model, int il); | ||
| 437 | }; | ||
| 438 | |||
| 439 | struct llm_build_plamo : public llm_graph_context { | ||
| 440 | llm_build_plamo(const llama_model & model, const llm_graph_params & params); | ||
| 441 | }; | ||
| 442 | |||
| 443 | template <bool iswa> | ||
| 444 | struct llm_build_plamo3 : public llm_graph_context { | ||
| 445 | llm_build_plamo3(const llama_model & model, const llm_graph_params & params); | ||
| 446 | }; | ||
| 447 | |||
| 448 | struct llm_build_plm : public llm_graph_context { | ||
| 449 | llm_build_plm(const llama_model & model, const llm_graph_params & params); | ||
| 450 | }; | ||
| 451 | |||
| 452 | struct llm_build_qwen2 : public llm_graph_context { | ||
| 453 | llm_build_qwen2(const llama_model & model, const llm_graph_params & params); | ||
| 454 | }; | ||
| 455 | |||
| 456 | struct llm_build_qwen2moe : public llm_graph_context { | ||
| 457 | llm_build_qwen2moe(const llama_model & model, const llm_graph_params & params); | ||
| 458 | }; | ||
| 459 | |||
| 460 | struct llm_build_qwen2vl : public llm_graph_context { | ||
| 461 | llm_build_qwen2vl(const llama_model & model, const llm_graph_params & params); | ||
| 462 | }; | ||
| 463 | |||
| 464 | struct llm_build_qwen3 : public llm_graph_context { | ||
| 465 | llm_build_qwen3(const llama_model & model, const llm_graph_params & params); | ||
| 466 | }; | ||
| 467 | |||
| 468 | struct llm_build_qwen3moe : public llm_graph_context { | ||
| 469 | llm_build_qwen3moe(const llama_model & model, const llm_graph_params & params); | ||
| 470 | }; | ||
| 471 | |||
| 472 | struct llm_build_qwen3vl : public llm_graph_context { | ||
| 473 | llm_build_qwen3vl(const llama_model & model, const llm_graph_params & params); | ||
| 474 | }; | ||
| 475 | |||
| 476 | struct llm_build_qwen3vlmoe : public llm_graph_context { | ||
| 477 | llm_build_qwen3vlmoe(const llama_model & model, const llm_graph_params & params); | ||
| 478 | }; | ||
| 479 | |||
| 480 | struct llm_build_qwen3next : public llm_graph_context_mamba { | ||
| 481 | llm_build_qwen3next(const llama_model & model, const llm_graph_params & params); | ||
| 482 | private: | ||
| 483 | ggml_tensor * build_layer_attn( | ||
| 484 | llm_graph_input_attn_kv * inp_attn, | ||
| 485 | ggml_tensor * cur, | ||
| 486 | ggml_tensor * inp_pos, | ||
| 487 | int il); | ||
| 488 | |||
| 489 | ggml_tensor * build_layer_attn_linear( | ||
| 490 | llm_graph_input_rs * inp, | ||
| 491 | ggml_tensor * cur, | ||
| 492 | ggml_tensor * causal_mask, | ||
| 493 | ggml_tensor * identity, | ||
| 494 | ggml_tensor * diag_mask, | ||
| 495 | int il); | ||
| 496 | |||
| 497 | ggml_tensor * build_layer_ffn( | ||
| 498 | ggml_tensor * cur, | ||
| 499 | int il); | ||
| 500 | |||
| 501 | // returns pair of output and new state | ||
| 502 | std::pair<ggml_tensor *, ggml_tensor *> build_delta_net_chunking( | ||
| 503 | ggml_tensor * q, | ||
| 504 | ggml_tensor * k, | ||
| 505 | ggml_tensor * v, | ||
| 506 | ggml_tensor * g, | ||
| 507 | ggml_tensor * beta, | ||
| 508 | ggml_tensor * state, | ||
| 509 | ggml_tensor * causal_mask, | ||
| 510 | ggml_tensor * identity, | ||
| 511 | ggml_tensor * diag_mask, | ||
| 512 | int il); | ||
| 513 | |||
| 514 | // returns pair of output and new state | ||
| 515 | std::pair<ggml_tensor *, ggml_tensor *> build_delta_net_autoregressive( | ||
| 516 | ggml_tensor * q, | ||
| 517 | ggml_tensor * k, | ||
| 518 | ggml_tensor * v, | ||
| 519 | ggml_tensor * g, | ||
| 520 | ggml_tensor * beta, | ||
| 521 | ggml_tensor * state, | ||
| 522 | int il); | ||
| 523 | |||
| 524 | ggml_tensor * build_norm_gated( | ||
| 525 | ggml_tensor * input, | ||
| 526 | ggml_tensor * weights, | ||
| 527 | ggml_tensor * gate, | ||
| 528 | int layer); | ||
| 529 | |||
| 530 | // returns pair of qkv, z | ||
| 531 | std::pair<ggml_tensor *, ggml_tensor *> build_qkvz( | ||
| 532 | ggml_tensor * input, | ||
| 533 | int il); | ||
| 534 | |||
| 535 | const llama_model & model; | ||
| 536 | }; | ||
| 537 | |||
| 538 | struct llm_build_qwen35 : public llm_graph_context_mamba { | ||
| 539 | llm_build_qwen35(const llama_model & model, const llm_graph_params & params); | ||
| 540 | private: | ||
| 541 | ggml_tensor * build_layer_attn( | ||
| 542 | llm_graph_input_attn_kv * inp_attn, | ||
| 543 | ggml_tensor * cur, | ||
| 544 | ggml_tensor * inp_pos, | ||
| 545 | int * sections, | ||
| 546 | int il); | ||
| 547 | |||
| 548 | ggml_tensor * build_layer_attn_linear( | ||
| 549 | llm_graph_input_rs * inp, | ||
| 550 | ggml_tensor * cur, | ||
| 551 | ggml_tensor * causal_mask, | ||
| 552 | ggml_tensor * identity, | ||
| 553 | ggml_tensor * diag_mask, | ||
| 554 | int il); | ||
| 555 | |||
| 556 | ggml_tensor * build_layer_ffn( | ||
| 557 | ggml_tensor * cur, | ||
| 558 | int il); | ||
| 559 | |||
| 560 | // returns pair of output and new state | ||
| 561 | std::pair<ggml_tensor *, ggml_tensor *> build_delta_net_chunking( | ||
| 562 | ggml_tensor * q, | ||
| 563 | ggml_tensor * k, | ||
| 564 | ggml_tensor * v, | ||
| 565 | ggml_tensor * g, | ||
| 566 | ggml_tensor * beta, | ||
| 567 | ggml_tensor * state, | ||
| 568 | ggml_tensor * causal_mask, | ||
| 569 | ggml_tensor * identity, | ||
| 570 | ggml_tensor * diag_mask, | ||
| 571 | int il); | ||
| 572 | |||
| 573 | // returns pair of output and new state | ||
| 574 | std::pair<ggml_tensor *, ggml_tensor *> build_delta_net_autoregressive( | ||
| 575 | ggml_tensor * q, | ||
| 576 | ggml_tensor * k, | ||
| 577 | ggml_tensor * v, | ||
| 578 | ggml_tensor * g, | ||
| 579 | ggml_tensor * beta, | ||
| 580 | ggml_tensor * state, | ||
| 581 | int il); | ||
| 582 | |||
| 583 | ggml_tensor * build_norm_gated( | ||
| 584 | ggml_tensor * input, | ||
| 585 | ggml_tensor * weights, | ||
| 586 | ggml_tensor * gate, | ||
| 587 | int layer); | ||
| 588 | |||
| 589 | // returns pair of qkv, z | ||
| 590 | std::pair<ggml_tensor *, ggml_tensor *> build_qkvz( | ||
| 591 | ggml_tensor * input, | ||
| 592 | int il); | ||
| 593 | |||
| 594 | const llama_model & model; | ||
| 595 | }; | ||
| 596 | |||
| 597 | struct llm_build_qwen35moe : public llm_graph_context_mamba { | ||
| 598 | llm_build_qwen35moe(const llama_model & model, const llm_graph_params & params); | ||
| 599 | private: | ||
| 600 | ggml_tensor * build_layer_attn( | ||
| 601 | llm_graph_input_attn_kv * inp_attn, | ||
| 602 | ggml_tensor * cur, | ||
| 603 | ggml_tensor * inp_pos, | ||
| 604 | int * sections, | ||
| 605 | int il); | ||
| 606 | |||
| 607 | ggml_tensor * build_layer_attn_linear( | ||
| 608 | llm_graph_input_rs * inp, | ||
| 609 | ggml_tensor * cur, | ||
| 610 | ggml_tensor * causal_mask, | ||
| 611 | ggml_tensor * identity, | ||
| 612 | ggml_tensor * diag_mask, | ||
| 613 | int il); | ||
| 614 | |||
| 615 | ggml_tensor * build_layer_ffn( | ||
| 616 | ggml_tensor * cur, | ||
| 617 | int il); | ||
| 618 | |||
| 619 | // returns pair of output and new state | ||
| 620 | std::pair<ggml_tensor *, ggml_tensor *> build_delta_net_chunking( | ||
| 621 | ggml_tensor * q, | ||
| 622 | ggml_tensor * k, | ||
| 623 | ggml_tensor * v, | ||
| 624 | ggml_tensor * g, | ||
| 625 | ggml_tensor * beta, | ||
| 626 | ggml_tensor * state, | ||
| 627 | ggml_tensor * causal_mask, | ||
| 628 | ggml_tensor * identity, | ||
| 629 | ggml_tensor * diag_mask, | ||
| 630 | int il); | ||
| 631 | |||
| 632 | // returns pair of output and new state | ||
| 633 | std::pair<ggml_tensor *, ggml_tensor *> build_delta_net_autoregressive( | ||
| 634 | ggml_tensor * q, | ||
| 635 | ggml_tensor * k, | ||
| 636 | ggml_tensor * v, | ||
| 637 | ggml_tensor * g, | ||
| 638 | ggml_tensor * beta, | ||
| 639 | ggml_tensor * state, | ||
| 640 | int il); | ||
| 641 | |||
| 642 | ggml_tensor * build_norm_gated( | ||
| 643 | ggml_tensor * input, | ||
| 644 | ggml_tensor * weights, | ||
| 645 | ggml_tensor * gate, | ||
| 646 | int layer); | ||
| 647 | |||
| 648 | // returns pair of qkv, z | ||
| 649 | std::pair<ggml_tensor *, ggml_tensor *> build_qkvz( | ||
| 650 | ggml_tensor * input, | ||
| 651 | int il); | ||
| 652 | |||
| 653 | const llama_model & model; | ||
| 654 | }; | ||
| 655 | |||
| 656 | struct llm_build_qwen : public llm_graph_context { | ||
| 657 | llm_build_qwen(const llama_model & model, const llm_graph_params & params); | ||
| 658 | }; | ||
| 659 | |||
| 660 | struct llm_build_refact : public llm_graph_context { | ||
| 661 | llm_build_refact(const llama_model & model, const llm_graph_params & params); | ||
| 662 | }; | ||
| 663 | |||
| 664 | struct llm_build_rnd1 : public llm_graph_context { | ||
| 665 | llm_build_rnd1(const llama_model & model, const llm_graph_params & params); | ||
| 666 | }; | ||
| 667 | |||
| 668 | struct llm_build_rwkv6 : public llm_build_rwkv6_base { | ||
| 669 | llm_build_rwkv6(const llama_model & model, const llm_graph_params & params); | ||
| 670 | }; | ||
| 671 | |||
| 672 | struct llm_build_rwkv6qwen2 : public llm_build_rwkv6_base { | ||
| 673 | llm_build_rwkv6qwen2(const llama_model & model, const llm_graph_params & params); | ||
| 674 | }; | ||
| 675 | |||
| 676 | struct llm_build_rwkv7 : public llm_build_rwkv7_base { | ||
| 677 | llm_build_rwkv7(const llama_model & model, const llm_graph_params & params); | ||
| 678 | }; | ||
| 679 | |||
| 680 | struct llm_build_seed_oss : public llm_graph_context { | ||
| 681 | llm_build_seed_oss(const llama_model & model, const llm_graph_params & params); | ||
| 682 | }; | ||
| 683 | |||
| 684 | template <bool iswa> | ||
| 685 | struct llm_build_smallthinker : public llm_graph_context { | ||
| 686 | llm_build_smallthinker(const llama_model & model, const llm_graph_params & params); | ||
| 687 | }; | ||
| 688 | |||
| 689 | struct llm_build_smollm3 : public llm_graph_context { | ||
| 690 | llm_build_smollm3(const llama_model & model, const llm_graph_params & params); | ||
| 691 | }; | ||
| 692 | |||
| 693 | struct llm_build_stablelm : public llm_graph_context { | ||
| 694 | llm_build_stablelm(const llama_model & model, const llm_graph_params & params); | ||
| 695 | }; | ||
| 696 | |||
| 697 | struct llm_build_starcoder2 : public llm_graph_context { | ||
| 698 | llm_build_starcoder2(const llama_model & model, const llm_graph_params & params); | ||
| 699 | }; | ||
| 700 | |||
| 701 | struct llm_build_starcoder : public llm_graph_context { | ||
| 702 | llm_build_starcoder(const llama_model & model, const llm_graph_params & params); | ||
| 703 | }; | ||
| 704 | |||
| 705 | struct llm_build_step35_iswa : public llm_graph_context { | ||
| 706 | llm_build_step35_iswa(const llama_model & model, const llm_graph_params & params); | ||
| 707 | }; | ||
| 708 | |||
| 709 | struct llm_build_t5_dec : public llm_graph_context { | ||
| 710 | llm_build_t5_dec(const llama_model & model, const llm_graph_params & params); | ||
| 711 | }; | ||
| 712 | |||
| 713 | struct llm_build_t5_enc : public llm_graph_context { | ||
| 714 | llm_build_t5_enc(const llama_model & model, const llm_graph_params & params); | ||
| 715 | }; | ||
| 716 | |||
| 717 | struct llm_build_wavtokenizer_dec : public llm_graph_context { | ||
| 718 | llm_build_wavtokenizer_dec(const llama_model & model, const llm_graph_params & params); | ||
| 719 | }; | ||
| 720 | |||
| 721 | struct llm_build_xverse : public llm_graph_context { | ||
| 722 | llm_build_xverse(const llama_model & model, const llm_graph_params & params); | ||
| 723 | }; | ||
