llmnpc - llama.cpp/src/models/models.h

Path: llmnpc / llama.cpp / src / models / models.h (raw)
  1#pragma once
  2
  3#include "../llama-model.h"
  4#include "../llama-graph.h"
  5
  6// TODO: remove in follow-up PR - move to .cpp files
  7#include "../llama-memory-recurrent.h"
  8#include <cmath>
  9
 10struct llm_graph_context_mamba : public llm_graph_context {
 11    llm_graph_context_mamba(const llm_graph_params & params);
 12
 13    virtual ~llm_graph_context_mamba() = default;
 14
 15    ggml_tensor * build_mamba_layer(llm_graph_input_rs * inp, ggml_tensor * cur, const llama_model & model, const llama_ubatch & ubatch, int il);
 16    ggml_tensor * build_mamba2_layer(llm_graph_input_rs * inp, ggml_tensor * cur, const llama_model & model, const llama_ubatch & ubatch, int il) const;
 17
 18};
 19
 20// Base class for RWKV-related models
 21struct llm_build_rwkv6_base : public llm_graph_context {
 22    const llama_model & model;
 23
 24    llm_build_rwkv6_base(const llama_model & model, const llm_graph_params & params);
 25
 26    virtual ~llm_build_rwkv6_base() = default;
 27
 28    ggml_tensor * build_rwkv6_channel_mix(const llama_layer * layer,
 29                                          ggml_tensor *       cur,
 30                                          ggml_tensor *       x_prev,
 31                                          llm_arch            arch) const;
 32
 33    ggml_tensor * build_rwkv6_time_mix(llm_graph_input_rs * inp,
 34                                       ggml_tensor *        cur,
 35                                       ggml_tensor *        x_prev,
 36                                       const llama_ubatch & ubatch,
 37                                       int                  il) const;
 38};
 39
 40// Base class for RWKV7-related models
 41struct llm_build_rwkv7_base : public llm_graph_context {
 42    const llama_model & model;
 43
 44    llm_build_rwkv7_base(const llama_model & model, const llm_graph_params & params);
 45
 46    virtual ~llm_build_rwkv7_base() = default;
 47
 48    // RWKV7-specific graph building methods
 49    ggml_tensor * build_rwkv7_channel_mix(const llama_layer * layer,
 50                                          ggml_tensor *       cur,
 51                                          ggml_tensor *       x_prev,
 52                                          llm_arch            arch) const;
 53    ggml_tensor * build_rwkv7_time_mix(llm_graph_input_rs * inp,
 54                                       ggml_tensor *        cur,
 55                                       ggml_tensor *        x_prev,
 56                                       ggml_tensor *&       first_layer_value,
 57                                       const llama_ubatch & ubatch,
 58                                       int                  il) const;
 59};
 60
 61struct llm_build_afmoe : public llm_graph_context {
 62    llm_build_afmoe(const llama_model & model, const llm_graph_params & params);
 63};
 64
 65struct llm_build_apertus : public llm_graph_context {
 66    llm_build_apertus(const llama_model & model, const llm_graph_params & params);
 67};
 68
 69struct llm_build_arcee : public llm_graph_context {
 70    llm_build_arcee(const llama_model & model, const llm_graph_params & params);
 71};
 72
 73struct llm_build_arctic : public llm_graph_context {
 74    llm_build_arctic(const llama_model & model, const llm_graph_params & params);
 75};
 76
 77struct llm_build_arwkv7 : public llm_build_rwkv7_base {
 78    llm_build_arwkv7(const llama_model & model, const llm_graph_params & params);
 79};
 80
 81struct llm_build_baichuan : public llm_graph_context {
 82    llm_build_baichuan(const llama_model & model, const llm_graph_params & params);
 83};
 84
 85struct llm_build_bailingmoe2 : public llm_graph_context {
 86    llm_build_bailingmoe2(const llama_model & model, const llm_graph_params & params);
 87};
 88
 89struct llm_build_bailingmoe : public llm_graph_context {
 90    llm_build_bailingmoe(const llama_model & model, const llm_graph_params & params);
 91};
 92
 93struct llm_build_bert : public llm_graph_context {
 94    llm_build_bert(const llama_model & model, const llm_graph_params & params);
 95};
 96
 97struct llm_build_bitnet : public llm_graph_context {
 98    llm_build_bitnet(const llama_model & model, const llm_graph_params & params);
 99};
100
101struct llm_build_bloom : public llm_graph_context {
102    llm_build_bloom(const llama_model & model, const llm_graph_params & params);
103};
104
105struct llm_build_chameleon : public llm_graph_context {
106    llm_build_chameleon(const llama_model & model, const llm_graph_params & params);
107};
108
109struct llm_build_chatglm : public llm_graph_context {
110    llm_build_chatglm(const llama_model & model, const llm_graph_params & params);
111};
112
113struct llm_build_codeshell : public llm_graph_context {
114    llm_build_codeshell(const llama_model & model, const llm_graph_params & params);
115};
116
117struct llm_build_cogvlm : public llm_graph_context {
118    llm_build_cogvlm(const llama_model & model, const llm_graph_params & params);
119};
120
121struct llm_build_cohere2_iswa : public llm_graph_context {
122    llm_build_cohere2_iswa(const llama_model & model, const llm_graph_params & params);
123};
124
125struct llm_build_command_r : public llm_graph_context {
126    llm_build_command_r(const llama_model & model, const llm_graph_params & params);
127};
128
129struct llm_build_dbrx : public llm_graph_context {
130    llm_build_dbrx(const llama_model & model, const llm_graph_params & params);
131};
132
133struct llm_build_deci : public llm_graph_context {
134    llm_build_deci(const llama_model & model, const llm_graph_params & params);
135};
136
137struct llm_build_deepseek2 : public llm_graph_context {
138    llm_build_deepseek2(const llama_model & model, const llm_graph_params & params);
139};
140
141struct llm_build_deepseek : public llm_graph_context {
142    llm_build_deepseek(const llama_model & model, const llm_graph_params & params);
143};
144
145struct llm_build_dots1 : public llm_graph_context {
146    llm_build_dots1(const llama_model & model, const llm_graph_params & params);
147};
148
149struct llm_build_dream : public llm_graph_context {
150    llm_build_dream(const llama_model & model, const llm_graph_params & params);
151};
152
153struct llm_build_ernie4_5 : public llm_graph_context {
154    llm_build_ernie4_5(const llama_model & model, const llm_graph_params & params);
155};
156
157struct llm_build_ernie4_5_moe : public llm_graph_context {
158    llm_build_ernie4_5_moe(const llama_model & model, const llm_graph_params & params);
159};
160
161template <bool iswa>
162struct llm_build_exaone4 : public llm_graph_context {
163    llm_build_exaone4(const llama_model & model, const llm_graph_params & params);
164};
165
166struct llm_build_exaone : public llm_graph_context {
167    llm_build_exaone(const llama_model & model, const llm_graph_params & params);
168};
169
170struct llm_build_exaone_moe : public llm_graph_context {
171    llm_build_exaone_moe(const llama_model & model, const llm_graph_params & params);
172};
173
174struct llm_build_falcon : public llm_graph_context {
175    llm_build_falcon(const llama_model & model, const llm_graph_params & params);
176};
177
178struct llm_build_falcon_h1 : public llm_graph_context_mamba {
179    llm_build_falcon_h1(const llama_model & model, const llm_graph_params & params);
180};
181
182struct llm_build_gemma2_iswa : public llm_graph_context {
183    llm_build_gemma2_iswa(const llama_model & model, const llm_graph_params & params);
184};
185
186template <bool iswa>
187struct llm_build_gemma3 : public llm_graph_context {
188    llm_build_gemma3(const llama_model & model, const llm_graph_params & params);
189};
190
191struct llm_build_gemma3n_iswa : public llm_graph_context {
192    const llama_model & model;
193
194    const int64_t n_embd_head;
195    const int64_t n_embd_altup;
196    const int64_t n_altup;
197    const int     i_altup_act;
198    const int     n_layer_sparsity = 10; // number of layers using activation sparsity
199    const float   f_sparsity_std_mul = 1.6448533535003662f; // std_multiplier = normal_dist.icdf(0.95)
200
201    llm_build_gemma3n_iswa(const llama_model & model, const llm_graph_params & params);
202    ggml_tensor * calc_magnitude(ggml_tensor * x);
203    ggml_tensor * view_2d_slice(ggml_tensor * x, int idx);
204    ggml_tensor * get_per_layer_inputs();
205    ggml_tensor * project_per_layer_inputs(ggml_tensor * inputs_embeds, ggml_tensor * inp_per_layer);
206    ggml_tensor * gaussian_topk(ggml_tensor * x);
207    ggml_tensor * altup_compute_router_modalities(ggml_tensor * x, int il);
208    ggml_tensor * altup_predict(ggml_tensor * cur, int il);
209    ggml_tensor * laurel(ggml_tensor * cur, int il);
210    ggml_tensor * altup_correct(ggml_tensor * predictions, ggml_tensor * activated, int il);
211};
212
213struct llm_build_gemma_embedding : public llm_graph_context {
214    llm_build_gemma_embedding(const llama_model & model, const llm_graph_params & params);
215};
216
217struct llm_build_gemma : public llm_graph_context {
218    llm_build_gemma(const llama_model & model, const llm_graph_params & params);
219};
220
221struct llm_build_glm4 : public llm_graph_context {
222    llm_build_glm4(const llama_model & model, const llm_graph_params & params);
223};
224
225struct llm_build_glm4_moe : public llm_graph_context {
226    llm_build_glm4_moe(const llama_model & model, const llm_graph_params & params);
227};
228
229struct llm_build_gpt2 : public llm_graph_context {
230    llm_build_gpt2(const llama_model & model, const llm_graph_params & params);
231};
232
233struct llm_build_gptneox : public llm_graph_context {
234    llm_build_gptneox(const llama_model & model, const llm_graph_params & params);
235};
236
237struct llm_build_granite : public llm_graph_context {
238    llm_build_granite(const llama_model & model, const llm_graph_params & params);
239
240private:
241    ggml_tensor * build_attention_layer(
242              ggml_tensor             * cur,
243              ggml_tensor             * inp_pos,
244              llm_graph_input_attn_kv * inp_attn,
245        const llama_model             & model,
246        const int64_t                 n_embd_head,
247        const int                     il);
248
249    ggml_tensor * build_layer_ffn(
250              ggml_tensor       * cur,
251              ggml_tensor       * inpSA,
252        const llama_model       & model,
253        const int                 il);
254};
255
256struct llm_build_granite_hybrid : public llm_graph_context_mamba {
257    llm_build_granite_hybrid(const llama_model & model, const llm_graph_params & params);
258    ggml_tensor * build_layer_ffn(ggml_tensor * cur, ggml_tensor * inpSA, const llama_model & model, const int il);
259    ggml_tensor * build_attention_layer(ggml_tensor * cur, ggml_tensor * inp_pos, llm_graph_input_attn_kv * inp_attn,
260        const llama_model & model,const int64_t n_embd_head, const int il);
261};
262
263struct llm_build_grok : public llm_graph_context {
264    llm_build_grok(const llama_model & model, const llm_graph_params & params);
265};
266
267struct llm_build_grovemoe : public llm_graph_context {
268    llm_build_grovemoe(const llama_model & model, const llm_graph_params & params);
269};
270
271struct llm_build_hunyuan_dense : public llm_graph_context {
272    llm_build_hunyuan_dense(const llama_model & model, const llm_graph_params & params);
273};
274
275struct llm_build_hunyuan_moe : public llm_graph_context {
276    llm_build_hunyuan_moe(const llama_model & model, const llm_graph_params & params);
277};
278
279struct llm_build_internlm2 : public llm_graph_context {
280    llm_build_internlm2(const llama_model & model, const llm_graph_params & params);
281};
282
283struct llm_build_jais : public llm_graph_context {
284    llm_build_jais(const llama_model & model, const llm_graph_params & params);
285};
286
287struct llm_build_jamba : public llm_graph_context_mamba {
288    llm_build_jamba(const llama_model & model, const llm_graph_params & params);
289};
290
291struct llm_build_kimi_linear : public llm_graph_context_mamba {
292    llm_build_kimi_linear(const llama_model & model, const llm_graph_params & params);
293
294    std::pair<ggml_tensor *, ggml_tensor *> build_kda_autoregressive(
295                ggml_tensor * q,
296                ggml_tensor * k,
297                ggml_tensor * v,
298                ggml_tensor * gk,
299                ggml_tensor * beta,
300                ggml_tensor * state,
301                        int   il);
302
303    std::pair<ggml_tensor *, ggml_tensor *> build_kda_chunking(
304                ggml_tensor * q,
305                ggml_tensor * k,
306                ggml_tensor * v,
307                ggml_tensor * gk,
308                ggml_tensor * beta,
309                ggml_tensor * state,
310                ggml_tensor * causal_mask,
311                ggml_tensor * identity,
312                ggml_tensor * diag_mask,
313                        int   il);
314
315    const llama_model & model;
316};
317
318struct llm_build_lfm2 : public llm_graph_context {
319    const llama_model & model;
320
321    llm_build_lfm2(const llama_model & model, const llm_graph_params & params);
322    ggml_tensor * build_moe_feed_forward(ggml_tensor * cur, int il) const;
323    ggml_tensor * build_dense_feed_forward(ggml_tensor * cur, int il) const;
324    ggml_tensor * build_attn_block(ggml_tensor * cur, ggml_tensor * inp_pos, llm_graph_input_attn_kv * inp_attn, int il) const;
325    ggml_tensor * build_shortconv_block(ggml_tensor * cur, llm_graph_input_rs * inp_recr, int il);
326
327};
328
329struct llm_build_llada : public llm_graph_context {
330    llm_build_llada(const llama_model & model, const llm_graph_params & params);
331};
332
333struct llm_build_llada_moe : public llm_graph_context {
334    llm_build_llada_moe(const llama_model & model, const llm_graph_params & params);
335};
336
337template <bool embed>
338struct llm_build_llama : public llm_graph_context {
339    llm_build_llama(const llama_model & model, const llm_graph_params & params);
340};
341
342struct llm_build_llama_iswa : public llm_graph_context {
343    llm_build_llama_iswa(const llama_model & model, const llm_graph_params & params);
344};
345
346struct llm_build_maincoder : public llm_graph_context {
347    llm_build_maincoder(const llama_model & model, const llm_graph_params & params);
348};
349
350struct llm_build_mamba : public llm_graph_context_mamba {
351    llm_build_mamba(const llama_model & model, const llm_graph_params & params);
352};
353
354struct llm_build_mimo2_iswa : public llm_graph_context {
355    llm_build_mimo2_iswa(const llama_model & model, const llm_graph_params & params);
356};
357
358struct llm_build_minicpm3 : public llm_graph_context {
359    llm_build_minicpm3(const llama_model & model, const llm_graph_params & params);
360};
361
362struct llm_build_minimax_m2 : public llm_graph_context {
363    llm_build_minimax_m2(const llama_model & model, const llm_graph_params & params);
364};
365
366struct llm_build_mistral3 : public llm_graph_context {
367    llm_build_mistral3(const llama_model & model, const llm_graph_params & params);
368};
369
370struct llm_build_modern_bert : public llm_graph_context {
371    llm_build_modern_bert(const llama_model & model, const llm_graph_params & params);
372};
373
374struct llm_build_mpt : public llm_graph_context {
375    llm_build_mpt(const llama_model & model, const llm_graph_params & params);
376};
377
378struct llm_build_nemotron : public llm_graph_context {
379    llm_build_nemotron(const llama_model & model, const llm_graph_params & params);
380};
381
382struct llm_build_nemotron_h : public llm_graph_context_mamba {
383    llm_build_nemotron_h(const llama_model & model, const llm_graph_params & params);
384    ggml_tensor * build_ffn_layer(ggml_tensor * cur, const llama_model & model, const int il);
385    ggml_tensor * build_attention_layer(ggml_tensor * cur, llm_graph_input_attn_kv * inp_attn,
386        const llama_model & model, const int64_t n_embd_head, const int il);
387};
388
389struct llm_build_neo_bert : public llm_graph_context {
390    llm_build_neo_bert(const llama_model & model, const llm_graph_params & params);
391};
392
393template <bool iswa>
394struct llm_build_olmo2 : public llm_graph_context {
395    llm_build_olmo2(const llama_model & model, const llm_graph_params & params);
396};
397
398struct llm_build_olmoe : public llm_graph_context {
399    llm_build_olmoe(const llama_model & model, const llm_graph_params & params);
400};
401
402struct llm_build_olmo : public llm_graph_context {
403    llm_build_olmo(const llama_model & model, const llm_graph_params & params);
404};
405
406struct llm_build_openai_moe_iswa : public llm_graph_context {
407    llm_build_openai_moe_iswa(const llama_model & model, const llm_graph_params & params);
408};
409
410struct llm_build_openelm : public llm_graph_context {
411    llm_build_openelm(const llama_model & model, const llm_graph_params & params);
412};
413
414struct llm_build_orion : public llm_graph_context {
415    llm_build_orion(const llama_model & model, const llm_graph_params & params);
416};
417
418struct llm_build_pangu_embedded : public llm_graph_context {
419    llm_build_pangu_embedded(const llama_model & model, const llm_graph_params & params);
420};
421
422struct llm_build_phi2 : public llm_graph_context {
423    llm_build_phi2(const llama_model & model, const llm_graph_params & params);
424};
425
426template<bool iswa>
427struct llm_build_phi3 : public llm_graph_context {
428    llm_build_phi3(const llama_model & model, const llm_graph_params & params);
429};
430
431struct llm_build_plamo2 : public llm_graph_context_mamba {
432    llm_build_plamo2(const llama_model & model, const llm_graph_params & params);
433    private:
434        ggml_tensor * build_plamo2_mamba_layer(llm_graph_input_rs * inp, ggml_tensor * cur, const llama_model & model, const llama_ubatch & ubatch, int il);
435        ggml_tensor * build_plamo2_attn_layer(llm_graph_input_attn_kv * inp, ggml_tensor * inp_pos, ggml_tensor * cur,
436                                                const llama_model & model, int il);
437};
438
439struct llm_build_plamo : public llm_graph_context {
440    llm_build_plamo(const llama_model & model, const llm_graph_params & params);
441};
442
443template <bool iswa>
444struct llm_build_plamo3 : public llm_graph_context {
445    llm_build_plamo3(const llama_model & model, const llm_graph_params & params);
446};
447
448struct llm_build_plm : public llm_graph_context {
449    llm_build_plm(const llama_model & model, const llm_graph_params & params);
450};
451
452struct llm_build_qwen2 : public llm_graph_context {
453    llm_build_qwen2(const llama_model & model, const llm_graph_params & params);
454};
455
456struct llm_build_qwen2moe : public llm_graph_context {
457    llm_build_qwen2moe(const llama_model & model, const llm_graph_params & params);
458};
459
460struct llm_build_qwen2vl : public llm_graph_context {
461    llm_build_qwen2vl(const llama_model & model, const llm_graph_params & params);
462};
463
464struct llm_build_qwen3 : public llm_graph_context {
465    llm_build_qwen3(const llama_model & model, const llm_graph_params & params);
466};
467
468struct llm_build_qwen3moe : public llm_graph_context {
469    llm_build_qwen3moe(const llama_model & model, const llm_graph_params & params);
470};
471
472struct llm_build_qwen3vl : public llm_graph_context {
473    llm_build_qwen3vl(const llama_model & model, const llm_graph_params & params);
474};
475
476struct llm_build_qwen3vlmoe : public llm_graph_context {
477    llm_build_qwen3vlmoe(const llama_model & model, const llm_graph_params & params);
478};
479
480struct llm_build_qwen3next : public llm_graph_context_mamba {
481    llm_build_qwen3next(const llama_model & model, const llm_graph_params & params);
482private:
483    ggml_tensor * build_layer_attn(
484    llm_graph_input_attn_kv * inp_attn,
485                ggml_tensor * cur,
486                ggml_tensor * inp_pos,
487                        int   il);
488
489    ggml_tensor * build_layer_attn_linear(
490         llm_graph_input_rs * inp,
491                ggml_tensor * cur,
492                ggml_tensor * causal_mask,
493                ggml_tensor * identity,
494                ggml_tensor * diag_mask,
495                        int   il);
496
497    ggml_tensor * build_layer_ffn(
498                ggml_tensor * cur,
499                        int   il);
500
501    // returns pair of output and new state
502    std::pair<ggml_tensor *, ggml_tensor *> build_delta_net_chunking(
503                ggml_tensor * q,
504                ggml_tensor * k,
505                ggml_tensor * v,
506                ggml_tensor * g,
507                ggml_tensor * beta,
508                ggml_tensor * state,
509                ggml_tensor * causal_mask,
510                ggml_tensor * identity,
511                ggml_tensor * diag_mask,
512                        int   il);
513
514    // returns pair of output and new state
515    std::pair<ggml_tensor *, ggml_tensor *> build_delta_net_autoregressive(
516                ggml_tensor * q,
517                ggml_tensor * k,
518                ggml_tensor * v,
519                ggml_tensor * g,
520                ggml_tensor * beta,
521                ggml_tensor * state,
522                int           il);
523
524    ggml_tensor * build_norm_gated(
525                ggml_tensor * input,
526                ggml_tensor * weights,
527                ggml_tensor * gate,
528                        int   layer);
529
530    // returns pair of qkv, z
531    std::pair<ggml_tensor *, ggml_tensor *> build_qkvz(
532                ggml_tensor * input,
533                        int   il);
534
535    const llama_model & model;
536};
537
538struct llm_build_qwen35 : public llm_graph_context_mamba {
539    llm_build_qwen35(const llama_model & model, const llm_graph_params & params);
540private:
541    ggml_tensor * build_layer_attn(
542    llm_graph_input_attn_kv * inp_attn,
543                ggml_tensor * cur,
544                ggml_tensor * inp_pos,
545                        int * sections,
546                        int   il);
547
548    ggml_tensor * build_layer_attn_linear(
549         llm_graph_input_rs * inp,
550                ggml_tensor * cur,
551                ggml_tensor * causal_mask,
552                ggml_tensor * identity,
553                ggml_tensor * diag_mask,
554                        int   il);
555
556    ggml_tensor * build_layer_ffn(
557                ggml_tensor * cur,
558                        int   il);
559
560    // returns pair of output and new state
561    std::pair<ggml_tensor *, ggml_tensor *> build_delta_net_chunking(
562                ggml_tensor * q,
563                ggml_tensor * k,
564                ggml_tensor * v,
565                ggml_tensor * g,
566                ggml_tensor * beta,
567                ggml_tensor * state,
568                ggml_tensor * causal_mask,
569                ggml_tensor * identity,
570                ggml_tensor * diag_mask,
571                        int   il);
572
573    // returns pair of output and new state
574    std::pair<ggml_tensor *, ggml_tensor *> build_delta_net_autoregressive(
575                ggml_tensor * q,
576                ggml_tensor * k,
577                ggml_tensor * v,
578                ggml_tensor * g,
579                ggml_tensor * beta,
580                ggml_tensor * state,
581                int           il);
582
583    ggml_tensor * build_norm_gated(
584                ggml_tensor * input,
585                ggml_tensor * weights,
586                ggml_tensor * gate,
587                        int   layer);
588
589    // returns pair of qkv, z
590    std::pair<ggml_tensor *, ggml_tensor *> build_qkvz(
591                ggml_tensor * input,
592                        int   il);
593
594    const llama_model & model;
595};
596
597struct llm_build_qwen35moe : public llm_graph_context_mamba {
598    llm_build_qwen35moe(const llama_model & model, const llm_graph_params & params);
599private:
600    ggml_tensor * build_layer_attn(
601    llm_graph_input_attn_kv * inp_attn,
602                ggml_tensor * cur,
603                ggml_tensor * inp_pos,
604                        int * sections,
605                        int   il);
606
607    ggml_tensor * build_layer_attn_linear(
608         llm_graph_input_rs * inp,
609                ggml_tensor * cur,
610                ggml_tensor * causal_mask,
611                ggml_tensor * identity,
612                ggml_tensor * diag_mask,
613                        int   il);
614
615    ggml_tensor * build_layer_ffn(
616                ggml_tensor * cur,
617                        int   il);
618
619    // returns pair of output and new state
620    std::pair<ggml_tensor *, ggml_tensor *> build_delta_net_chunking(
621                ggml_tensor * q,
622                ggml_tensor * k,
623                ggml_tensor * v,
624                ggml_tensor * g,
625                ggml_tensor * beta,
626                ggml_tensor * state,
627                ggml_tensor * causal_mask,
628                ggml_tensor * identity,
629                ggml_tensor * diag_mask,
630                        int   il);
631
632    // returns pair of output and new state
633    std::pair<ggml_tensor *, ggml_tensor *> build_delta_net_autoregressive(
634                ggml_tensor * q,
635                ggml_tensor * k,
636                ggml_tensor * v,
637                ggml_tensor * g,
638                ggml_tensor * beta,
639                ggml_tensor * state,
640                int           il);
641
642    ggml_tensor * build_norm_gated(
643                ggml_tensor * input,
644                ggml_tensor * weights,
645                ggml_tensor * gate,
646                        int   layer);
647
648    // returns pair of qkv, z
649    std::pair<ggml_tensor *, ggml_tensor *> build_qkvz(
650                ggml_tensor * input,
651                        int   il);
652
653    const llama_model & model;
654};
655
656struct llm_build_qwen : public llm_graph_context {
657    llm_build_qwen(const llama_model & model, const llm_graph_params & params);
658};
659
660struct llm_build_refact : public llm_graph_context {
661    llm_build_refact(const llama_model & model, const llm_graph_params & params);
662};
663
664struct llm_build_rnd1 : public llm_graph_context {
665    llm_build_rnd1(const llama_model & model, const llm_graph_params & params);
666};
667
668struct llm_build_rwkv6 : public llm_build_rwkv6_base {
669    llm_build_rwkv6(const llama_model & model, const llm_graph_params & params);
670};
671
672struct llm_build_rwkv6qwen2 : public llm_build_rwkv6_base {
673    llm_build_rwkv6qwen2(const llama_model & model, const llm_graph_params & params);
674};
675
676struct llm_build_rwkv7 : public llm_build_rwkv7_base {
677    llm_build_rwkv7(const llama_model & model, const llm_graph_params & params);
678};
679
680struct llm_build_seed_oss : public llm_graph_context {
681    llm_build_seed_oss(const llama_model & model, const llm_graph_params & params);
682};
683
684template <bool iswa>
685struct llm_build_smallthinker : public llm_graph_context {
686    llm_build_smallthinker(const llama_model & model, const llm_graph_params & params);
687};
688
689struct llm_build_smollm3 : public llm_graph_context {
690    llm_build_smollm3(const llama_model & model, const llm_graph_params & params);
691};
692
693struct llm_build_stablelm : public llm_graph_context {
694    llm_build_stablelm(const llama_model & model, const llm_graph_params & params);
695};
696
697struct llm_build_starcoder2 : public llm_graph_context {
698    llm_build_starcoder2(const llama_model & model, const llm_graph_params & params);
699};
700
701struct llm_build_starcoder : public llm_graph_context {
702    llm_build_starcoder(const llama_model & model, const llm_graph_params & params);
703};
704
705struct llm_build_step35_iswa : public llm_graph_context {
706    llm_build_step35_iswa(const llama_model & model, const llm_graph_params & params);
707};
708
709struct llm_build_t5_dec : public llm_graph_context {
710    llm_build_t5_dec(const llama_model & model, const llm_graph_params & params);
711};
712
713struct llm_build_t5_enc : public llm_graph_context {
714    llm_build_t5_enc(const llama_model & model, const llm_graph_params & params);
715};
716
717struct llm_build_wavtokenizer_dec : public llm_graph_context {
718    llm_build_wavtokenizer_dec(const llama_model & model, const llm_graph_params & params);
719};
720
721struct llm_build_xverse : public llm_graph_context {
722    llm_build_xverse(const llama_model & model, const llm_graph_params & params);
723};