1#pragma once
2
3#include "llama.h"
4#include "llama-arch.h"
5#include "llama-graph.h"
6#include "llama-hparams.h"
7#include "llama-memory.h"
8#include "llama-vocab.h"
9
10#include <map>
11#include <memory>
12#include <string>
13#include <unordered_map>
14#include <unordered_set>
15#include <vector>
16
17struct llama_cparams;
18struct llama_ubatch;
19struct llama_model_loader;
20
21// available models
// available models
//
// Identifies the parameter-count "size class" of a loaded model, set during
// hparam loading and used for logging / heuristics. Values are ordered
// roughly by size; do not reorder — the implicit enumerator values matter
// for existing switch logic and comparisons elsewhere in the project.
// Naming conventions:
//   LLM_TYPE_<N>M / <N>B  - dense models with ~N million / billion params
//   LLM_TYPE_<N>x<M>B     - MoE with N experts of ~M billion params each
//   LLM_TYPE_<T>B_A<A>B   - MoE with ~T billion total, ~A billion active params
enum llm_type {
    LLM_TYPE_UNKNOWN,
    LLM_TYPE_14M,
    LLM_TYPE_17M,
    LLM_TYPE_22M,
    LLM_TYPE_33M,
    LLM_TYPE_47M,
    LLM_TYPE_60M,
    LLM_TYPE_70M,
    LLM_TYPE_80M,
    LLM_TYPE_109M,
    LLM_TYPE_137M,
    LLM_TYPE_140M,
    LLM_TYPE_149M,
    LLM_TYPE_160M,
    LLM_TYPE_190M,
    LLM_TYPE_220M,
    LLM_TYPE_250M,
    LLM_TYPE_256M,
    LLM_TYPE_270M,
    LLM_TYPE_335M,
    LLM_TYPE_350M,
    LLM_TYPE_360M,
    LLM_TYPE_395M,
    LLM_TYPE_410M,
    LLM_TYPE_450M,
    LLM_TYPE_475M,
    LLM_TYPE_558M,
    LLM_TYPE_700M,
    LLM_TYPE_770M,
    LLM_TYPE_780M,
    LLM_TYPE_950M,
    LLM_TYPE_0_3B,
    LLM_TYPE_0_5B,
    LLM_TYPE_0_6B,
    LLM_TYPE_1B,
    LLM_TYPE_1_2B,
    LLM_TYPE_1_3B,
    LLM_TYPE_1_4B,
    LLM_TYPE_1_5B,
    LLM_TYPE_1_6B,
    LLM_TYPE_1_7B,
    LLM_TYPE_1_8B,
    LLM_TYPE_2B,
    LLM_TYPE_2_6B,
    LLM_TYPE_2_8B,
    LLM_TYPE_2_9B,
    LLM_TYPE_3B,
    LLM_TYPE_4B,
    LLM_TYPE_6B,
    LLM_TYPE_6_9B,
    LLM_TYPE_7B,
    LLM_TYPE_8B,
    LLM_TYPE_9B,
    LLM_TYPE_11B,
    LLM_TYPE_12B,
    LLM_TYPE_13B,
    LLM_TYPE_14B,
    LLM_TYPE_15B,
    LLM_TYPE_16B,
    LLM_TYPE_20B,
    LLM_TYPE_26B,
    LLM_TYPE_27B,
    LLM_TYPE_30B,
    LLM_TYPE_32B,
    LLM_TYPE_34B,
    LLM_TYPE_35B,
    LLM_TYPE_36B,
    LLM_TYPE_40B,
    LLM_TYPE_65B,
    LLM_TYPE_70B,
    LLM_TYPE_120B,
    LLM_TYPE_142B,
    LLM_TYPE_236B,
    LLM_TYPE_290B,
    LLM_TYPE_314B,
    LLM_TYPE_405B,
    LLM_TYPE_671B,
    // size classes without a fixed parameter count
    LLM_TYPE_SMALL,
    LLM_TYPE_MEDIUM,
    LLM_TYPE_LARGE,
    LLM_TYPE_XL,
    LLM_TYPE_A1_7B,
    LLM_TYPE_A2_7B,
    LLM_TYPE_8x7B,
    LLM_TYPE_8x22B,
    LLM_TYPE_16x12B,
    LLM_TYPE_16x3_8B,
    LLM_TYPE_10B_128x3_66B,
    LLM_TYPE_57B_A14B,
    LLM_TYPE_17B_16E,  // llama4 Scout
    LLM_TYPE_17B_128E, // llama4 Maverick
    LLM_TYPE_A13B,
    LLM_TYPE_7B_A1B,
    LLM_TYPE_8B_A1B, // lfm2moe
    LLM_TYPE_16B_A1B,
    LLM_TYPE_21B_A3B, // Ernie MoE small
    LLM_TYPE_30B_A3B,
    LLM_TYPE_31B_A3_5B,
    LLM_TYPE_35B_A3B,  // Qwen3.5
    LLM_TYPE_48B_A3B,  // Kimi Linear
    LLM_TYPE_80B_A3B,  // Qwen3 Next
    LLM_TYPE_100B_A6B,
    LLM_TYPE_102B_A12B, // Solar-Open
    LLM_TYPE_106B_A12B, // GLM-4.5-Air
    LLM_TYPE_196B_A11B, // Step3.5-Flash
    LLM_TYPE_230B_A10B, // Minimax M2
    LLM_TYPE_235B_A22B,
    LLM_TYPE_300B_A47B, // Ernie MoE big
    LLM_TYPE_310B_A15B, // MiMo-V2-Flash
    LLM_TYPE_355B_A32B, // GLM-4.5
    // Gemma 3n "effective" sizes
    LLM_TYPE_E2B,
    LLM_TYPE_E4B,
};
136
// human-readable name for a RoPE scaling type (for logging / metadata)
std::string llama_rope_scaling_type_name(llama_rope_scaling_type rope_scaling_type);
138
// per-layer weights of a PosNet block (used by audio/TTS-style models).
// All members are non-owning pointers into the model's weight buffers;
// a nullptr means the tensor is absent from the loaded model.
// Member order is part of the struct layout — do not reorder.
struct llama_layer_posnet {
    // resnet
    struct ggml_tensor * norm1 = nullptr;
    struct ggml_tensor * norm1_b = nullptr; // bias for norm1

    struct ggml_tensor * conv1 = nullptr;
    struct ggml_tensor * conv1_b = nullptr; // bias for conv1

    struct ggml_tensor * norm2 = nullptr;
    struct ggml_tensor * norm2_b = nullptr; // bias for norm2

    struct ggml_tensor * conv2 = nullptr;
    struct ggml_tensor * conv2_b = nullptr; // bias for conv2

    // attention
    struct ggml_tensor * attn_norm = nullptr;
    struct ggml_tensor * attn_norm_b = nullptr;

    struct ggml_tensor * attn_q = nullptr; // query projection
    struct ggml_tensor * attn_q_b = nullptr;

    struct ggml_tensor * attn_k = nullptr; // key projection
    struct ggml_tensor * attn_k_b = nullptr;

    struct ggml_tensor * attn_v = nullptr; // value projection
    struct ggml_tensor * attn_v_b = nullptr;

    struct ggml_tensor * attn_o = nullptr; // output projection
    struct ggml_tensor * attn_o_b = nullptr;

    // normalize
    struct ggml_tensor * norm = nullptr;
    struct ggml_tensor * norm_b = nullptr;
};
173
// per-layer weights of a ConvNeXt block (used by audio/TTS-style models).
// Non-owning pointers; nullptr means the tensor is absent.
struct llama_layer_convnext {
    struct ggml_tensor * dw = nullptr;   // depthwise conv
    struct ggml_tensor * dw_b = nullptr; // depthwise conv bias

    struct ggml_tensor * norm = nullptr;
    struct ggml_tensor * norm_b = nullptr;

    struct ggml_tensor * pw1 = nullptr;   // pointwise conv 1
    struct ggml_tensor * pw1_b = nullptr;

    struct ggml_tensor * pw2 = nullptr;   // pointwise conv 2
    struct ggml_tensor * pw2_b = nullptr;

    struct ggml_tensor * gamma = nullptr; // per-channel scale
};
189
// per-layer weights of a short-convolution block (LFM2-style layers).
// Non-owning pointers; nullptr means the tensor is absent.
struct llama_layer_shortconv {
    struct ggml_tensor * in_proj = nullptr;  // input projection
    struct ggml_tensor * conv = nullptr;     // short conv kernel
    struct ggml_tensor * out_proj = nullptr; // output projection
};
195
// per-layer weights of a NextN / MTP (multi-token prediction) head.
// Non-owning pointers; nullptr means the tensor is absent.
struct llama_layer_nextn {
    struct ggml_tensor * eh_proj = nullptr;          // embedding+hidden projection
    struct ggml_tensor * embed_tokens = nullptr;     // per-head token embeddings
    struct ggml_tensor * enorm = nullptr;            // embedding norm
    struct ggml_tensor * hnorm = nullptr;            // hidden-state norm
    struct ggml_tensor * shared_head_head = nullptr; // shared output head
    struct ggml_tensor * shared_head_norm = nullptr; // shared output head norm
};
204
// all possible per-layer weight tensors across every supported architecture.
// Each loaded model populates only the subset its architecture uses; the
// rest stay nullptr. All pointers are non-owning views into the model's
// weight buffers. Member order is part of the struct layout — append new
// tensors to the relevant section rather than reordering.
struct llama_layer {
    // normalization
    struct ggml_tensor * attn_norm = nullptr;
    struct ggml_tensor * attn_norm_b = nullptr;
    struct ggml_tensor * attn_norm_2 = nullptr;
    struct ggml_tensor * attn_norm_2_b = nullptr;
    struct ggml_tensor * attn_q_norm = nullptr;
    struct ggml_tensor * attn_q_norm_b = nullptr;
    struct ggml_tensor * attn_k_norm = nullptr;
    struct ggml_tensor * attn_k_norm_b = nullptr;
    struct ggml_tensor * attn_out_norm = nullptr;
    struct ggml_tensor * attn_out_norm_b = nullptr;
    struct ggml_tensor * attn_q_a_norm = nullptr;
    struct ggml_tensor * attn_kv_a_norm = nullptr;
    struct ggml_tensor * attn_sub_norm = nullptr;
    struct ggml_tensor * attn_post_norm = nullptr;
    struct ggml_tensor * ffn_sub_norm = nullptr;
    struct ggml_tensor * attn_norm_cross = nullptr; // cross-attention (enc-dec models)
    struct ggml_tensor * attn_norm_enc = nullptr;   // encoder-side (enc-dec models)
    struct ggml_tensor * ssm_norm = nullptr;
    struct ggml_tensor * ssm_dt_norm = nullptr;
    struct ggml_tensor * ssm_b_norm = nullptr;
    struct ggml_tensor * ssm_c_norm = nullptr;

    // attention
    struct ggml_tensor * wq = nullptr;        // query projection
    struct ggml_tensor * wk = nullptr;        // key projection
    struct ggml_tensor * wv = nullptr;        // value projection
    struct ggml_tensor * wo = nullptr;        // output projection
    struct ggml_tensor * wqkv = nullptr;      // fused QKV projection
    struct ggml_tensor * wq_a = nullptr;      // low-rank Q down-projection (MLA)
    struct ggml_tensor * wq_b = nullptr;      // low-rank Q up-projection (MLA)
    struct ggml_tensor * wkv_a_mqa = nullptr; // low-rank KV down-projection (MLA)
    struct ggml_tensor * wkv_b = nullptr;     // low-rank KV up-projection (MLA)
    struct ggml_tensor * wk_b = nullptr;
    struct ggml_tensor * wv_b = nullptr;
    struct ggml_tensor * wq_cross = nullptr;  // cross-attention projections (enc-dec)
    struct ggml_tensor * wk_cross = nullptr;
    struct ggml_tensor * wv_cross = nullptr;
    struct ggml_tensor * wo_cross = nullptr;
    struct ggml_tensor * wq_enc = nullptr;    // encoder-side projections (enc-dec)
    struct ggml_tensor * wk_enc = nullptr;
    struct ggml_tensor * wv_enc = nullptr;
    struct ggml_tensor * wo_enc = nullptr;
    struct ggml_tensor * wqkv_gate = nullptr;

    // attention bias
    struct ggml_tensor * bq = nullptr;
    struct ggml_tensor * bk = nullptr;
    struct ggml_tensor * bv = nullptr;
    struct ggml_tensor * bo = nullptr;
    struct ggml_tensor * bqkv = nullptr;

    // relative position bias
    struct ggml_tensor * attn_rel_b = nullptr;
    struct ggml_tensor * attn_rel_b_enc = nullptr;
    struct ggml_tensor * attn_rel_b_cross = nullptr;

    // normalization (feed-forward side)
    struct ggml_tensor * ffn_norm = nullptr;
    struct ggml_tensor * ffn_norm_b = nullptr;
    struct ggml_tensor * ffn_post_norm = nullptr;
    struct ggml_tensor * layer_out_norm = nullptr;
    struct ggml_tensor * layer_out_norm_b = nullptr;
    struct ggml_tensor * ffn_norm_exps = nullptr;
    struct ggml_tensor * ffn_norm_enc = nullptr;

    // ff
    struct ggml_tensor * ffn_gate = nullptr; // w1
    struct ggml_tensor * ffn_down = nullptr; // w2
    struct ggml_tensor * ffn_up = nullptr;   // w3
    struct ggml_tensor * ffn_gate_enc = nullptr;
    struct ggml_tensor * ffn_down_enc = nullptr;
    struct ggml_tensor * ffn_up_enc = nullptr;

    // ff MoE (routed experts; *_exps tensors are 3D, stacked across experts)
    struct ggml_tensor * ffn_gate_inp = nullptr; // expert router
    struct ggml_tensor * ffn_gate_exps = nullptr;
    struct ggml_tensor * ffn_down_exps = nullptr;
    struct ggml_tensor * ffn_up_exps = nullptr;
    struct ggml_tensor * ffn_gate_inp_b = nullptr;
    struct ggml_tensor * ffn_gate_exps_b = nullptr;
    struct ggml_tensor * ffn_down_exps_b = nullptr;
    struct ggml_tensor * ffn_up_exps_b = nullptr;

    // ff shared expert (shexp)
    struct ggml_tensor * ffn_gate_inp_shexp = nullptr;
    struct ggml_tensor * ffn_gate_shexp = nullptr;
    struct ggml_tensor * ffn_down_shexp = nullptr;
    struct ggml_tensor * ffn_up_shexp = nullptr;

    // ff adjugate experts (chexps)
    struct ggml_tensor * ffn_gate_chexps = nullptr;
    struct ggml_tensor * ffn_down_chexps = nullptr;
    struct ggml_tensor * ffn_up_chexps = nullptr;

    // ff bias
    struct ggml_tensor * ffn_gate_b = nullptr;
    struct ggml_tensor * ffn_down_b = nullptr; // b2
    struct ggml_tensor * ffn_up_b = nullptr;   // b3
    struct ggml_tensor * ffn_act = nullptr;
    struct ggml_tensor * ffn_exp_probs_b = nullptr;

    // mamba proj
    struct ggml_tensor * ssm_in = nullptr;
    struct ggml_tensor * ssm_x = nullptr;
    struct ggml_tensor * ssm_dt = nullptr;
    struct ggml_tensor * ssm_out = nullptr;

    // mamba
    struct ggml_tensor * ssm_conv1d = nullptr;
    struct ggml_tensor * ssm_a = nullptr;
    struct ggml_tensor * ssm_d = nullptr;

    // mamba bias
    struct ggml_tensor * ssm_conv1d_b = nullptr;
    struct ggml_tensor * ssm_dt_b = nullptr;

    // qwen3next
    struct ggml_tensor * ssm_beta_alpha = nullptr;

    // qwen3.5
    struct ggml_tensor * ssm_alpha = nullptr;

    // rwkv (time-mix = attention-like token mixing)
    struct ggml_tensor * time_mix_w1 = nullptr;
    struct ggml_tensor * time_mix_w2 = nullptr;
    struct ggml_tensor * time_mix_lerp_x = nullptr;
    struct ggml_tensor * time_mix_lerp_w = nullptr;
    struct ggml_tensor * time_mix_lerp_k = nullptr;
    struct ggml_tensor * time_mix_lerp_v = nullptr;
    struct ggml_tensor * time_mix_lerp_r = nullptr;
    struct ggml_tensor * time_mix_lerp_g = nullptr;
    struct ggml_tensor * time_mix_lerp_fused = nullptr;

    struct ggml_tensor * time_mix_first = nullptr;
    struct ggml_tensor * time_mix_decay = nullptr;
    struct ggml_tensor * time_mix_decay_w1 = nullptr;
    struct ggml_tensor * time_mix_decay_w2 = nullptr;
    struct ggml_tensor * time_mix_key = nullptr;
    struct ggml_tensor * time_mix_key_b = nullptr;
    struct ggml_tensor * time_mix_value = nullptr;
    struct ggml_tensor * time_mix_value_b = nullptr;
    struct ggml_tensor * time_mix_receptance = nullptr;
    struct ggml_tensor * time_mix_receptance_b = nullptr;
    struct ggml_tensor * time_mix_gate = nullptr;

    // rwkv7
    struct ggml_tensor * time_mix_w0 = nullptr;
    struct ggml_tensor * time_mix_a0 = nullptr;
    struct ggml_tensor * time_mix_a1 = nullptr;
    struct ggml_tensor * time_mix_a2 = nullptr;
    struct ggml_tensor * time_mix_v0 = nullptr;
    struct ggml_tensor * time_mix_v1 = nullptr;
    struct ggml_tensor * time_mix_v2 = nullptr;
    struct ggml_tensor * time_mix_g1 = nullptr;
    struct ggml_tensor * time_mix_g2 = nullptr;
    struct ggml_tensor * time_mix_k_k = nullptr;
    struct ggml_tensor * time_mix_k_a = nullptr;
    struct ggml_tensor * time_mix_r_k = nullptr;

    struct ggml_tensor * time_mix_ln = nullptr;
    struct ggml_tensor * time_mix_ln_b = nullptr;
    struct ggml_tensor * time_mix_output = nullptr;

    // rwkv (channel-mix = feed-forward-like channel mixing)
    struct ggml_tensor * channel_mix_lerp_k = nullptr;
    struct ggml_tensor * channel_mix_lerp_r = nullptr;

    struct ggml_tensor * channel_mix_key = nullptr;
    struct ggml_tensor * channel_mix_receptance = nullptr;
    struct ggml_tensor * channel_mix_value = nullptr;

    // long rope factors
    struct ggml_tensor * rope_long = nullptr;
    struct ggml_tensor * rope_short = nullptr;
    struct ggml_tensor * rope_freqs = nullptr;

    // bitnet scale
    struct ggml_tensor * wq_scale = nullptr;
    struct ggml_tensor * wk_scale = nullptr;
    struct ggml_tensor * wv_scale = nullptr;
    struct ggml_tensor * wo_scale = nullptr;
    struct ggml_tensor * ffn_gate_scale = nullptr;
    struct ggml_tensor * ffn_up_scale = nullptr;
    struct ggml_tensor * ffn_down_scale = nullptr;

    // altup & laurel (gemma3n)
    struct ggml_tensor * per_layer_inp_gate = nullptr;
    struct ggml_tensor * per_layer_proj = nullptr;
    struct ggml_tensor * per_layer_post_norm = nullptr;
    struct ggml_tensor * altup_correct_coef = nullptr;
    struct ggml_tensor * altup_correct_scale = nullptr;
    struct ggml_tensor * altup_predict_coef = nullptr;
    struct ggml_tensor * altup_router = nullptr;
    struct ggml_tensor * altup_router_norm = nullptr;
    struct ggml_tensor * laurel_l = nullptr;
    struct ggml_tensor * laurel_r = nullptr;
    struct ggml_tensor * laurel_post_norm = nullptr;

    // openai-moe
    struct ggml_tensor * attn_sinks = nullptr;

    // cogvlm (separate "vision expert" weights)
    struct ggml_tensor * visexp_attn_wqkv = nullptr;
    struct ggml_tensor * visexp_attn_wo = nullptr;
    struct ggml_tensor * visexp_ffn_gate = nullptr;
    struct ggml_tensor * visexp_ffn_down = nullptr;
    struct ggml_tensor * visexp_ffn_up = nullptr;

    // xIELU activation parameters for Apertus
    struct ggml_tensor * ffn_act_alpha_n = nullptr;
    struct ggml_tensor * ffn_act_alpha_p = nullptr;
    struct ggml_tensor * ffn_act_beta = nullptr;
    struct ggml_tensor * ffn_act_eps = nullptr;

    // Kimi Linear KDA (using ssm_ prefix for consistency)
    // Note: ssm_dt_b already exists above (mamba bias), reused for Kimi dt_bias
    struct ggml_tensor * ssm_q_conv = nullptr;
    struct ggml_tensor * ssm_k_conv = nullptr;
    struct ggml_tensor * ssm_v_conv = nullptr;
    struct ggml_tensor * ssm_f_a = nullptr;
    struct ggml_tensor * ssm_f_b = nullptr;
    struct ggml_tensor * ssm_beta = nullptr;
    struct ggml_tensor * ssm_g_a = nullptr;
    struct ggml_tensor * ssm_g_b = nullptr;
    struct ggml_tensor * ssm_o_norm = nullptr;

    // sub-blocks for architectures that embed them per layer
    struct llama_layer_posnet posnet;

    struct llama_layer_convnext convnext;

    struct llama_layer_shortconv shortconv;

    struct llama_layer_nextn nextn;
};
440
// a fully loaded model: hyperparameters, vocabulary, and all weight tensors.
// Tensor members are non-owning pointers into backend buffers owned by the
// private impl; nullptr means the tensor is absent for this architecture.
struct llama_model {
    llm_type type = LLM_TYPE_UNKNOWN; // size class, set by load_hparams()
    llm_arch arch = LLM_ARCH_UNKNOWN; // architecture, set by load_arch()

    std::string name = "n/a"; // model name from gguf metadata

    llama_hparams hparams = {};
    llama_vocab vocab;

    // for classifier models
    std::vector<std::string> classifier_labels;

    // input embeddings
    struct ggml_tensor * tok_embd = nullptr;
    struct ggml_tensor * type_embd = nullptr;
    struct ggml_tensor * pos_embd = nullptr;
    struct ggml_tensor * tok_norm = nullptr;
    struct ggml_tensor * tok_norm_b = nullptr;

    // output head
    struct ggml_tensor * output_norm = nullptr;
    struct ggml_tensor * output_norm_b = nullptr;
    struct ggml_tensor * output = nullptr;
    struct ggml_tensor * output_b = nullptr;
    struct ggml_tensor * output_norm_enc = nullptr;

    // classifier
    struct ggml_tensor * cls = nullptr;
    struct ggml_tensor * cls_b = nullptr;
    struct ggml_tensor * cls_out = nullptr;
    struct ggml_tensor * cls_out_b = nullptr;

    struct ggml_tensor * conv1d = nullptr;
    struct ggml_tensor * conv1d_b = nullptr;

    // gemma3n altup
    struct ggml_tensor * tok_embd_per_layer = nullptr;
    struct ggml_tensor * altup_proj = nullptr;
    struct ggml_tensor * altup_unembd_proj = nullptr;
    struct ggml_tensor * per_layer_model_proj = nullptr;
    struct ggml_tensor * per_layer_proj_norm = nullptr;

    std::vector<llama_layer> layers; // one entry per transformer layer

    // Dense linear projections for SentenceTransformers models like embeddinggemma
    // For Sentence Transformers models structure see
    // https://sbert.net/docs/sentence_transformer/usage/custom_models.html#structure-of-sentence-transformer-models
    struct ggml_tensor * dense_2_out_layers = nullptr;
    struct ggml_tensor * dense_3_out_layers = nullptr;

    // gguf metadata
    std::unordered_map<std::string, std::string> gguf_kv;

    // list of devices used in this model
    std::vector<ggml_backend_dev_t> devices;

    // for quantize-stats only
    std::vector<std::pair<std::string, struct ggml_tensor *>> tensors_by_name;

    // for keeping track of associated LoRA adapters
    std::unordered_set<llama_adapter_lora *> loras;

    // load timing (microseconds)
    int64_t t_load_us = 0;
    int64_t t_start_us = 0;

    explicit llama_model(const struct llama_model_params & params);
    ~llama_model();

    // loading stages, called in this order by the model loader
    void load_stats  (llama_model_loader & ml);
    void load_arch   (llama_model_loader & ml);
    void load_hparams(llama_model_loader & ml);
    void load_vocab  (llama_model_loader & ml);
    bool load_tensors(llama_model_loader & ml); // returns false if cancelled by progress_callback

    std::string arch_name() const;
    std::string type_name() const;

    // one-line description (for logging)
    std::string desc() const;

    size_t size() const; // file size
    size_t n_tensors() const;
    size_t n_devices() const;

    uint32_t n_gpu_layers() const;
    llama_split_mode split_mode() const;

    // per-buffer-type memory usage (for reporting)
    std::map<ggml_backend_buffer_type_t, size_t> memory_breakdown() const;

    // total number of parameters in the model
    uint64_t n_elements() const;

    void print_info() const;

    // device/buffer placement for a given layer (il) and the output layer
    ggml_backend_dev_t dev_layer(int il) const;
    ggml_backend_dev_t dev_output() const;

    ggml_backend_buffer_type_t select_buft(int il) const;

    bool has_tensor_overrides() const;

    const struct ggml_tensor * get_tensor(const char * name) const;

    // RoPE parameters, possibly overridden per-layer by cparams
    float get_rope_freq_base (const llama_cparams & cparams, int il) const;
    float get_rope_freq_scale(const llama_cparams & cparams, int il) const;

    ggml_tensor * get_rope_factors(const llama_cparams & cparams, int il) const;

    // TODO: move this to new llm_arch_model_i interface
    llama_memory_i * create_memory(const llama_memory_params & params, const llama_cparams & cparams) const;

    // TODO: move this to new llm_arch_model_i interface
    ggml_cgraph * build_graph(const llm_graph_params & params) const;

private:
    llama_model_params params;

    // pimpl: keeps backend buffer/context details out of this header
    struct impl;
    std::unique_ptr<impl> pimpl;
};
558
// human-readable name for an llm_type (for logging / desc())
const char * llm_type_name(llm_type type);

// For internal test use
// TODO: remove
const std::vector<std::pair<std::string, ggml_tensor *>> & llama_internal_get_tensor_map(const llama_model * model);