#pragma once

#include "llama.h"

#include <array>
#include <cassert>
#include <type_traits>

// bump if necessary
#define LLAMA_MAX_LAYERS  512
#define LLAMA_MAX_EXPERTS 512 // Qwen3 Next

enum llama_expert_gating_func_type {
    LLAMA_EXPERT_GATING_FUNC_TYPE_NONE           = 0,
    LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX        = 1,
    LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID        = 2,
    LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX_WEIGHT = 3, // applied to the router weights instead of the logits
};

enum llama_swa_type {
    LLAMA_SWA_TYPE_NONE      = 0,
    LLAMA_SWA_TYPE_STANDARD  = 1,
    LLAMA_SWA_TYPE_CHUNKED   = 2,
    LLAMA_SWA_TYPE_SYMMETRIC = 3,
};

struct llama_hparams_posnet {
    uint32_t n_embd;
    uint32_t n_layer;
};

struct llama_hparams_convnext {
    uint32_t n_embd;
    uint32_t n_layer;
};

struct llama_hparams {
    bool vocab_only;
    bool no_alloc;
    bool rope_finetuned;
    bool use_par_res;
    bool swin_norm;

    uint32_t n_ctx_train; // context size the model was trained on
    uint32_t n_embd;
    uint32_t n_layer;
    int32_t  n_layer_kv_from_start = -1; // if non-negative, the first n_layer_kv_from_start layers have KV cache
    uint32_t n_rot;
    uint32_t n_embd_head_k; // dimension of keys (d_k). d_q is assumed to be the same, but there are n_head q heads, and only n_head_kv k-v heads
    uint32_t n_embd_head_v; // dimension of values (d_v) aka n_embd_head
    uint32_t n_expert        = 0;
    uint32_t n_expert_used   = 0;
    uint32_t n_rel_attn_bkts = 0;

    // note: deepseek2 using MLA converts into MQA with larger heads, then decompresses to MHA
    uint32_t n_embd_head_k_mla_impl = 0;
    uint32_t n_embd_head_v_mla_impl = 0;

    // for WavTokenizer
    struct llama_hparams_posnet   posnet;
    struct llama_hparams_convnext convnext;

    uint32_t n_shortconv_l_cache = 0;

    std::array<uint32_t, LLAMA_MAX_LAYERS> n_head_arr;
    std::array<uint32_t, LLAMA_MAX_LAYERS> n_head_kv_arr;
    std::array<uint32_t, LLAMA_MAX_LAYERS> n_ff_arr;

    uint32_t n_layer_dense_lead = 0;
    uint32_t n_lora_q           = 0;
    uint32_t n_lora_kv          = 0;
    uint32_t n_ff_exp           = 0;
    uint32_t n_ff_shexp         = 0;
    uint32_t n_ff_chexp         = 0;
    uint32_t n_expert_shared    = 0;
    uint32_t n_norm_groups      = 0;
    uint32_t n_expert_groups    = 0;
    uint32_t n_group_used       = 0;
    uint32_t n_group_experts    = 0;

    float    expert_group_scale   = 0.05f;
    float    expert_weights_scale = 0.0f;
    bool     expert_weights_norm  = false;
    uint32_t expert_gating_func   = LLAMA_EXPERT_GATING_FUNC_TYPE_NONE;
    uint32_t moe_every_n_layers   = 0;
    uint32_t nextn_predict_layers = 0;

    float f_norm_eps;
    float f_norm_rms_eps;
    float f_norm_group_eps;

    float f_attn_logit_softcapping   = 50.0f;
    float f_router_logit_softcapping = 30.0f;
    float f_final_logit_softcapping  = 30.0f;
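    // note: a softcap value c above is typically applied as x -> c * tanh(x / c)
    //       (Gemma-2 style); how each cap is actually applied is up to the model's graph code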

    // for RWKV
    uint32_t rescale_every_n_layers = 0;
    uint32_t time_mix_extra_dim     = 0;
    uint32_t time_decay_extra_dim   = 0;
    uint32_t wkv_head_size          = 0;
    uint32_t token_shift_count      = 2;
    uint32_t n_lora_decay           = 0;
    uint32_t n_lora_iclr            = 0;
    uint32_t n_lora_value_res_mix   = 0;
    uint32_t n_lora_gate            = 0;

    float rope_attn_factor          = 1.0f;
    float rope_freq_base_train;
    float rope_freq_base_train_swa  = 10000.0f;
    float rope_freq_scale_train;
    float rope_freq_scale_train_swa = 1.0f;

    uint32_t n_ctx_orig_yarn;
    float    rope_yarn_log_mul = 0.0f;

    float yarn_ext_factor  = -1.0f;
    float yarn_attn_factor =  1.0f;
    float yarn_beta_fast   = 32.0f;
    float yarn_beta_slow   =  1.0f;

    std::array<int, 4> rope_sections;

    // Sliding Window Attention (SWA)
    llama_swa_type swa_type = LLAMA_SWA_TYPE_NONE;
    // the size of the sliding window (0 - no SWA)
    uint32_t n_swa = 0;
    // if swa_layers[il] == 1, then layer il is SWA
    // if swa_layers[il] == 0, then layer il is dense (i.e. non-SWA)
    // by default, all layers are dense
    // note: using uint32_t type for compatibility reasons
    std::array<uint32_t, LLAMA_MAX_LAYERS> swa_layers;

    // for State Space Models
    uint32_t ssm_d_conv  = 0;
    uint32_t ssm_d_inner = 0;
    uint32_t ssm_d_state = 0;
    uint32_t ssm_dt_rank = 0;
    uint32_t ssm_n_group = 0;

    // for Kimi Linear KDA
    uint32_t n_embd_head_kda = 0;

    // for hybrid state space models
    std::array<bool, LLAMA_MAX_LAYERS> recurrent_layer_arr;

    bool ssm_dt_b_c_rms = false;

    float f_clamp_kqv      = 0.0f;
    float f_max_alibi_bias = 0.0f;
    float f_logit_scale    = 0.0f;

    // Additional scale factors (Granite/Granite MoE)
    float f_residual_scale  = 0.0f;
    float f_embedding_scale = 0.0f;
    float f_attention_scale = 0.0f;

    // grok-2
    float    f_attn_out_scale = 0.0f;
    uint32_t attn_temp_length = 0;

    bool causal_attn   = true;
    bool use_alibi     = false;
    bool attn_soft_cap = false;
    bool use_kq_norm   = false;

    // for Classifiers
    uint32_t n_cls_out = 1;

    // output embedding dimension (0 = use n_embd)
    uint32_t n_embd_out_impl = 0;

    // llama4 smallthinker
    uint32_t n_moe_layer_step        = 0;
    uint32_t n_no_rope_layer_step    = 4;
    uint32_t n_attn_temp_floor_scale = 0;
    float    f_attn_temp_scale       = 0.0f;
    float    f_attn_temp_offset      = 0.0f; // offset position index

    // gemma3n altup
    uint32_t n_altup      = 4;   // altup_num_inputs
    uint32_t i_altup_act  = 0;   // altup_active_idx
    uint32_t laurel_rank  = 64;
    uint32_t n_embd_altup = 256;

    // needed for sentence-transformers dense layers
    uint32_t dense_2_feat_in  = 0; // in_features of the 2_Dense
    uint32_t dense_2_feat_out = 0; // out_features of the 2_Dense
    uint32_t dense_3_feat_in  = 0; // in_features of the 3_Dense
    uint32_t dense_3_feat_out = 0; // out_features of the 3_Dense

    // xIELU
    std::array<float, LLAMA_MAX_LAYERS> xielu_alpha_n;
    std::array<float, LLAMA_MAX_LAYERS> xielu_alpha_p;
    std::array<float, LLAMA_MAX_LAYERS> xielu_beta;
    std::array<float, LLAMA_MAX_LAYERS> xielu_eps;

    // qwen3vl deepstack
    uint32_t n_deepstack_layers = 0;

    // needed by encoder-decoder models (e.g. T5, FLAN-T5)
    // ref: https://github.com/ggml-org/llama.cpp/pull/8141
    llama_token dec_start_token_id = LLAMA_TOKEN_NULL;
    uint32_t    dec_n_layer        = 0;

    enum llama_pooling_type      pooling_type            = LLAMA_POOLING_TYPE_NONE;
    enum llama_rope_type         rope_type               = LLAMA_ROPE_TYPE_NONE;
    enum llama_rope_scaling_type rope_scaling_type_train = LLAMA_ROPE_SCALING_TYPE_NONE;

    // Step35: optional per-layer clamps for (Swi)GLU
    std::array<float, LLAMA_MAX_LAYERS> swiglu_clamp_exp;   // clamping for expert FFN
    std::array<float, LLAMA_MAX_LAYERS> swiglu_clamp_shexp; // shared expert

    // this value n_pattern means that every nth layer is dense (i.e. non-SWA)
    // dense_first means whether the pattern starts with a dense layer
    // note that if n_pattern == 0, all layers are SWA
    //           if n_pattern == 1, all layers are dense
    // example 1: n_pattern = 3, dense_first = false
    //   il == 0: swa
    //   il == 1: swa
    //   il == 2: dense
    //   il == 3: swa
    //   il == 4: swa
    //   il == 5: dense
    //   il == 6: swa
    //   etc ...
    // example 2: n_pattern = 2, dense_first = true
    //   il == 0: dense
    //   il == 1: swa
    //   il == 2: dense
    //   il == 3: swa
    //   etc ...
    void set_swa_pattern(uint32_t n_pattern, bool dense_first = false);
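    // a minimal sketch of a fill that matches the description above
    // (illustrative only; the actual definition lives in the implementation file):
    //
    //   for (uint32_t il = 0; il < n_layer; ++il) {
    //       swa_layers[il] = dense_first
    //           ? (n_pattern == 0 || il % n_pattern != 0)
    //           : (n_pattern == 0 || il % n_pattern <  n_pattern - 1);
    //   }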

    // return true if one of the layers is SWA
    bool is_swa_any() const;

    uint32_t n_head(uint32_t il = 0) const;

    uint32_t n_head_kv(uint32_t il = 0) const;

    uint32_t n_ff(uint32_t il = 0) const;

    uint32_t n_gqa(uint32_t il = 0) const;

    // dimension of main + auxiliary input embeddings
    uint32_t n_embd_inp() const;

    // dimension of output embeddings
    uint32_t n_embd_out() const;

    // dimension of key embeddings across all k-v heads
    uint32_t n_embd_k_gqa(uint32_t il = 0) const;

    // dimension of value embeddings across all k-v heads
    uint32_t n_embd_v_gqa(uint32_t il = 0) const;

    // true if any layer has a different n_embd_k_gqa/n_embd_v_gqa
    bool is_n_embd_k_gqa_variable() const;
    bool is_n_embd_v_gqa_variable() const;

    // return the maximum n_embd_k_gqa/n_embd_v_gqa across all layers
    uint32_t n_embd_k_gqa_max() const;
    uint32_t n_embd_v_gqa_max() const;

    // dimension of the rolling state embeddings
    // corresponds to Mamba's conv_states size or RWKV's token_shift states size
    uint32_t n_embd_r() const;

    // dimension of the recurrent state embeddings
    uint32_t n_embd_s() const;

    // whether or not the given layer is recurrent (for hybrid models)
    bool is_recurrent(uint32_t il) const;

    uint32_t n_pos_per_embd() const;

    bool is_swa(uint32_t il) const;

    // note: currently only supported if either all or none of the layers are MLA
    bool is_mla() const;

    uint32_t n_embd_head_k_mla() const;
    uint32_t n_embd_head_v_mla() const;

    bool has_kv(uint32_t il) const;

    // number of layers for which has_kv() returns true
    uint32_t n_layer_kv() const;

    // note that this function uses different SWA parameters from those in the hparams
    // note: inlined on purpose for performance reasons
    // TODO: think of a better place for this function
    // TODO: pack the SWA params in a struct?
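    // e.g. (illustrative) with swa_type == LLAMA_SWA_TYPE_STANDARD and n_swa == 4,
    //      a query at p1 == 10 can attend to p0 in [7, 10]; any p0 <= 6 is masked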
    static bool is_masked_swa(uint32_t n_swa, llama_swa_type swa_type, llama_pos p0, llama_pos p1) {
        assert(p0 >= 0 && p1 >= 0);

        switch (swa_type) {
            case LLAMA_SWA_TYPE_NONE:
                {
                    // no sliding window - nothing is masked here
                } break;
            case LLAMA_SWA_TYPE_STANDARD:
                {
                    // mask if p0 falls outside the window of size n_swa ending at p1
                    if (p1 - p0 >= (int32_t) n_swa) {
                        return true;
                    }
                } break;
            case LLAMA_SWA_TYPE_CHUNKED:
                {
                    // mask if p0 lies before the start of the chunk containing p1
                    const llama_pos pos_chunk_start = (p1 / n_swa) * n_swa;

                    if (p0 < pos_chunk_start) {
                        return true;
                    }
                } break;
            case LLAMA_SWA_TYPE_SYMMETRIC:
                {
                    const int32_t half_n_swa = (int32_t) n_swa / 2;
                    const int32_t pos_diff   = p1 - p0;

                    // mask if outside the symmetric window
                    if (pos_diff < -half_n_swa || pos_diff > half_n_swa) {
                        return true;
                    }
                } break;
        }

        return false;
    }

    bool use_mrope() const;
};

static_assert(std::is_trivially_copyable<llama_hparams>::value, "llama_hparams must be trivially copyable");