1from __future__ import annotations
2
3from typing import Sequence
4
5from .constants import MODEL_ARCH, MODEL_TENSOR, MODEL_TENSORS, TENSOR_NAMES
6
7
8class TensorNameMap:
9 mappings_cfg: dict[MODEL_TENSOR, tuple[str, ...]] = {
10 # Token embeddings
11 MODEL_TENSOR.TOKEN_EMBD: (
12 "gpt_neox.embed_in", # gptneox
13 "transformer.wte", # gpt2 gpt-j mpt refact qwen dbrx jais exaone
14 "transformer.word_embeddings", # falcon
15 "word_embeddings", # bloom
16 "model.embed_tokens", # llama-hf nemotron olmoe olmo2 rwkv6qwen2 glm4-0414 plamo2 granite-hybrid
17 "embed_tokens", # embeddinggemma
18 "tok_embeddings", # llama-pth
19 "embeddings.word_embeddings", # bert nomic-bert
20 "embeddings.tok_embeddings", # modern-bert
21 "language_model.embedding.word_embeddings", # persimmon
22 "wte", # gpt2
23 "transformer.embd.wte", # phi2
24 "model.tok_embeddings", # internlm2
25 "model.embedding", # mamba-qbert
26 "backbone.embedding", # mamba
27 "backbone.embeddings", # mamba-hf
28 "transformer.in_out_embed", # Grok
29 "embedding.word_embeddings", # chatglm
30 "transformer.token_embeddings", # openelm
31 "shared", # t5
32 "rwkv.embeddings", # rwkv6
33 "model.embeddings", # rwkv7
34 "model.word_embeddings", # bailingmoe
35 "language_model.model.embed_tokens", # llama4
36 "encoder", # neobert
37 "model.transformer.wte", # llada
38 "embed_tokens", # qwen3-embedding
39 ),
40
41 # Token type embeddings
42 MODEL_TENSOR.TOKEN_TYPES: (
43 "embeddings.token_type_embeddings", # bert nomic-bert
44 ),
45
46 # Normalization of token embeddings
47 MODEL_TENSOR.TOKEN_EMBD_NORM: (
48 "word_embeddings_layernorm", # bloom
49 "embeddings.LayerNorm", # bert
50 "embeddings.norm", # modern-bert
51 "emb_ln", # nomic-bert
52 "transformer.norm", # openelm
53 "rwkv.blocks.0.pre_ln", # rwkv
54 "rwkv.blocks.0.pre_ln", # rwkv6
55 "model.pre_ln", # rwkv7
56 "model.layers.0.pre_norm", # rwkv7
57 "backbone.norm", # wavtokenizer
58 "model.embedding_norm", # lfm2
59 ),
60
61 # Position embeddings
62 MODEL_TENSOR.POS_EMBD: (
63 "transformer.wpe", # gpt2
64 "embeddings.position_embeddings", # bert
65 "wpe", # gpt2
66 ),
67
68 # Output
69 MODEL_TENSOR.OUTPUT: (
70 "embed_out", # gptneox
71 "lm_head", # gpt2 mpt falcon llama-hf baichuan qwen mamba dbrx jais nemotron exaone olmoe olmo2 phimoe plamo2
72 "output", # llama-pth bloom internlm2
73 "word_embeddings_for_head", # persimmon
74 "lm_head.linear", # phi2
75 "output_layer", # chatglm
76 "head", # rwkv
77 "head.out", # wavtokenizer
78 "lm_head", # llama4
79 "model.transformer.ff_out", # llada
80 "head.decoder", # modern-bert
81 ),
82 MODEL_TENSOR.DENSE_2_OUT: (
83 "dense_2_out", # embeddinggemma
84 ),
85 MODEL_TENSOR.DENSE_3_OUT: (
86 "dense_3_out", # embeddinggemma
87 ),
88 # Output norm
89 MODEL_TENSOR.OUTPUT_NORM: (
90 "gpt_neox.final_layer_norm", # gptneox
91 "transformer.ln_f", # gpt2 gpt-j falcon jais exaone
92 "model.norm", # llama-hf baichuan internlm2 olmoe olmo2 phimoe plamo2
93 "norm", # llama-pth
94 "transformer.norm_f", # mpt dbrx
95 "ln_f", # refact bloom qwen gpt2
96 "language_model.encoder.final_layernorm", # persimmon
97 "model.final_layernorm", # persimmon
98 "lm_head.ln", # phi2
99 "model.norm_f", # mamba-qbert
100 "backbone.norm_f", # mamba
101 "transformer.rms_norm", # Grok
102 "encoder.final_layernorm", # chatglm
103 "transformer.norm", # openelm
104 "model.norm", # nemotron
105 "rwkv.ln_out", # rwkv6
106 "model.ln_out", # rwkv7
107 "backbone.final_layer_norm", # wavtokenizer
108 "model.norm", # llama4
109 "model.transformer.ln_f", # llada
110 "final_norm", # modern-bert
111 "model.norm", # cogvlm
112 ),
113
114 # Rope frequencies
115 MODEL_TENSOR.ROPE_FREQS: (
116 "rope.freqs", # llama-pth
117 "rotary_pos_emb.inv_freq", # chatglm
118 ),
119
120 MODEL_TENSOR.ROPE_FACTORS_LONG: (),
121 MODEL_TENSOR.ROPE_FACTORS_SHORT: (),
122
123 MODEL_TENSOR.CONV1D: (
124 "backbone.embed", # roberta
125 ),
126
127 MODEL_TENSOR.V_MM_EMBEDDING: (
128 "model.embed_vision.embedding", # gemma3n
129 ),
130 MODEL_TENSOR.V_MM_HARD_EMB_NORM: (
131 "model.embed_vision.hard_embedding_norm", # gemma3n
132 ),
133 MODEL_TENSOR.V_MM_INP_PROJ: (
134 "model.embed_vision.embedding_projection", # gemma3n
135 ),
136 MODEL_TENSOR.V_MM_SOFT_EMB_NORM: (
137 "model.embed_vision.soft_embedding_norm", # gemma3n
138 ),
139 MODEL_TENSOR.V_ENC_CONV_STEM: (
140 "model.vision_tower.timm_model.conv_stem.conv", # gemma3n
141 ),
142 MODEL_TENSOR.V_ENC_CONV_STEM_NORM: (
143 "model.vision_tower.timm_model.conv_stem.bn", # gemma3n
144 ),
145 MODEL_TENSOR.V_ENC_MSFA_EXP: (
146 "model.vision_tower.timm_model.msfa.ffn.pw_exp.conv", # gemma3n
147 ),
148 MODEL_TENSOR.V_ENC_MSFA_EXP_NORM: (
149 "model.vision_tower.timm_model.msfa.ffn.pw_exp.bn", # gemma3n
150 ),
151 MODEL_TENSOR.V_ENC_MSFA_PROJ: (
152 "model.vision_tower.timm_model.msfa.ffn.pw_proj.conv", # gemma3n
153 ),
154 MODEL_TENSOR.V_ENC_MSFA_PROJ_NORM: (
155 "model.vision_tower.timm_model.msfa.ffn.pw_proj.bn", # gemma3n
156 ),
157 MODEL_TENSOR.V_ENC_MSFA_NORM: (
158 "model.vision_tower.timm_model.msfa.norm", # gemma3n
159 ),
160 }
161
162 block_mappings_cfg: dict[MODEL_TENSOR, tuple[str, ...]] = {
163 # Attention norm
164 MODEL_TENSOR.ATTN_NORM: (
165 "gpt_neox.layers.{bid}.input_layernorm", # gptneox
166 "transformer.h.{bid}.ln_1", # gpt2 gpt-j refact qwen jais exaone
167 "transformer.blocks.{bid}.norm_1", # mpt
168 "transformer.h.{bid}.input_layernorm", # falcon7b
169 "h.{bid}.input_layernorm", # bloom
170 "transformer.h.{bid}.ln_mlp", # falcon40b
171 "model.layers.{bid}.input_layernorm", # llama-hf nemotron olmoe phimoe granite-hybrid
172 "layers.{bid}.attention_norm", # llama-pth
173 "language_model.encoder.layers.{bid}.input_layernorm", # persimmon
174 "model.layers.{bid}.ln1", # yi
175 "h.{bid}.ln_1", # gpt2
176 "transformer.h.{bid}.ln", # phi2
177 "model.layers.layers.{bid}.norm", # plamo
178 "model.layers.layers.{bid}.pre_mixer_norm", # plamo2
179 "model.layers.{bid}.attention_norm", # internlm2
180 "model.layers.{bid}.norm", # mamba-qbert
181 "backbone.layers.{bid}.norm", # mamba
182 "transformer.decoder_layer.{bid}.rms_norm", # Grok
183 "model.layers.{bid}.pre_attn_norm", # grok-2
184 "transformer.blocks.{bid}.norm_attn_norm.norm_1", # dbrx
185 "encoder.layers.{bid}.input_layernorm", # chatglm
186 "transformer.layers.{bid}.attn_norm", # openelm
187 "rwkv.blocks.{bid}.ln1", # rwkv6
188 "model.layers.{bid}.ln1", # rwkv7
189 "model.layers.{bid}.input_layernorm", # llama4
190 "layers.{bid}.input_layernorm", # embeddinggemma
191 "transformer_encoder.{bid}.attention_norm", # neobert
192 "layers.{bid}.attn_norm", # modern-bert
193 "model.layers.{bid}.operator_norm", # lfm2
194 "model.transformer.blocks.{bid}.attn_norm", # llada
195 "layers.{bid}.input_layernorm", # qwen3-embedding
196 "model.layers.{bid}.attention_layernorm", # apertus
197 "model.layers.{bid}.pre_attention_layernorm", # kormo
198 ),
199
200 # Attention norm 2
201 MODEL_TENSOR.ATTN_NORM_2: (
202 "transformer.h.{bid}.ln_attn", # falcon40b
203 "encoder.layer.{bid}.layer_norm_1", # jina-v2-code
204 "rwkv.blocks.{bid}.ln2", # rwkv6
205 "model.layers.{bid}.ln2", # rwkv7
206 "model.layers.{bid}.post_attention_layernorm", # cogvlm
207 ),
208
209 # Attention query-key-value
210 MODEL_TENSOR.ATTN_QKV: (
211 "gpt_neox.layers.{bid}.attention.query_key_value", # gptneox
212 "transformer.h.{bid}.attn.c_attn", # gpt2 qwen jais
213 "transformer.blocks.{bid}.attn.Wqkv", # mpt
214 "transformer.blocks.{bid}.norm_attn_norm.attn.Wqkv", # dbrx
215 "transformer.h.{bid}.self_attention.query_key_value", # falcon
216 "h.{bid}.self_attention.query_key_value", # bloom
217 "language_model.encoder.layers.{bid}.self_attention.query_key_value", # persimmon
218 "model.layers.{bid}.self_attn.query_key_value", # persimmon
219 "model.layers.{bid}.attention.query_key_value", # bailingmoe2
220 "h.{bid}.attn.c_attn", # gpt2
221 "transformer.h.{bid}.mixer.Wqkv", # phi2
222 "encoder.layers.{bid}.attn.Wqkv", # nomic-bert
223 "encoder.layers.{bid}.mixer.Wqkv", # jina
224 "model.layers.{bid}.self_attn.qkv_proj", # phi3
225 "model.layers.layers.{bid}.mixer.qkv_proj", # plamo2
226 "encoder.layers.{bid}.self_attention.query_key_value", # chatglm
227 "transformer.layers.{bid}.attn.qkv_proj", # openelm
228 "transformer_encoder.{bid}.qkv", # neobert
229 "layers.{bid}.attn.Wqkv", # modern-bert
230 "model.layers.{bid}.self_attn.language_expert_query_key_value", # cogvlm
231 "model.layers.{bid}.linear_attn.in_proj_qkv", # qwen3.5
232 ),
233
234 # Attention query
235 MODEL_TENSOR.ATTN_Q: (
236 "model.layers.{bid}.self_attn.q_proj", # llama-hf nemotron olmoe olmo2 phimoe
237 "layers.{bid}.self_attn.q_proj", # embeddinggemma
238 "model.layers.{bid}.self_attn.q_proj_no_perm", # llama-custom
239 "layers.{bid}.attention.wq", # llama-pth
240 "encoder.layer.{bid}.attention.self.query", # bert
241 "transformer.layer.{bid}.attention.q_lin", # distillbert
242 "transformer.h.{bid}.attn.q_proj", # gpt-j
243 "model.layers.layers.{bid}.self_attn.q_proj", # plamo
244 "model.layers.{bid}.attention.wq", # internlm2
245 "transformer.decoder_layer.{bid}.multi_head_attention.query",# Grok
246 "transformer.h.{bid}.attn.attention.q_proj", # exaone
247 "model.layers.{bid}.self_attn.q_proj", # llama4
248 "model.transformer.blocks.{bid}.q_proj", # llada
249 "layers.{bid}.self_attn.q_proj", # qwen3-embedding
250 "backbone.layers.{bid}.mixer.q_proj", # nemotron-h
251 ),
252
253 # Attention key
254 MODEL_TENSOR.ATTN_K: (
255 "model.layers.{bid}.self_attn.k_proj", # llama-hf nemotron olmoe olmo2 phimoe
256 "layers.{bid}.self_attn.k_proj", # embeddinggemma
257 "model.layers.{bid}.self_attn.k_proj_no_perm", # llama-custom
258 "layers.{bid}.attention.wk", # llama-pth
259 "encoder.layer.{bid}.attention.self.key", # bert
260 "transformer.layer.{bid}.attention.k_lin", # distillbert
261 "transformer.h.{bid}.attn.k_proj", # gpt-j
262 "transformer.h.{bid}.attn.k", # refact
263 "model.layers.layers.{bid}.self_attn.k_proj", # plamo
264 "model.layers.{bid}.attention.wk", # internlm2
265 "transformer.decoder_layer.{bid}.multi_head_attention.key",# Grok
266 "transformer.h.{bid}.attn.attention.k_proj", # exaone
267 "model.layers.{bid}.self_attn.k_proj", # llama4
268 "model.transformer.blocks.{bid}.k_proj", # llada
269 "layers.{bid}.self_attn.k_proj", # qwen3-embedding
270 "backbone.layers.{bid}.mixer.k_proj", # nemotron-h
271 ),
272
273 # Attention value
274 MODEL_TENSOR.ATTN_V: (
275 "model.layers.{bid}.self_attn.v_proj", # llama-hf nemotron olmoe olmo2 phimoe
276 "layers.{bid}.self_attn.v_proj", # embeddinggemma
277 "layers.{bid}.attention.wv", # llama-pth
278 "encoder.layer.{bid}.attention.self.value", # bert
279 "transformer.layer.{bid}.attention.v_lin", # distillbert
280 "transformer.h.{bid}.attn.v_proj", # gpt-j
281 "transformer.h.{bid}.attn.v", # refact
282 "model.layers.layers.{bid}.self_attn.v_proj", # plamo
283 "model.layers.{bid}.attention.wv", # internlm2
284 "transformer.decoder_layer.{bid}.multi_head_attention.value",# Grok
285 "transformer.h.{bid}.attn.attention.v_proj", # exaone
286 "model.layers.{bid}.self_attn.v_proj", # llama4
287 "model.transformer.blocks.{bid}.v_proj", # llada
288 "layers.{bid}.self_attn.v_proj", # qwen3-embedding
289 "backbone.layers.{bid}.mixer.v_proj", # nemotron-h
290 ),
291
292 # Attention output
293 MODEL_TENSOR.ATTN_OUT: (
294 "gpt_neox.layers.{bid}.attention.dense", # gptneox
295 "transformer.h.{bid}.attn.c_proj", # gpt2 refact qwen jais
296 "transformer.blocks.{bid}.attn.out_proj", # mpt
297 "transformer.h.{bid}.self_attention.dense", # falcon
298 "h.{bid}.self_attention.dense", # bloom
299 "model.layers.{bid}.self_attn.o_proj", # llama-hf nemotron olmoe olmo2 phimoe
300 "layers.{bid}.self_attn.o_proj", # embeddinggemma
301 "model.layers.{bid}.self_attn.out_proj", # lfm2
302 "model.layers.{bid}.self_attn.linear_attn", # deci
303 "layers.{bid}.attention.wo", # llama-pth
304 "encoder.layer.{bid}.attention.output.dense", # bert
305 "layers.{bid}.attn.Wo", # modern-bert
306 "transformer.layer.{bid}.attention.out_lin", # distillbert
307 "transformer.h.{bid}.attn.out_proj", # gpt-j
308 "language_model.encoder.layers.{bid}.self_attention.dense", # persimmon
309 "model.layers.{bid}.self_attn.dense", # persimmon
310 "model.layers.{bid}.attention.dense", # bailingmoe2
311 "h.{bid}.attn.c_proj", # gpt2
312 "transformer.h.{bid}.mixer.out_proj", # phi2
313 "model.layers.layers.{bid}.self_attn.o_proj", # plamo
314 "model.layers.layers.{bid}.mixer.o_proj", # plamo2
315 "model.layers.{bid}.attention.wo", # internlm2
316 "encoder.layers.{bid}.attn.out_proj", # nomic-bert
317 "encoder.layers.{bid}.mixer.out_proj", # jina
318 "transformer.decoder_layer.{bid}.multi_head_attention.linear", # Grok
319 "transformer.blocks.{bid}.norm_attn_norm.attn.out_proj", # dbrx
320 "encoder.layers.{bid}.self_attention.dense", # chatglm
321 "transformer.layers.{bid}.attn.out_proj", # openelm
322 "transformer.h.{bid}.attn.attention.out_proj", # exaone
323 "model.layers.{bid}.self_attn.o_proj", # llama4
324 "transformer_encoder.{bid}.wo", # neobert
325 "model.transformer.blocks.{bid}.attn_out", # llada
326 "layers.{bid}.self_attn.o_proj", # qwen3-embedding
327 "backbone.layers.{bid}.mixer.o_proj", # nemotron-h
328 "model.layers.{bid}.self_attn.language_expert_dense", # cogvlm
329 ),
330
331 # Attention output norm
332 MODEL_TENSOR.ATTN_OUT_NORM: (
333 "encoder.layer.{bid}.attention.output.LayerNorm", # bert
334 "transformer.layer.{bid}.sa_layer_norm", # distillbert
335 "encoder.layers.{bid}.norm1", # nomic-bert
336 "transformer.decoder_layer.{bid}.rms_norm_1", # Grok
337 "model.layers.{bid}.post_attn_norm", # grok-2
338 "transformer.blocks.{bid}.norm_attn_norm.norm_2", # dbrx
339 ),
340
341 MODEL_TENSOR.ATTN_POST_NORM: (
342 "model.layers.{bid}.post_attention_layernorm", # gemma2 olmo2 # ge
343 "layers.{bid}.post_attention_layernorm", # embeddinggemma
344 "model.layers.{bid}.post_self_attn_layernorm", # glm-4-0414
345 "model.layers.layers.{bid}.post_mixer_norm.weight", # plamo2
346 ),
347
348 # Rotary embeddings
349 MODEL_TENSOR.ATTN_ROT_EMBD: (
350 "model.layers.{bid}.self_attn.rotary_emb.inv_freq", # llama-hf
351 "layers.{bid}.attention.inner_attention.rope.freqs", # llama-pth
352 "model.layers.layers.{bid}.self_attn.rotary_emb.inv_freq", # plamo
353 "transformer.h.{bid}.attn.rotary_emb.inv_freq", # codeshell
354 ),
355
356 MODEL_TENSOR.ATTN_SINKS: (
357 "model.layers.{bid}.self_attn.sinks", # openai-moe
358 "model.layers.{bid}.self_attn.attention_sink_bias", # mimov2
359 ),
360
361 MODEL_TENSOR.ATTN_GATE: (
362 "model.layers.{bid}.self_attn.gate_proj", # afmoe
363 "model.layers.{bid}.linear_attn.in_proj_z", # qwen3.5
364 "model.layers.{bid}.self_attn.g_proj", # step3.5 head-wise attention gate
365 ),
366
367 # Feed-forward norm
368 MODEL_TENSOR.FFN_NORM: (
369 "gpt_neox.layers.{bid}.post_attention_layernorm", # gptneox
370 "transformer.h.{bid}.ln_2", # gpt2 refact qwen jais exaone
371 "h.{bid}.post_attention_layernorm", # bloom
372 "transformer.blocks.{bid}.norm_2", # mpt
373 "model.layers.{bid}.post_attention_layernorm", # llama-hf nemotron olmoe phimoe
374 "layers.{bid}.ffn_norm", # llama-pth
375 "language_model.encoder.layers.{bid}.post_attention_layernorm", # persimmon
376 "model.layers.{bid}.ln2", # yi
377 "h.{bid}.ln_2", # gpt2
378 "model.layers.{bid}.ffn_norm", # internlm2
379 "transformer.decoder_layer.{bid}.rms_norm_2", # Grok
380 "model.layers.{bid}.pre_moe_norm", # grok-2
381 "encoder.layers.{bid}.post_attention_layernorm", # chatglm
382 "transformer.layers.{bid}.ffn_norm", # openelm
383 "model.layers.{bid}.pre_ff_layernorm", # jamba granite-hybrid
384 "model.layers.{bid}.pre_moe_layernorm", # mini-jamba
385 "model.layers.{bid}.post_attention_layernorm", # llama4
386 "transformer_encoder.{bid}.ffn_norm", # neobert
387 "model.layers.layers.{bid}.pre_mlp_norm", # plamo2
388 "model.transformer.blocks.{bid}.ff_norm", # llada
389 "layers.{bid}.post_attention_layernorm", # qwen3-embedding
390 "model.layers.{bid}.feedforward_layernorm", # apertus
391 "model.layers.{bid}.pre_mlp_layernorm", # kormo
392 "layers.{bid}.mlp_norm" # modern-bert
393 ),
394
395 # Pre feed-forward norm
396 MODEL_TENSOR.FFN_PRE_NORM: (
397 "model.layers.{bid}.pre_feedforward_layernorm", # gemma2
398 "layers.{bid}.pre_feedforward_layernorm", # embeddinggemma
399 "model.layers.{bid}.pre_ff_layernorm.weight",
400 "model.layers.{bid}.pre_mlp_layernorm", # afmoe
401 ),
402
403 # Post feed-forward norm
404 MODEL_TENSOR.FFN_POST_NORM: (
405 "model.layers.{bid}.post_feedforward_layernorm", # gemma2 olmo2
406 "layers.{bid}.post_feedforward_layernorm", # embeddinggemma
407 "model.layers.{bid}.post_mlp_layernorm", # glm-4-0414
408 "model.layers.layers.{bid}.post_mlp_norm.weight", # plamo2
409 "model.layers.{bid}.feed_forward.up_proj",
410 "model.layers.{bid}.post_moe_norm", # grok-2
411 ),
412
413 MODEL_TENSOR.FFN_GATE_INP: (
414 "layers.{bid}.feed_forward.gate", # mixtral
415 "model.layers.{bid}.block_sparse_moe.gate", # mixtral phimoe
416 "model.layers.{bid}.mlp.gate", # qwen2moe olmoe
417 "transformer.decoder_layer.{bid}.router", # Grok
418 "transformer.blocks.{bid}.ffn.router.layer", # dbrx
419 "model.layers.{bid}.block_sparse_moe.router.layer", # granitemoe
420 "model.layers.{bid}.feed_forward.router", # llama4 jamba
421 "encoder.layers.{bid}.mlp.router.layer", # nomic-bert-moe
422 "model.layers.{bid}.mlp.router", # openai-moe
423 "model.layers.{bid}.mlp.gate.wg", # hunyuan
424 "model.layers.{bid}.block_sparse_moe.primary_router", # smallthinker
425 "model.layers.{bid}.feed_forward.gate", # lfm2moe
426 "model.layers.{bid}.mlp.router.gate", # afmoe
427 "layers.{bid}.gate", # mistral-large
428 "backbone.layers.{bid}.mixer.gate", # nemotron-h-moe
429 "model.layers.{bid}.moe.gate", # step3.5
430 ),
431
432 MODEL_TENSOR.FFN_GATE_INP_SHEXP: (
433 "model.layers.{bid}.mlp.shared_expert_gate", # qwen2moe
434 ),
435
436 MODEL_TENSOR.FFN_EXP_PROBS_B: (
437 "model.layers.{bid}.mlp.gate.e_score_correction", # deepseek-v3 dots1
438 "model.layers.{bid}.mlp.moe_statics.e_score_correction", # ernie4.5-moe
439 "model.layers.{bid}.mlp.gate.expert_bias", # bailingmoe2
440 "model.layers.{bid}.mlp.expert_bias", # afmoe
441 "model.layers.{bid}.feed_forward.expert_bias", # lfm2moe
442 "model.layers.{bid}.block_sparse_moe.e_score_correction", # minimax-m2
443 "backbone.layers.{bid}.mixer.gate.e_score_correction", # nemotron-h-moe
444 "model.layers.{bid}.mlp.e_score_correction", # exaone-moe
445 "model.layers.{bid}.block_sparse_moe.gate.e_score_correction", # kimi
446 "model.layers.{bid}.moe.router_bias", # step3.5 expert selection bias
447 ),
448
449 # Feed-forward up
450 MODEL_TENSOR.FFN_UP: (
451 "gpt_neox.layers.{bid}.mlp.dense_h_to_4h", # gptneox
452 "transformer.h.{bid}.mlp.c_fc", # gpt2 jais
453 "transformer.blocks.{bid}.ffn.up_proj", # mpt
454 "transformer.h.{bid}.mlp.dense_h_to_4h", # falcon
455 "h.{bid}.mlp.dense_h_to_4h", # bloom
456 "model.layers.{bid}.mlp.up_proj", # llama-hf refact nemotron olmo2
457 "layers.{bid}.mlp.up_proj", # embeddinggemma
458 "layers.{bid}.feed_forward.w3", # llama-pth
459 "encoder.layer.{bid}.intermediate.dense", # bert
460 "layers.{bid}.mlp.Wi", # modern-bert
461 "transformer.layer.{bid}.ffn.lin1", # distillbert
462 "transformer.h.{bid}.mlp.fc_in", # gpt-j
463 "transformer.h.{bid}.mlp.linear_3", # refact
464 "language_model.encoder.layers.{bid}.mlp.dense_h_to_4h", # persimmon
465 "model.layers.{bid}.mlp.dense_h_to_4h", # persimmon
466 "transformer.h.{bid}.mlp.w1", # qwen
467 "h.{bid}.mlp.c_fc", # gpt2
468 "transformer.h.{bid}.mlp.fc1", # phi2
469 "model.layers.{bid}.mlp.fc1", # phi2
470 "model.layers.{bid}.mlp.gate_up_proj", # phi3 glm-4-0414
471 "model.layers.layers.{bid}.mlp.up_proj", # plamo
472 "model.layers.layers.{bid}.mlp.gate_up_proj", # plamo2
473 "model.layers.{bid}.feed_forward.w3", # internlm2
474 "encoder.layers.{bid}.mlp.fc11", # nomic-bert
475 "encoder.layers.{bid}.mlp.fc1", # nomic-bert-moe
476 "model.layers.{bid}.mlp.c_fc", # starcoder2
477 "encoder.layer.{bid}.mlp.gated_layers_v", # jina-bert-v2 (split up/gate, no longer used)
478 "encoder.layer.{bid}.mlp.gated_layers", # jina-bert-v2 (GEGLU)
479 "encoder.layer.{bid}.mlp.up_gated_layer", # jina-v2-code (GEGLU)
480 "model.layers.{bid}.residual_mlp.w3", # arctic
481 "encoder.layers.{bid}.mlp.dense_h_to_4h", # chatglm
482 "transformer.h.{bid}.mlp.c_fc_1", # exaone
483 "model.layers.{bid}.feed_forward.up_proj", # llama4 jamba granite-hybrid
484 "transformer_encoder.{bid}.ffn.w12", # neobert
485 "model.layers.{bid}.block_sparse_moe.up", # smallthinker
486 "model.transformer.blocks.{bid}.up_proj", # llada
487 "layers.{bid}.mlp.up_proj", # qwen3-embedding
488 "backbone.layers.{bid}.mixer.up_proj", # nemotron-h
489 "model.layers.{bid}.mlp.language_mlp.up_proj", # cogvlm
490 ),
491
492 MODEL_TENSOR.FFN_UP_EXP: (
493 "layers.{bid}.feed_forward.experts.w3", # mixtral (merged)
494 "transformer.decoder_layer.{bid}.moe.linear_v", # Grok (merged)
495 "transformer.blocks.{bid}.ffn.experts.mlp.v1", # dbrx
496 "model.layers.{bid}.mlp.experts.up_proj", # qwen2moe olmoe (merged) ernie4.5-moe, nemotron-h-moe (merged)
497 "model.layers.{bid}.block_sparse_moe.experts.w3", # phimoe (merged)
498 "model.layers.{bid}.feed_forward.experts.up_proj", # llama4
499 "encoder.layers.{bid}.mlp.experts.mlp.w1", # nomic-bert-moe
500 "model.layers.{bid}.block_sparse_moe.experts.up", # smallthinker
501 "model.layers.{bid}.moe.up_proj", # step3.5
502 ),
503
504 MODEL_TENSOR.FFN_UP_SHEXP: (
505 "model.layers.{bid}.mlp.shared_expert.up_proj", # qwen2moe
506 "model.layers.{bid}.mlp.shared_experts.up_proj", # deepseek deepseek2
507 "model.layers.{bid}.feed_forward.shared_expert.up_proj", # llama4
508 "model.layers.{bid}.feed_forward.down_proj",
509 "model.layers.{bid}.mlp.shared_mlp.up_proj", # hunyuan
510 "layers.{bid}.shared_experts.w3", # mistral-large
511 "backbone.layers.{bid}.mixer.shared_experts.up_proj", # nemotron-h-moe
512 "model.layers.{bid}.block_sparse_moe.shared_experts.up_proj", # kimi
513 "model.layers.{bid}.share_expert.up_proj", # step3.5
514 ),
515
516 MODEL_TENSOR.FFN_UP_CHEXP: (
517 "model.layers.{bid}.mlp.chunk_experts.up_proj", # grovemoe
518 ),
519
520 # AWQ-activation gate
521 MODEL_TENSOR.FFN_ACT: (
522 "transformer.blocks.{bid}.ffn.act", # mpt
523 ),
524
525 # Feed-forward gate
526 MODEL_TENSOR.FFN_GATE: (
527 "model.layers.{bid}.mlp.gate_proj", # llama-hf refact olmo2
528 "layers.{bid}.mlp.gate_proj", # embeddinggemma
529 "layers.{bid}.feed_forward.w1", # llama-pth
530 "transformer.h.{bid}.mlp.w2", # qwen
531 "transformer.h.{bid}.mlp.c_fc2", # jais
532 "model.layers.layers.{bid}.mlp.gate_proj", # plamo
533 "model.layers.{bid}.feed_forward.w1", # internlm2
534 "encoder.layers.{bid}.mlp.fc12", # nomic-bert
535 "encoder.layer.{bid}.mlp.gated_layers_w", # jina-bert-v2 (split up/gate, no longer used)
536 "transformer.h.{bid}.mlp.linear_1", # refact
537 "model.layers.{bid}.residual_mlp.w1", # arctic
538 "transformer.h.{bid}.mlp.c_fc_0", # exaone
539 "model.layers.{bid}.feed_forward.gate_proj", # llama4 jamba granite-hybrid
540 "model.transformer.blocks.{bid}.ff_proj", # llada
541 "layers.{bid}.mlp.gate_proj", # qwen3-embedding
542 "model.layers.{bid}.mlp.language_mlp.gate_proj", # cogvlm
543 ),
544
545 MODEL_TENSOR.FFN_GATE_EXP: (
546 "layers.{bid}.feed_forward.experts.w1", # mixtral (merged)
547 "transformer.decoder_layer.{bid}.moe.linear", # Grok (merged)
548 "transformer.blocks.{bid}.ffn.experts.mlp.w1", # dbrx
549 "model.layers.{bid}.mlp.experts.gate_proj", # qwen2moe olmoe (merged) ernie4.5-moe
550 "model.layers.{bid}.block_sparse_moe.experts.w1", # phimoe (merged)
551 "model.layers.{bid}.feed_forward.experts.gate_proj", # llama4
552 "model.layers.{bid}.block_sparse_moe.experts.gate", # smallthinker
553 "model.layers.{bid}.moe.gate_proj", # step3.5
554 ),
555
556 MODEL_TENSOR.FFN_GATE_SHEXP: (
557 "model.layers.{bid}.mlp.shared_expert.gate_proj", # qwen2moe
558 "model.layers.{bid}.mlp.shared_experts.gate_proj", # deepseek deepseek2
559 "model.layers.{bid}.feed_forward.shared_expert.gate_proj", # llama4
560 "model.layers.{bid}.mlp.shared_mlp.gate_proj", # hunyuan
561 "layers.{bid}.shared_experts.w1", # mistral-large
562 "model.layers.{bid}.block_sparse_moe.shared_experts.gate_proj", # kimi
563 "model.layers.{bid}.share_expert.gate_proj", # step3.5
564 ),
565
566 MODEL_TENSOR.FFN_GATE_CHEXP: (
567 "model.layers.{bid}.mlp.chunk_experts.gate_proj", # grovemoe
568 ),
569
570 # Feed-forward down
571 MODEL_TENSOR.FFN_DOWN: (
572 "gpt_neox.layers.{bid}.mlp.dense_4h_to_h", # gptneox
573 "transformer.h.{bid}.mlp.c_proj", # gpt2 refact qwen jais
574 "transformer.blocks.{bid}.ffn.down_proj", # mpt
575 "transformer.h.{bid}.mlp.dense_4h_to_h", # falcon
576 "h.{bid}.mlp.dense_4h_to_h", # bloom
577 "model.layers.{bid}.mlp.down_proj", # llama-hf nemotron olmo2
578 "layers.{bid}.mlp.down_proj", # embeddinggemma
579 "layers.{bid}.feed_forward.w2", # llama-pth
580 "encoder.layer.{bid}.output.dense", # bert
581 "layers.{bid}.mlp.Wo", # modern-bert
582 "transformer.layer.{bid}.ffn.lin2", # distillbert
583 "transformer.h.{bid}.mlp.fc_out", # gpt-j
584 "language_model.encoder.layers.{bid}.mlp.dense_4h_to_h", # persimmon
585 "model.layers.{bid}.mlp.dense_4h_to_h", # persimmon
586 "h.{bid}.mlp.c_proj", # gpt2
587 "transformer.h.{bid}.mlp.fc2", # phi2
588 "model.layers.{bid}.mlp.fc2", # phi2
589 "model.layers.layers.{bid}.mlp.down_proj", # plamo
590 "model.layers.{bid}.feed_forward.w2", # internlm2
591 "encoder.layers.{bid}.mlp.fc2", # nomic-bert
592 "model.layers.{bid}.mlp.c_proj", # starcoder2
593 "encoder.layer.{bid}.mlp.wo", # jina-bert-v2
594 "transformer.layers.{bid}.ffn.proj_2", # openelm
595 "model.layers.{bid}.residual_mlp.w2", # arctic
596 "encoder.layer.{bid}.mlp.down_layer", # jina-bert-v2
597 "encoder.layers.{bid}.mlp.dense_4h_to_h", # chatglm
598 "model.layers.h.{bid}.mlp.c_proj", # exaone
599 "model.layers.{bid}.feed_forward.down_proj", # llama4 jamba granite-hybrid
600 "transformer_encoder.{bid}.ffn.w3", # neobert
601 "model.layers.{bid}.block_sparse_moe.down", # smallthinker
602 "model.transformer.blocks.{bid}.ff_out", # llada
603 "layers.{bid}.mlp.down_proj", # qwen3-embedding
604 "backbone.layers.{bid}.mixer.down_proj", # nemotron-h
605 "model.layers.{bid}.mlp.language_mlp.down_proj", # cogvlm
606 ),
607
608 MODEL_TENSOR.FFN_DOWN_EXP: (
609 "layers.{bid}.feed_forward.experts.w2", # mixtral (merged)
610 "transformer.decoder_layer.{bid}.moe.linear_1", # Grok (merged)
611 "transformer.blocks.{bid}.ffn.experts.mlp.w2", # dbrx
612 "model.layers.{bid}.mlp.experts.down_proj", # qwen2moe olmoe (merged) ernie4.5-moe nemotron-h-moe (merged)
613 "model.layers.{bid}.block_sparse_moe.output_linear", # granitemoe
614 "model.layers.{bid}.block_sparse_moe.experts.w2", # phimoe (merged)
615 "model.layers.{bid}.feed_forward.experts.down_proj", # llama4
616 "encoder.layers.{bid}.mlp.experts.mlp.w2", # nomic-bert-moe
617 "model.layers.{bid}.block_sparse_moe.experts.down", # smallthinker
618 "model.layers.{bid}.moe.down_proj", # step3.5
619 ),
620
621 MODEL_TENSOR.FFN_DOWN_SHEXP: (
622 "model.layers.{bid}.mlp.shared_expert.down_proj", # qwen2moe
623 "model.layers.{bid}.mlp.shared_experts.down_proj", # deepseek deepseek2
624 "model.layers.{bid}.feed_forward.shared_expert.down_proj", # llama4
625 "model.layers.{bid}.shared_mlp.output_linear", # granitemoe
626 "model.layers.{bid}.mlp.shared_mlp.down_proj", # hunyuan
627 "layers.{bid}.shared_experts.w2", # mistral-large
628 "backbone.layers.{bid}.mixer.shared_experts.down_proj", # nemotron-h-moe
629 "model.layers.{bid}.block_sparse_moe.shared_experts.down_proj", # kimi
630 "model.layers.{bid}.share_expert.down_proj", # step3.5
631 ),
632
633 MODEL_TENSOR.FFN_DOWN_CHEXP: (
634 "model.layers.{bid}.mlp.chunk_experts.down_proj", # grovemoe
635 ),
636
637 MODEL_TENSOR.ATTN_Q_NORM: (
638 "language_model.encoder.layers.{bid}.self_attention.q_layernorm",
639 "model.layers.{bid}.self_attn.q_layernorm", # persimmon
640 "model.layers.{bid}.self_attn.query_layernorm", # hunyuan
641 "model.layers.{bid}.attention.query_layernorm", # bailingmoe2
642 "model.layers.{bid}.self_attn.q_norm", # cohere olmoe chameleon olmo2
643 "layers.{bid}.self_attn.q_norm", # embeddinggemma
644 "transformer.blocks.{bid}.attn.q_ln", # sea-lion
645 "encoder.layer.{bid}.attention.self.layer_norm_q", # jina-bert-v2
646 "transformer.layers.{bid}.attn.q_norm", # openelm
647 "model.layers.layers.{bid}.mixer.q", # plamo2
648 "model.layers.layers.{bid}.mixer.q_norm", # plamo3
649 "layers.{bid}.self_attn.q_norm", # qwen3-embedding
650 "model.layers.{bid}.attention.query_layernorm", # apertus
651 ),
652
653 MODEL_TENSOR.ATTN_K_NORM: (
654 "language_model.encoder.layers.{bid}.self_attention.k_layernorm",
655 "model.layers.{bid}.self_attn.k_layernorm", # persimmon
656 "model.layers.{bid}.self_attn.key_layernorm", # hunyuan
657 "model.layers.{bid}.attention.key_layernorm", # bailingmoe2
658 "model.layers.{bid}.self_attn.k_norm", # cohere olmoe chameleon olmo2
659 "layers.{bid}.self_attn.k_norm", # embeddinggemma
660 "transformer.blocks.{bid}.attn.k_ln", # sea-lion
661 "encoder.layer.{bid}.attention.self.layer_norm_k", # jina-bert-v2
662 "transformer.layers.{bid}.attn.k_norm", # openelm
663 "model.layers.layers.{bid}.mixer.k", # plamo2
664 "model.layers.layers.{bid}.mixer.k_norm", # plamo3
665 "layers.{bid}.self_attn.k_norm", # qwen3-embedding
666 "model.layers.{bid}.attention.key_layernorm", # apertus
667 ),
668
669 MODEL_TENSOR.ROPE_FREQS: (
670 "language_model.encoder.layers.{bid}.self_attention.rotary_emb.inv_freq", # persimmon
671 ),
672
673 MODEL_TENSOR.LAYER_OUT_NORM: (
674 "encoder.layer.{bid}.output.LayerNorm", # bert
675 "transformer.layer.{bid}.output_layer_norm", # distillbert
676 "encoder.layers.{bid}.norm2", # nomic-bert
677 "transformer.decoder_layer.{bid}.rms_norm_3", # Grok
678 "encoder.layer.{bid}.mlp.layernorm", # jina-bert-v2
679 "encoder.layer.{bid}.layer_norm_2", # jina-v2-code
680 "model.layers.{bid}.final_layernorm", # bailingmoe2
681 ),
682
683 MODEL_TENSOR.PER_LAYER_TOKEN_EMBD: (
684 "model.embed_tokens_per_layer", # gemma3n
685 ),
686
687 MODEL_TENSOR.PER_LAYER_MODEL_PROJ: (
688 "model.per_layer_model_projection", # gemma3n
689 ),
690
691 MODEL_TENSOR.PER_LAYER_PROJ_NORM: (
692 "model.per_layer_projection_norm", # gemma3n
693 ),
694
695 MODEL_TENSOR.ALTUP_PROJ: (
696 "model.altup_projections", # gemma3n
697 ),
698
699 MODEL_TENSOR.ALTUP_UNEMBD_PROJ: (
700 "model.altup_unembed_projections", # gemma3n
701 ),
702
703 MODEL_TENSOR.PER_LAYER_INP_GATE: (
704 "model.layers.{bid}.per_layer_input_gate", # gemma3n
705 ),
706
707 MODEL_TENSOR.PER_LAYER_PROJ: (
708 "model.layers.{bid}.per_layer_projection", # gemma3n
709 ),
710
711 MODEL_TENSOR.PER_LAYER_POST_NORM: (
712 "model.layers.{bid}.post_per_layer_input_norm", # gemma3n
713 ),
714
715 MODEL_TENSOR.ALTUP_CORRECT_COEF: (
716 "model.layers.{bid}.altup.correction_coefs", # gemma3n
717 ),
718
719 MODEL_TENSOR.ALTUP_CORRECT_SCALE: (
720 "model.layers.{bid}.altup.correct_output_scale", # gemma3n
721 ),
722
723 MODEL_TENSOR.ALTUP_PREDICT_COEF: (
724 "model.layers.{bid}.altup.prediction_coefs", # gemma3n
725 ),
726
727 MODEL_TENSOR.ALTUP_ROUTER: (
728 "model.layers.{bid}.altup.modality_router", # gemma3n
729 ),
730
731 MODEL_TENSOR.ALTUP_ROUTER_NORM: (
732 "model.layers.{bid}.altup.router_norm", # gemma3n
733 ),
734
735 MODEL_TENSOR.LAUREL_L: (
736 "model.layers.{bid}.laurel.linear_left", # gemma3n
737 ),
738
739 MODEL_TENSOR.LAUREL_R: (
740 "model.layers.{bid}.laurel.linear_right", # gemma3n
741 ),
742
743 MODEL_TENSOR.LAUREL_POST_NORM: (
744 "model.layers.{bid}.laurel.post_laurel_norm", # gemma3n
745 ),
746
747 MODEL_TENSOR.SSM_IN: (
748 "model.layers.{bid}.in_proj", # mamba-hf
749 "backbone.layers.{bid}.mixer.in_proj", # mamba
750 "model.layers.{bid}.mamba.in_proj", # jamba falcon-h1 granite-hybrid
751 "model.layers.layers.{bid}.mixer.in_proj", # plamo2
752 "model.layers.{bid}.linear_attn.in_proj_qkvz", # qwen3next
753 ),
754
755 MODEL_TENSOR.SSM_CONV1D: (
756 "model.layers.{bid}.conv1d", # mamba-hf
757 "backbone.layers.{bid}.mixer.conv1d", # mamba
758 "model.layers.{bid}.mamba.conv1d", # jamba falcon-h1 granite-hybrid
759 "model.layers.layers.{bid}.mixer.conv1d", # plamo2
760 "model.layers.{bid}.linear_attn.conv1d", # qwen3next
761 ),
762
763 MODEL_TENSOR.SSM_X: (
764 "model.layers.{bid}.x_proj", # mamba-hf
765 "backbone.layers.{bid}.mixer.x_proj", # mamba
766 "model.layers.{bid}.mamba.x_proj", # jamba
767 "model.layers.layers.{bid}.mixer.bcdt_proj", # plamo2
768 ),
769
770 MODEL_TENSOR.SSM_DT: (
771 "model.layers.{bid}.dt_proj", # mamba-hf
772 "backbone.layers.{bid}.mixer.dt_proj", # mamba
773 "model.layers.{bid}.mamba.dt_proj", # jamba falcon-h1 granite-hybrid
774 "model.layers.layers.{bid}.mixer.dt_proj", # plamo2
775 "model.layers.{bid}.linear_attn.dt_proj", # qwen3next
776 "backbone.layers.{bid}.mixer.dt", # nemotron-h-moe
777 "model.layers.{bid}.self_attn.dt_proj", # kimi
778 ),
779
780 MODEL_TENSOR.SSM_DT_NORM: (
781 "model.layers.layers.{bid}.mixer.dt_norm.weight", # plamo2
782 "model.layers.{bid}.mamba.dt_layernorm", # jamba
783 ),
784
785 MODEL_TENSOR.SSM_A: (
786 "model.layers.{bid}.A_log", # mamba-hf
787 "backbone.layers.{bid}.mixer.A_log", # mamba
788 "model.layers.{bid}.mamba.A_log", # jamba falcon-h1 granite-hybrid
789 "model.layers.layers.{bid}.mixer.A_log", # plamo2
790 "model.layers.{bid}.linear_attn.A_log", # qwen3next
791 "model.layers.{bid}.self_attn.A_log", # kimi
792 ),
793
794 MODEL_TENSOR.SSM_B_NORM: (
795 "model.layers.{bid}.mamba.b_layernorm", # jamba
796 "model.layers.{bid}.mamba.B_layernorm", # mini-jamba
797 "model.layers.layers.{bid}.mixer.B_norm.weight", # plamo2
798 ),
799
800 MODEL_TENSOR.SSM_C_NORM: (
801 "model.layers.{bid}.mamba.c_layernorm", # jamba
802 "model.layers.{bid}.mamba.C_layernorm", # mini-jamba
803 "model.layers.layers.{bid}.mixer.C_norm.weight", # plamo2
804 ),
805
806 MODEL_TENSOR.SSM_D: (
807 "model.layers.{bid}.D", # mamba-hf
808 "backbone.layers.{bid}.mixer.D", # mamba
809 "model.layers.{bid}.mamba.D", # jamba falcon-h1 granite-hybrid
810 "model.layers.layers.{bid}.mixer.D", # plamo2
811 ),
812
813 MODEL_TENSOR.SSM_NORM: (
814 "model.layers.{bid}.mamba.norm", # falcon-h1 granite-hybrid
815 "model.layers.{bid}.linear_attn.norm", # qwen3next
816 "backbone.layers.{bid}.mixer.norm", # mamba2
817 "model.layers.{bid}.self_attn.o_norm", # kimi
818 ),
819
820 MODEL_TENSOR.SSM_OUT: (
821 "model.layers.{bid}.out_proj", # mamba-hf
822 "backbone.layers.{bid}.mixer.out_proj", # mamba
823 "model.layers.{bid}.mamba.out_proj", # jamba falcon-h1 granite-hybrid
824 "model.layers.{bid}.linear_attn.out_proj", # qwen3next
825 "model.layers.layers.{bid}.mixer.out_proj", # plamo2
826 ),
827
828 MODEL_TENSOR.SSM_ALPHA: (
829 "model.layers.{bid}.linear_attn.in_proj_a", # qwen3.5
830 ),
831
832 MODEL_TENSOR.SSM_BETA_ALPHA: (
833 "model.layers.{bid}.linear_attn.in_proj_ba", # qwen3next
834 ),
835
836 # Kimi Linear KDA (using SSM_ prefix for consistency)
837 MODEL_TENSOR.SSM_CONV1D_Q: (
838 "model.layers.{bid}.self_attn.q_conv1d",
839 ),
840 MODEL_TENSOR.SSM_CONV1D_K: (
841 "model.layers.{bid}.self_attn.k_conv1d",
842 ),
843 MODEL_TENSOR.SSM_CONV1D_V: (
844 "model.layers.{bid}.self_attn.v_conv1d",
845 ),
846 MODEL_TENSOR.SSM_F_A: (
847 "model.layers.{bid}.self_attn.f_a_proj",
848 ),
849 MODEL_TENSOR.SSM_F_B: (
850 "model.layers.{bid}.self_attn.f_b_proj",
851 ),
852 MODEL_TENSOR.SSM_BETA: (
853 "model.layers.{bid}.linear_attn.in_proj_b", # qwen3.5
854 "model.layers.{bid}.self_attn.b_proj", # Kimi Linear
855 ),
856 MODEL_TENSOR.SSM_G_A: (
857 "model.layers.{bid}.self_attn.g_a_proj",
858 ),
859 MODEL_TENSOR.SSM_G_B: (
860 "model.layers.{bid}.self_attn.g_b_proj",
861 ),
862 MODEL_TENSOR.TIME_MIX_W0: (
863 "model.layers.{bid}.attention.w0", # rwkv7
864 ),
865
866 MODEL_TENSOR.TIME_MIX_W1: (
867 "rwkv.blocks.{bid}.attention.time_maa_w1", # rwkv6
868 "model.layers.{bid}.self_attn.time_maa_w1", # rwkv6qwen2
869 "model.layers.{bid}.attention.w1", # rwkv7
870 ),
871
872 MODEL_TENSOR.TIME_MIX_W2: (
873 "rwkv.blocks.{bid}.attention.time_maa_w2", # rwkv6
874 "model.layers.{bid}.self_attn.time_maa_w2", # rwkv6qwen2
875 "model.layers.{bid}.attention.w2", # rwkv7
876 ),
877
878 MODEL_TENSOR.TIME_MIX_A0: (
879 "model.layers.{bid}.attention.a0", # rwkv7
880 ),
881
882 MODEL_TENSOR.TIME_MIX_A1: (
883 "model.layers.{bid}.attention.a1", # rwkv7
884 ),
885
886 MODEL_TENSOR.TIME_MIX_A2: (
887 "model.layers.{bid}.attention.a2", # rwkv7
888 ),
889
890 MODEL_TENSOR.TIME_MIX_V0: (
891 "model.layers.{bid}.attention.v0", # rwkv7
892 ),
893
894 MODEL_TENSOR.TIME_MIX_V1: (
895 "model.layers.{bid}.attention.v1", # rwkv7
896 ),
897
898 MODEL_TENSOR.TIME_MIX_V2: (
899 "model.layers.{bid}.attention.v2", # rwkv7
900 ),
901
902 MODEL_TENSOR.TIME_MIX_G1: (
903 "model.layers.{bid}.attention.g1", # rwkv7
904 ),
905
906 MODEL_TENSOR.TIME_MIX_G2: (
907 "model.layers.{bid}.attention.g2", # rwkv7
908 ),
909
910 MODEL_TENSOR.TIME_MIX_K_K: (
911 "model.layers.{bid}.attention.k_k", # rwkv7
912 ),
913
914 MODEL_TENSOR.TIME_MIX_K_A: (
915 "model.layers.{bid}.attention.k_a", # rwkv7
916 ),
917
918 MODEL_TENSOR.TIME_MIX_R_K: (
919 "model.layers.{bid}.attention.r_k", # rwkv7
920 ),
921
922 MODEL_TENSOR.TIME_MIX_LERP_X: (
923 "rwkv.blocks.{bid}.attention.time_maa_x", # rwkv6
924 "model.layers.{bid}.self_attn.time_maa_x", # rwkv6qwen2
925 ),
926
927 MODEL_TENSOR.TIME_MIX_LERP_K: (
928 "rwkv.blocks.{bid}.attention.time_maa_k", # rwkv6
929 "model.layers.{bid}.self_attn.time_maa_k", # rwkv6qwen2
930 ),
931
932 MODEL_TENSOR.TIME_MIX_LERP_V: (
933 "rwkv.blocks.{bid}.attention.time_maa_v", # rwkv6
934 "model.layers.{bid}.self_attn.time_maa_v", # rwkv6qwen2
935 ),
936
937 MODEL_TENSOR.TIME_MIX_LERP_R: (
938 "rwkv.blocks.{bid}.attention.time_maa_r", # rwkv6
939 "model.layers.{bid}.self_attn.time_maa_r", # rwkv6qwen2
940 ),
941
942 MODEL_TENSOR.TIME_MIX_LERP_G: (
943 "rwkv.blocks.{bid}.attention.time_maa_g", # rwkv6
944 "model.layers.{bid}.self_attn.time_maa_g", # rwkv6qwen2
945 ),
946
947 MODEL_TENSOR.TIME_MIX_LERP_W: (
948 "rwkv.blocks.{bid}.attention.time_maa_w", # rwkv6
949 "model.layers.{bid}.self_attn.time_maa_w", # rwkv6qwen2
950 ),
951
952 MODEL_TENSOR.TIME_MIX_FIRST: (
953 "rwkv.blocks.{bid}.attention.time_faaaa", # rwkv6
954 ),
955
956 MODEL_TENSOR.TIME_MIX_DECAY: (
957 "rwkv.blocks.{bid}.attention.time_decay", # rwkv6
958 "model.layers.{bid}.self_attn.time_decay", # rwkv6qwen2
959 ),
960
961 MODEL_TENSOR.TIME_MIX_DECAY_W1: (
962 "rwkv.blocks.{bid}.attention.time_decay_w1", # rwkv6
963 "model.layers.{bid}.self_attn.time_decay_w1", # rwkv6qwen2
964 ),
965
966 MODEL_TENSOR.TIME_MIX_DECAY_W2: (
967 "rwkv.blocks.{bid}.attention.time_decay_w2", # rwkv6
968 "model.layers.{bid}.self_attn.time_decay_w2", # rwkv6qwen2
969 ),
970
971 MODEL_TENSOR.TIME_MIX_KEY: (
972 "rwkv.blocks.{bid}.attention.key", # rwkv6
973 "model.layers.{bid}.self_attn.k_proj", # rwkv6qwen2
974 "model.layers.{bid}.attention.key", # rwkv7
975 "model.layers.{bid}.attention.k_proj", # rwkv7
976 ),
977
978 MODEL_TENSOR.TIME_MIX_VALUE: (
979 "rwkv.blocks.{bid}.attention.value", # rwkv6
980 "model.layers.{bid}.self_attn.v_proj", # rwkv6qwen2
981 "model.layers.{bid}.attention.value", # rwkv7
982 "model.layers.{bid}.attention.v_proj", # rwkv7
983 ),
984
985 MODEL_TENSOR.TIME_MIX_RECEPTANCE: (
986 "rwkv.blocks.{bid}.attention.receptance", # rwkv6
987 "model.layers.{bid}.self_attn.q_proj", # rwkv6qwen2
988 "model.layers.{bid}.attention.receptance", # rwkv7
989 "model.layers.{bid}.attention.r_proj", # rwkv7
990 ),
991
992 MODEL_TENSOR.TIME_MIX_GATE: (
993 "rwkv.blocks.{bid}.attention.gate", # rwkv6
994 "model.layers.{bid}.self_attn.gate", # rwkv6qwen2
995 ),
996
997 MODEL_TENSOR.TIME_MIX_LN: (
998 "rwkv.blocks.{bid}.attention.ln_x", # rwkv6
999 "model.layers.{bid}.attention.ln_x" # rwkv7
1000 ),
1001
1002 MODEL_TENSOR.TIME_MIX_OUTPUT: (
1003 "rwkv.blocks.{bid}.attention.output", # rwkv6
1004 "model.layers.{bid}.self_attn.o_proj", # rwkv6qwen2
1005 "model.layers.{bid}.attention.output", # rwkv7
1006 "model.layers.{bid}.attention.o_proj", # rwkv7
1007 ),
1008
1009 MODEL_TENSOR.CHANNEL_MIX_LERP_K: (
1010 "rwkv.blocks.{bid}.feed_forward.time_maa_k", # rwkv6
1011 "model.layers.{bid}.feed_forward.x_k", # rwkv7
1012 ),
1013
1014 MODEL_TENSOR.CHANNEL_MIX_LERP_R: (
1015 "rwkv.blocks.{bid}.feed_forward.time_maa_r", # rwkv6
1016 ),
1017
1018 MODEL_TENSOR.CHANNEL_MIX_KEY: (
1019 "rwkv.blocks.{bid}.feed_forward.key", # rwkv6
1020 "model.layers.{bid}.feed_forward.key", # rwkv7
1021 ),
1022
1023 MODEL_TENSOR.CHANNEL_MIX_RECEPTANCE: (
1024 "rwkv.blocks.{bid}.feed_forward.receptance", # rwkv6
1025 ),
1026
1027 MODEL_TENSOR.CHANNEL_MIX_VALUE: (
1028 "rwkv.blocks.{bid}.feed_forward.value", # rwkv6
1029 "model.layers.{bid}.feed_forward.value", # rwkv7
1030 ),
1031
1032 MODEL_TENSOR.ATTN_Q_A: (
1033 "model.layers.{bid}.self_attn.q_a_proj", # deepseek2
1034 "layers.{bid}.attention.wq_a", # mistral-large
1035 ),
1036
1037 MODEL_TENSOR.ATTN_Q_B: (
1038 "model.layers.{bid}.self_attn.q_b_proj", # deepseek2
1039 "layers.{bid}.attention.wq_b", # mistral-large
1040 ),
1041
1042 MODEL_TENSOR.ATTN_KV_A_MQA: (
1043 "model.layers.{bid}.self_attn.kv_a_proj_with_mqa", # deepseek2
1044 "layers.{bid}.attention.wkv_a_with_mqa", # mistral-large
1045 ),
1046
1047 MODEL_TENSOR.ATTN_KV_B: (
1048 "model.layers.{bid}.self_attn.kv_b_proj", # deepseek2
1049 ),
1050
1051 MODEL_TENSOR.ATTN_K_B: (
1052 "model.layers.{bid}.self_attn.k_b_proj", # deepseek2
1053 "layers.{bid}.attention.k_b_proj", # mistral-large
1054 ),
1055
1056 MODEL_TENSOR.ATTN_V_B: (
1057 "model.layers.{bid}.self_attn.v_b_proj", # deepseek2
1058 "layers.{bid}.attention.v_b_proj", # mistral-large
1059 ),
1060
1061 MODEL_TENSOR.ATTN_Q_A_NORM: (
1062 "model.layers.{bid}.self_attn.q_a_layernorm", # deepseek2
1063 "layers.{bid}.attention.q_a_norm", # mistral-large
1064 ),
1065
1066 MODEL_TENSOR.ATTN_KV_A_NORM: (
1067 "model.layers.{bid}.self_attn.kv_a_layernorm", # deepseek2
1068 "layers.{bid}.attention.kv_a_norm", # mistral-large
1069 ),
1070
1071 MODEL_TENSOR.ATTN_SUB_NORM: (
1072 "model.layers.{bid}.self_attn.inner_attn_ln", # bitnet
1073 ),
1074
1075 MODEL_TENSOR.FFN_SUB_NORM: (
1076 "model.layers.{bid}.mlp.ffn_layernorm", # bitnet
1077 ),
1078
1079 MODEL_TENSOR.DEC_ATTN_NORM: (
1080 "decoder.block.{bid}.layer.0.layer_norm", # t5
1081 ),
1082
1083 MODEL_TENSOR.DEC_ATTN_Q: (
1084 "decoder.block.{bid}.layer.0.SelfAttention.q", # t5
1085 ),
1086
1087 MODEL_TENSOR.DEC_ATTN_K: (
1088 "decoder.block.{bid}.layer.0.SelfAttention.k", # t5
1089 ),
1090
1091 MODEL_TENSOR.DEC_ATTN_V: (
1092 "decoder.block.{bid}.layer.0.SelfAttention.v", # t5
1093 ),
1094
1095 MODEL_TENSOR.DEC_ATTN_OUT: (
1096 "decoder.block.{bid}.layer.0.SelfAttention.o", # t5
1097 ),
1098
1099 MODEL_TENSOR.DEC_ATTN_REL_B: (
1100 "decoder.block.{bid}.layer.0.SelfAttention.relative_attention_bias", # t5
1101 ),
1102
1103 MODEL_TENSOR.DEC_CROSS_ATTN_NORM: (
1104 "decoder.block.{bid}.layer.1.layer_norm", # t5
1105 ),
1106
1107 MODEL_TENSOR.DEC_CROSS_ATTN_Q: (
1108 "decoder.block.{bid}.layer.1.EncDecAttention.q", # t5
1109 ),
1110
1111 MODEL_TENSOR.DEC_CROSS_ATTN_K: (
1112 "decoder.block.{bid}.layer.1.EncDecAttention.k", # t5
1113 ),
1114
1115 MODEL_TENSOR.DEC_CROSS_ATTN_V: (
1116 "decoder.block.{bid}.layer.1.EncDecAttention.v", # t5
1117 ),
1118
1119 MODEL_TENSOR.DEC_CROSS_ATTN_OUT: (
1120 "decoder.block.{bid}.layer.1.EncDecAttention.o", # t5
1121 ),
1122
1123 MODEL_TENSOR.DEC_CROSS_ATTN_REL_B: (
1124 "decoder.block.{bid}.layer.1.EncDecAttention.relative_attention_bias", # t5
1125 ),
1126
1127 MODEL_TENSOR.DEC_FFN_NORM: (
1128 "decoder.block.{bid}.layer.2.layer_norm", # t5
1129 ),
1130
1131 MODEL_TENSOR.DEC_FFN_GATE: (
1132 "decoder.block.{bid}.layer.2.DenseReluDense.wi_0", # flan-t5
1133 ),
1134
1135 MODEL_TENSOR.DEC_FFN_UP: (
1136 "decoder.block.{bid}.layer.2.DenseReluDense.wi", # t5
1137 "decoder.block.{bid}.layer.2.DenseReluDense.wi_1", # flan-t5
1138 ),
1139
1140 MODEL_TENSOR.DEC_FFN_DOWN: (
1141 "decoder.block.{bid}.layer.2.DenseReluDense.wo", # t5
1142 ),
1143
1144 MODEL_TENSOR.DEC_OUTPUT_NORM: (
1145 "decoder.final_layer_norm", # t5
1146 ),
1147
1148 MODEL_TENSOR.ENC_ATTN_NORM: (
1149 "encoder.block.{bid}.layer.0.layer_norm", # t5
1150 ),
1151
1152 MODEL_TENSOR.ENC_ATTN_Q: (
1153 "encoder.block.{bid}.layer.0.SelfAttention.q", # t5
1154 ),
1155
1156 MODEL_TENSOR.ENC_ATTN_K: (
1157 "encoder.block.{bid}.layer.0.SelfAttention.k", # t5
1158 ),
1159
1160 MODEL_TENSOR.ENC_ATTN_V: (
1161 "encoder.block.{bid}.layer.0.SelfAttention.v", # t5
1162 ),
1163
1164 MODEL_TENSOR.ENC_ATTN_OUT: (
1165 "encoder.block.{bid}.layer.0.SelfAttention.o", # t5
1166 ),
1167
1168 MODEL_TENSOR.ENC_ATTN_REL_B: (
1169 "encoder.block.{bid}.layer.0.SelfAttention.relative_attention_bias", # t5
1170 ),
1171
1172 MODEL_TENSOR.ENC_FFN_NORM: (
1173 "encoder.block.{bid}.layer.1.layer_norm", # t5
1174 ),
1175
1176 MODEL_TENSOR.ENC_FFN_GATE: (
1177 "encoder.block.{bid}.layer.1.DenseReluDense.wi_0", # flan-t5
1178 ),
1179
1180 MODEL_TENSOR.ENC_FFN_UP: (
1181 "encoder.block.{bid}.layer.1.DenseReluDense.wi", # t5
1182 "encoder.block.{bid}.layer.1.DenseReluDense.wi_1", # flan-t5
1183 ),
1184
1185 MODEL_TENSOR.ENC_FFN_DOWN: (
1186 "encoder.block.{bid}.layer.1.DenseReluDense.wo", # t5
1187 ),
1188
1189 MODEL_TENSOR.VISEXP_UP: (
1190 "model.layers.{bid}.mlp.vision_mlp.up_proj", # cogvlm
1191 ),
1192
1193 MODEL_TENSOR.VISEXP_GATE: (
1194 "model.layers.{bid}.mlp.vision_mlp.gate_proj", # cogvlm
1195 ),
1196
1197 MODEL_TENSOR.VISEXP_DOWN: (
1198 "model.layers.{bid}.mlp.vision_mlp.down_proj", # cogvlm
1199 ),
1200
1201 MODEL_TENSOR.VISEXP_ATTN_OUT: (
1202 "model.layers.{bid}.self_attn.vision_expert_dense", # cogvlm
1203 ),
1204
1205 MODEL_TENSOR.VISEXP_ATTN_QKV: (
1206 "model.layers.{bid}.self_attn.vision_expert_query_key_value", # cogvlm
1207 ),
1208
1209 ############################################################################
1210 # TODO: these do not belong to block_mappings_cfg - move them to mappings_cfg
1211 MODEL_TENSOR.ENC_OUTPUT_NORM: (
1212 "encoder.final_layer_norm", # t5
1213 "layer_norm", # neobert
1214 ),
1215
1216 MODEL_TENSOR.CLS: (
1217 "classifier", # jina
1218 "classifier.dense", # roberta
1219 "pre_classifier", # distillbert
1220 "dense", # neobert
1221 "head.dense", # modern-bert
1222 ),
1223
1224 MODEL_TENSOR.CLS_OUT: (
1225 "classifier.out_proj", # roberta
1226 ),
1227 #############################################################################
1228
1229 MODEL_TENSOR.CONVNEXT_DW: (
1230 "backbone.convnext.{bid}.dwconv", # wavtokenizer
1231 ),
1232
1233 MODEL_TENSOR.CONVNEXT_NORM: (
1234 "backbone.convnext.{bid}.norm", # wavtokenizer
1235 ),
1236
1237 MODEL_TENSOR.CONVNEXT_PW1: (
1238 "backbone.convnext.{bid}.pwconv1", # wavtokenizer
1239 ),
1240
1241 MODEL_TENSOR.CONVNEXT_PW2: (
1242 "backbone.convnext.{bid}.pwconv2", # wavtokenizer
1243 ),
1244
1245 MODEL_TENSOR.CONVNEXT_GAMMA: (
1246 "backbone.convnext.{bid}.gamma", # wavtokenizer
1247 ),
1248
1249 MODEL_TENSOR.POSNET_CONV1: (
1250 "backbone.posnet.{bid}.conv1", # wavtokenizer
1251 ),
1252
1253 MODEL_TENSOR.POSNET_CONV2: (
1254 "backbone.posnet.{bid}.conv2", # wavtokenizer
1255 ),
1256
1257 MODEL_TENSOR.POSNET_NORM: (
1258 "backbone.posnet.{bid}.norm", # wavtokenizer
1259 ),
1260
1261 MODEL_TENSOR.POSNET_NORM1: (
1262 "backbone.posnet.{bid}.norm1", # wavtokenizer
1263 ),
1264
1265 MODEL_TENSOR.POSNET_NORM2: (
1266 "backbone.posnet.{bid}.norm2", # wavtokenizer
1267 ),
1268
1269 MODEL_TENSOR.POSNET_ATTN_NORM: (
1270 "backbone.posnet.{bid}.norm", # wavtokenizer
1271 ),
1272
1273 MODEL_TENSOR.POSNET_ATTN_Q: (
1274 "backbone.posnet.{bid}.q", # wavtokenizer
1275 ),
1276
1277 MODEL_TENSOR.POSNET_ATTN_K: (
1278 "backbone.posnet.{bid}.k", # wavtokenizer
1279 ),
1280
1281 MODEL_TENSOR.POSNET_ATTN_V: (
1282 "backbone.posnet.{bid}.v", # wavtokenizer
1283 ),
1284
1285 MODEL_TENSOR.POSNET_ATTN_OUT: (
1286 "backbone.posnet.{bid}.proj_out", # wavtokenizer
1287 ),
1288
1289 MODEL_TENSOR.SHORTCONV_CONV: (
1290 "model.layers.{bid}.conv.conv",
1291 ),
1292
1293 MODEL_TENSOR.SHORTCONV_INPROJ: (
1294 "model.layers.{bid}.conv.in_proj",
1295 ),
1296
1297 MODEL_TENSOR.SHORTCONV_OUTPROJ: (
1298 "model.layers.{bid}.conv.out_proj",
1299 ),
1300
1301 #############################################################################
1302 ## Vision encoder
1303
1304 MODEL_TENSOR.V_MMPROJ: (
1305 "multi_modal_projector.linear_{bid}",
1306 "mm_projector.proj.linear_{bid}", # Kimi-K2.5
1307 "visual.merger.mlp.{bid}", # qwen2vl
1308 "merger.mlp.{bid}",
1309 ),
1310
1311 MODEL_TENSOR.V_MMPROJ_FC: (
1312 "model.connector.modality_projection.proj", # SmolVLM
1313 "model.vision.linear_proj.linear_proj", # cogvlm
1314 "visual.merger.proj", # glm4v
1315 ),
1316
1317 MODEL_TENSOR.V_MMPROJ_MLP: (
1318 "model.mm_projector.mlp.mlp.{bid}",
1319 "vision_model.vision_adapter.mlp.fc{bid}", # llama 4
1320 "mlp1.{bid}", # InternVL
1321 "model.aligner.fc1.hidden_layers.{bid}", # Janus Pro
1322 ),
1323
1324 MODEL_TENSOR.V_MMPROJ_PEG: (
1325 "model.mm_projector.peg.peg.{bid}",
1326 ),
1327
1328 MODEL_TENSOR.V_ENC_EMBD_CLS: (
1329 "vision_tower.vision_model.embeddings.class_embedding",
1330 "model.vision_tower.embeddings.cls_token", # Intern-S1
1331 "vision_model.class_embedding", # llama 4
1332 "model.vision.patch_embedding.cls_embedding", # cogvlm
1333 ),
1334
1335 MODEL_TENSOR.V_ENC_EMBD_PATCH: (
1336 "vision_tower.vision_model.embeddings.patch_embedding",
1337 "model.vision_tower.embeddings.patch_embeddings.projection", # Intern-S1
1338 "vpm.embeddings.patch_embedding",
1339 "model.vision_model.embeddings.patch_embedding", # SmolVLM
1340 "vision_tower.patch_conv", # pixtral-hf
1341 "vision_encoder.patch_conv", # pixtral
1342 "vision_model.patch_embedding.linear", # llama 4
1343 "visual.patch_embed.proj", # qwen2vl
1344 "vision_tower.patch_embed.proj", # kimi-vl
1345 "model.vision.patch_embedding.proj", # cogvlm
1346 "siglip2.vision_model.embeddings.patch_embedding",
1347 ),
1348
1349 MODEL_TENSOR.V_ENC_EMBD_NORM: (
1350 "visual.post_conv_layernorm", # glm4v
1351 ),
1352
1353 MODEL_TENSOR.V_ENC_EMBD_POS: (
1354 "vision_tower.vision_model.embeddings.position_embedding",
1355 "model.vision_tower.embeddings.position_embeddings", # Intern-S1
1356 "vpm.embeddings.position_embedding",
1357 "model.vision_model.embeddings.position_embedding", # SmolVLM
1358 "vision_model.positional_embedding_vlm", # llama 4
1359 "vision_tower.patch_embed.pos_emb", # kimi-vl
1360 "visual.pos_embed", # qwen3vl
1361 "model.vision.patch_embedding.position_embedding", # cogvlm
1362 "visual.embeddings.position_embedding", # glm4v
1363 ),
1364
1365 MODEL_TENSOR.V_ENC_ATTN_QKV: (
1366 "visual.blocks.{bid}.attn.qkv", # qwen3vl
1367 "model.vision.transformer.layers.{bid}.attention.query_key_value", # cogvlm
1368 "vision_tower.encoder.blocks.{bid}.wqkv" # Kimi-K2.5
1369 ),
1370
1371 MODEL_TENSOR.V_ENC_ATTN_Q: (
1372 "vision_tower.vision_model.encoder.layers.{bid}.self_attn.q_proj",
1373 "model.vision_tower.encoder.layer.{bid}.attention.q_proj", # Intern-S1
1374 "vpm.encoder.layers.{bid}.self_attn.q_proj",
1375 "model.vision_model.encoder.layers.{bid}.self_attn.q_proj", # SmolVLM
1376 "vision_model.model.layers.{bid}.self_attn.q_proj", # llama4
1377 "vision_tower.transformer.layers.{bid}.attention.q_proj", # pixtral-hf
1378 "vision_encoder.transformer.layers.{bid}.attention.wq", # pixtral
1379 "visual.blocks.{bid}.attn.q", # qwen2vl, generated
1380 "vision_tower.encoder.blocks.{bid}.wq", # kimi-vl, generated
1381 "siglip2.vision_model.encoder.layers.{bid}.self_attn.q_proj", # youtuvl
1382 ),
1383
1384 MODEL_TENSOR.V_ENC_ATTN_Q_NORM: (
1385 "vision_tower.vision_model.encoder.layers.{bid}.attn.q_norm", # InternVL
1386 "model.vision_tower.encoder.layer.{bid}.attention.q_norm", # Intern-S1
1387 ),
1388
1389 MODEL_TENSOR.V_ENC_ATTN_K: (
1390 "vision_tower.vision_model.encoder.layers.{bid}.self_attn.k_proj",
1391 "model.vision_tower.encoder.layer.{bid}.attention.k_proj", # Intern-S1
1392 "vpm.encoder.layers.{bid}.self_attn.k_proj",
1393 "model.vision_model.encoder.layers.{bid}.self_attn.k_proj", # SmolVLM
1394 "vision_model.model.layers.{bid}.self_attn.k_proj", # llama4
1395 "vision_tower.transformer.layers.{bid}.attention.k_proj", # pixtral-hf
1396 "vision_encoder.transformer.layers.{bid}.attention.wk", # pixtral
1397 "visual.blocks.{bid}.attn.k", # qwen2vl, generated
1398 "vision_tower.encoder.blocks.{bid}.wk", # kimi-vl, generated
1399 "siglip2.vision_model.encoder.layers.{bid}.self_attn.k_proj",
1400 ),
1401
1402 MODEL_TENSOR.V_ENC_ATTN_K_NORM: (
1403 "vision_tower.vision_model.encoder.layers.{bid}.attn.k_norm", # InternVL
1404 "model.vision_tower.encoder.layer.{bid}.attention.k_norm", # Intern-S1
1405 ),
1406
1407 MODEL_TENSOR.V_ENC_ATTN_V: (
1408 "vision_tower.vision_model.encoder.layers.{bid}.self_attn.v_proj",
1409 "model.vision_tower.encoder.layer.{bid}.attention.v_proj", # Intern-S1
1410 "vpm.encoder.layers.{bid}.self_attn.v_proj",
1411 "model.vision_model.encoder.layers.{bid}.self_attn.v_proj", # SmolVLM
1412 "vision_model.model.layers.{bid}.self_attn.v_proj", # llama4
1413 "vision_tower.transformer.layers.{bid}.attention.v_proj", # pixtral-hf
1414 "vision_encoder.transformer.layers.{bid}.attention.wv", # pixtral
1415 "visual.blocks.{bid}.attn.v", # qwen2vl, generated
1416 "vision_tower.encoder.blocks.{bid}.wv", # kimi-vl, generated
1417 "siglip2.vision_model.encoder.layers.{bid}.self_attn.v_proj",
1418 ),
1419
1420 MODEL_TENSOR.V_ENC_INPUT_NORM: (
1421 "vision_tower.vision_model.encoder.layers.{bid}.layer_norm1",
1422 "vision_tower.vision_model.encoder.layers.{bid}.norm1", # InternVL
1423 "model.vision_tower.encoder.layer.{bid}.layernorm_before", # Intern-S1
1424 "vpm.encoder.layers.{bid}.layer_norm1",
1425 "model.vision_model.encoder.layers.{bid}.layer_norm1", # SmolVLM
1426 "vision_tower.transformer.layers.{bid}.attention_norm", # pixtral-hf
1427 "vision_encoder.transformer.layers.{bid}.attention_norm", # pixtral
1428 "vision_model.model.layers.{bid}.input_layernorm", # llama4
1429 "visual.blocks.{bid}.norm1", # qwen2vl
1430 "vision_tower.encoder.blocks.{bid}.norm0", # kimi-vl (norm0/norm1)
1431 "model.vision.transformer.layers.{bid}.input_layernorm", # cogvlm
1432 "siglip2.vision_model.encoder.layers.{bid}.layer_norm1",
1433 ),
1434
1435 MODEL_TENSOR.V_ENC_ATTN_O: (
1436 "vision_tower.vision_model.encoder.layers.{bid}.self_attn.out_proj",
1437 "vision_tower.vision_model.encoder.layers.{bid}.attn.proj", # InternVL
1438 "model.vision_tower.encoder.layer.{bid}.attention.projection_layer", # Intern-S1
1439 "vpm.encoder.layers.{bid}.self_attn.out_proj",
1440 "model.vision_model.encoder.layers.{bid}.self_attn.out_proj", # SmolVLM
1441 "model.vision_model.encoder.layers.{bid}.self_attn.projection_layer", # Janus Pro
1442 "vision_model.model.layers.{bid}.self_attn.o_proj", # llama4
1443 "vision_tower.transformer.layers.{bid}.attention.o_proj", # pixtral-hf
1444 "vision_encoder.transformer.layers.{bid}.attention.wo", # pixtral
1445 "visual.blocks.{bid}.attn.proj", # qwen2vl
1446 "vision_tower.encoder.blocks.{bid}.wo", # kimi-vl
1447 "model.vision.transformer.layers.{bid}.attention.dense", # cogvlm
1448 "siglip2.vision_model.encoder.layers.{bid}.self_attn.out_proj", # youtuvl
1449 ),
1450
1451 MODEL_TENSOR.V_ENC_POST_ATTN_NORM: (
1452 "vision_tower.vision_model.encoder.layers.{bid}.layer_norm2",
1453 "vision_tower.vision_model.encoder.layers.{bid}.norm2", # InternVL
1454 "model.vision_tower.encoder.layer.{bid}.layernorm_after", # Intern-S1
1455 "vpm.encoder.layers.{bid}.layer_norm2",
1456 "model.vision_model.encoder.layers.{bid}.layer_norm2", # SmolVLM
1457 "vision_model.model.layers.{bid}.post_attention_layernorm", # llama4
1458 "vision_tower.transformer.layers.{bid}.ffn_norm", # pixtral-hf
1459 "vision_encoder.transformer.layers.{bid}.ffn_norm", # pixtral
1460 "visual.blocks.{bid}.norm2", # qwen2vl
1461 "vision_tower.encoder.blocks.{bid}.norm1", # kimi-vl (norm0/norm1)
1462 "model.vision.transformer.layers.{bid}.post_attention_layernorm", # cogvlm
1463 "siglip2.vision_model.encoder.layers.{bid}.layer_norm2",
1464 ),
1465
1466 MODEL_TENSOR.V_ENC_FFN_UP: (
1467 "vision_tower.vision_model.encoder.layers.{bid}.mlp.fc1",
1468 "model.vision_tower.encoder.layer.{bid}.mlp.fc1", # Intern-S1
1469 "vpm.encoder.layers.{bid}.mlp.fc1",
1470 "model.vision_model.encoder.layers.{bid}.mlp.fc1", # SmolVLM, gemma3
1471 "vision_tower.transformer.layers.{bid}.feed_forward.up_proj", # pixtral-hf
1472 "vision_encoder.transformer.layers.{bid}.feed_forward.w3", # pixtral
1473 "vision_model.model.layers.{bid}.mlp.fc1", # llama4
1474 "visual.blocks.{bid}.mlp.fc1", # qwen2vl
1475 "visual.blocks.{bid}.mlp.up_proj", # qwen2.5vl
1476 "visual.blocks.{bid}.mlp.linear_fc1", # qwen3vl
1477 "vision_tower.encoder.blocks.{bid}.mlp.fc0", # kimi-vl (fc0/fc1)
1478 "model.vision.transformer.layers.{bid}.mlp.fc1", # cogvlm
1479 "siglip2.vision_model.encoder.layers.{bid}.mlp.fc1",
1480 ),
1481
1482 MODEL_TENSOR.V_ENC_FFN_GATE: (
1483 "vision_tower.transformer.layers.{bid}.feed_forward.gate_proj", # pixtral-hf
1484 "vision_encoder.transformer.layers.{bid}.feed_forward.w1", # pixtral
1485 "visual.blocks.{bid}.mlp.gate_proj", # qwen2.5vl
1486 ),
1487
1488 MODEL_TENSOR.V_ENC_FFN_DOWN: (
1489 "vision_tower.vision_model.encoder.layers.{bid}.mlp.fc2",
1490 "model.vision_tower.encoder.layer.{bid}.mlp.fc2", # Intern-S1
1491 "vpm.encoder.layers.{bid}.mlp.fc2",
1492 "model.vision_model.encoder.layers.{bid}.mlp.fc2", # SmolVLM, gemma3
1493 "vision_tower.transformer.layers.{bid}.feed_forward.down_proj", # pixtral-hf
1494 "vision_encoder.transformer.layers.{bid}.feed_forward.w2", # pixtral
1495 "vision_model.model.layers.{bid}.mlp.fc2", # llama4
1496 "visual.blocks.{bid}.mlp.fc2", # qwen2vl
1497 "visual.blocks.{bid}.mlp.down_proj", # qwen2.5vl
1498 "visual.blocks.{bid}.mlp.linear_fc2", # qwen3vl
1499 "vision_tower.encoder.blocks.{bid}.mlp.fc1", # kimi-vl (fc0/fc1)
1500 "model.vision.transformer.layers.{bid}.mlp.fc2", # cogvlm
1501 "siglip2.vision_model.encoder.layers.{bid}.mlp.fc2",
1502 ),
1503
1504 MODEL_TENSOR.V_LAYER_SCALE_1: (
1505 "vision_tower.vision_model.encoder.layers.{bid}.ls1", # InternVL
1506 "model.vision_tower.encoder.layer.{bid}.lambda_1", # Intern-S1
1507 ),
1508
1509 MODEL_TENSOR.V_LAYER_SCALE_2: (
1510 "vision_tower.vision_model.encoder.layers.{bid}.ls2", # InternVL
1511 "model.vision_tower.encoder.layer.{bid}.lambda_2", # Intern-S1
1512 ),
1513
1514 MODEL_TENSOR.V_PRE_NORM: (
1515 "vision_tower.vision_model.pre_layrnorm",
1516 "vision_tower.ln_pre", # pixtral-hf
1517 "vision_encoder.ln_pre", # pixtral
1518 "vision_model.layernorm_pre", # llama4
1519 ),
1520
1521 MODEL_TENSOR.V_POST_NORM: (
1522 "vision_tower.vision_model.post_layernorm",
1523 "model.vision_model.post_layernorm", # SmolVLM
1524 "vision_model.layernorm_post", # llama4
1525 "visual.merger.ln_q", # qwen2vl
1526 "vision_tower.encoder.final_layernorm", # kimi-vl
1527 "visual.post_layernorm", # glm4v
1528 "siglip2.vision_model.post_layernorm",
1529 ),
1530
1531 MODEL_TENSOR.V_MM_POST_NORM: (
1532 "visual.merger.post_projection_norm", # glm4v
1533 ),
1534
1535 MODEL_TENSOR.V_MM_INP_PROJ: (
1536 "multi_modal_projector.mm_input_projection",
1537 ),
1538
1539 MODEL_TENSOR.V_MM_INP_NORM: (
1540 "multi_modal_projector.norm",
1541 "multi_modal_projector.layer_norm",
1542 "multi_modal_projector.pre_norm",
1543 "mm_projector.pre_norm", # Kimi-K2.5
1544 "pre_mm_projector_norm",
1545 "model.vision.linear_proj.norm1", # cogvlm
1546 "merger.ln_q",
1547 ),
1548
1549 MODEL_TENSOR.V_MM_SOFT_EMB_NORM: (
1550 "multi_modal_projector.mm_soft_emb_norm",
1551 ),
1552
1553 MODEL_TENSOR.V_RESMPL_POS_EMBD_K: (
1554 "resampler.pos_embed_k",
1555 ),
1556
1557 MODEL_TENSOR.V_RESMPL_ATTN_Q: (
1558 "resampler.attn.in_proj_q", # tensor generated from resampler.attn.in_proj
1559 ),
1560
1561 MODEL_TENSOR.V_RESMPL_ATTN_K: (
1562 "resampler.attn.in_proj_k", # tensor generated from resampler.attn.in_proj
1563 ),
1564
1565 MODEL_TENSOR.V_RESMPL_ATTN_V: (
1566 "resampler.attn.in_proj_v", # tensor generated from resampler.attn.in_proj
1567 ),
1568
1569 MODEL_TENSOR.V_RESMPL_ATTN_OUT: (
1570 "resampler.attn.out_proj",
1571 ),
1572
1573 MODEL_TENSOR.V_RESMPL_KV: (
1574 "resampler.kv_proj",
1575 ),
1576
1577 MODEL_TENSOR.V_RESMPL_POST_NORM: (
1578 "resampler.ln_post",
1579 ),
1580
1581 MODEL_TENSOR.V_RESMPL_KV_NORM: (
1582 "resampler.ln_kv",
1583 ),
1584
1585 MODEL_TENSOR.V_RESMPL_Q_NORM: (
1586 "resampler.ln_q",
1587 ),
1588
1589 MODEL_TENSOR.V_RESMPL_PROJ: (
1590 "resampler.proj",
1591 ),
1592
1593 MODEL_TENSOR.V_RESMPL_QUERY: (
1594 "resampler.query",
1595 ),
1596
1597 MODEL_TENSOR.V_TOK_EMBD_IMG_BREAK: (
1598 "v.token_embd.img_break", # for pixtral, this is a generated vector
1599 ),
1600
1601 MODEL_TENSOR.V_MM_PATCH_MERGER: (
1602 "multi_modal_projector.patch_merger.merging_layer", # mistral small 3.1 - hf
1603 "patch_merger.merging_layer", # mistral
1604 "visual.downsample", # glm4v
1605 ),
1606
1607 MODEL_TENSOR.V_DS_NORM: (
1608 "model.visual.deepstack_merger_list.{bid}.norm", # deepstack in qwen3vl
1609 ),
1610
1611 MODEL_TENSOR.V_DS_FC1: (
1612 "model.visual.deepstack_merger_list.{bid}.linear_fc1", # deepstack in qwen3vl
1613 ),
1614
1615 MODEL_TENSOR.V_DS_FC2: (
1616 "model.visual.deepstack_merger_list.{bid}.linear_fc2", # deepstack in qwen3vl
1617 ),
1618
1619 MODEL_TENSOR.V_MM_POST_FC_NORM: (
1620 "model.vision.linear_proj.norm1", # cogvlm
1621 ),
1622
1623 MODEL_TENSOR.V_MM_UP: (
1624 "model.vision.linear_proj.dense_h_to_4h", # cogvlm
1625 "visual.merger.up_proj", # glm4v
1626 ),
1627
1628 MODEL_TENSOR.V_MM_DOWN: (
1629 "model.vision.linear_proj.dense_4h_to_h", # cogvlm
1630 "visual.merger.down_proj", # glm4v
1631 ),
1632
1633 MODEL_TENSOR.V_MM_GATE: (
1634 "model.vision.linear_proj.gate_proj", # cogvlm
1635 "visual.merger.gate_proj", # glm4v
1636 ),
1637
1638 MODEL_TENSOR.V_TOK_BOI: (
1639 "model.vision.boi", # cogvlm
1640 ),
1641
1642 MODEL_TENSOR.V_TOK_EOI: (
1643 "model.vision.eoi", # cogvlm
1644 ),
1645
1646 # audio (mtmd)
1647
1648 MODEL_TENSOR.A_ENC_EMBD_POS: (
1649 "audio_tower.embed_positions", # ultravox
1650 "audio_embedding.embedding", # lfm2
1651 ),
1652
1653 MODEL_TENSOR.A_ENC_EMBD_NORM: (
1654 "audio_embedding.embedding_norm", # lfm2
1655 ),
1656
1657 MODEL_TENSOR.A_ENC_EMBD_TO_LOGITS: (
1658 "audio_embedding.to_logits", # lfm2
1659 ),
1660
1661 MODEL_TENSOR.A_ENC_CONV1D: (
1662 "audio_tower.conv{bid}", # ultravox
1663 "conformer.pre_encode.conv.{bid}", # lfm2
1664 "model.audio_tower.subsample_conv_projection.conv_{bid}.conv", # gemma3n
1665 ),
1666
1667 MODEL_TENSOR.A_ENC_CONV1D_NORM: (
1668 "model.audio_tower.subsample_conv_projection.conv_{bid}.norm", # gemma3n
1669 ),
1670
1671 MODEL_TENSOR.A_PRE_NORM: (),
1672
1673 MODEL_TENSOR.A_POST_NORM: (
1674 "audio_tower.layer_norm", # ultravox
1675 "audio_tower.ln_post", # qwen2omni
1676 ),
1677
1678 MODEL_TENSOR.A_ENC_ATTN_Q: (
1679 "audio_tower.layers.{bid}.self_attn.q_proj", # ultravox
1680 "conformer.layers.{bid}.self_attn.linear_q", # lfm2
1681 "conformer.layers.{bid}.attention.attn.q_proj", # gemma3n
1682 ),
1683
1684 MODEL_TENSOR.A_ENC_ATTN_K: (
1685 "audio_tower.layers.{bid}.self_attn.k_proj", # ultravox
1686 "conformer.layers.{bid}.self_attn.linear_k", # lfm2
1687 "conformer.layers.{bid}.attention.attn.k_proj", # gemma3n
1688 ),
1689
1690 MODEL_TENSOR.A_ENC_ATTN_V: (
1691 "audio_tower.layers.{bid}.self_attn.v_proj", # ultravox
1692 "conformer.layers.{bid}.self_attn.linear_v", # lfm2
1693 "conformer.layers.{bid}.attention.attn.v_proj", # gemma3n
1694 ),
1695
1696 MODEL_TENSOR.A_ENC_PER_DIM_SCALE: (
1697 "conformer.layers.{bid}.attention.attn.per_dim_scale", # gemma3n
1698 ),
1699
1700 MODEL_TENSOR.A_ENC_LAYER_PRE_NORM: (
1701 "conformer.layers.{bid}.norm", # gemma3n
1702 ),
1703
1704 MODEL_TENSOR.A_ENC_INPUT_NORM: (
1705 "audio_tower.layers.{bid}.self_attn_layer_norm", # ultravox
1706 "conformer.layers.{bid}.norm_self_att", # lfm2
1707 "conformer.layers.{bid}.attention.pre_attn_norm", # gemma3n
1708 ),
1709
1710 MODEL_TENSOR.A_ENC_OUTPUT: (
1711 "audio_tower.layers.{bid}.self_attn.out_proj", # ultravox
1712 "conformer.layers.{bid}.self_attn.linear_out", # lfm2
1713 "conformer.layers.{bid}.attention.post", # gemma3n
1714 ),
1715
1716 MODEL_TENSOR.A_ENC_OUTPUT_NORM: (
1717 "audio_tower.layers.{bid}.final_layer_norm", # ultravox
1718 "conformer.layers.{bid}.norm_out", # lfm2
1719 "conformer.layers.{bid}.attention.post_norm", # gemma3n
1720 ),
1721
1722 MODEL_TENSOR.A_ENC_FFN_NORM: (
1723 "conformer.layers.{bid}.norm_feed_forward1", # lfm2
1724 "conformer.layers.{bid}.ffw_layer_start.pre_layer_norm", # gemma3n
1725 ),
1726
1727 MODEL_TENSOR.A_ENC_FFN_POST_NORM: (
1728 "conformer.layers.{bid}.ffw_layer_start.post_layer_norm", # gemma3n
1729 ),
1730
1731 MODEL_TENSOR.A_ENC_FFN_SCALE: (
1732 "conformer.layers.{bid}.ffw_layer_start.post_layer_scale", # gemma3n
1733 ),
1734
1735 MODEL_TENSOR.A_ENC_FFN_UP: (
1736 "audio_tower.layers.{bid}.fc1", # ultravox
1737 "conformer.layers.{bid}.feed_forward1.linear1", # lfm2
1738 "conformer.layers.{bid}.ffw_layer_start.ffw_layer_1", # gemma3n
1739 ),
1740
1741 MODEL_TENSOR.A_ENC_FFN_GATE: (),
1742
1743 MODEL_TENSOR.A_ENC_FFN_DOWN: (
1744 "audio_tower.layers.{bid}.fc2", # ultravox
1745 "conformer.layers.{bid}.feed_forward1.linear2", # lfm2
1746 "conformer.layers.{bid}.ffw_layer_start.ffw_layer_2", # gemma3n
1747 ),
1748
1749 MODEL_TENSOR.A_ENC_FFN_UP_1: (
1750 "conformer.layers.{bid}.feed_forward2.linear1", # lfm2
1751 "conformer.layers.{bid}.ffw_layer_end.ffw_layer_1", # gemma3n
1752 ),
1753
1754 MODEL_TENSOR.A_ENC_FFN_DOWN_1: (
1755 "conformer.layers.{bid}.feed_forward2.linear2", # lfm2
1756 "conformer.layers.{bid}.ffw_layer_end.ffw_layer_2", # gemma3n
1757 ),
1758
1759 MODEL_TENSOR.A_ENC_FFN_NORM_1: (
1760 "conformer.layers.{bid}.norm_feed_forward2", # lfm2
1761 "conformer.layers.{bid}.ffw_layer_end.pre_layer_norm", # gemma3n
1762 ),
1763
1764 MODEL_TENSOR.A_ENC_FFN_POST_NORM_1: (
1765 "conformer.layers.{bid}.ffw_layer_end.post_layer_norm", # gemma3n
1766 ),
1767
1768 MODEL_TENSOR.A_ENC_FFN_SCALE_1: (
1769 "conformer.layers.{bid}.ffw_layer_end.post_layer_scale", # gemma3n
1770 ),
1771
1772 MODEL_TENSOR.A_ENC_LINEAR_POS: (
1773 "conformer.layers.{bid}.self_attn.linear_pos", # lfm2
1774 "conformer.layers.{bid}.attention.attn.relative_position_embedding.pos_proj", # gemma3n
1775 ),
1776
1777 MODEL_TENSOR.A_ENC_POS_BIAS_U: (
1778 "conformer.layers.{bid}.self_attn.pos_bias_u", # lfm2
1779 ),
1780
1781 MODEL_TENSOR.A_ENC_POS_BIAS_V: (
1782 "conformer.layers.{bid}.self_attn.pos_bias_v", # lfm2
1783 ),
1784
1785 MODEL_TENSOR.A_ENC_OUT: (
1786 "conformer.pre_encode.out", # lfm2
1787 "model.audio_tower.subsample_conv_projection.input_proj_linear", # gemma3n
1788 ),
1789
1790 # note: some tensors below has "audio." pseudo-prefix, to prevent conflicts with vision tensors
1791 # this prefix is added in the conversion code in modify_tensors()
1792
1793 MODEL_TENSOR.A_MMPROJ: (
1794 "audio.multi_modal_projector.linear_{bid}", # ultravox
1795 "audio_adapter.model.{bid}" # lfm2
1796 ),
1797
1798 MODEL_TENSOR.A_MMPROJ_FC: (
1799 "audio.multi_modal_projector.linear", # qwen2audio
1800 "audio_tower.proj", # qwen2omni
1801 ),
1802
1803 MODEL_TENSOR.A_MM_NORM_PRE: (
1804 "audio.multi_modal_projector.ln_pre", # ultravox
1805 ),
1806
1807 MODEL_TENSOR.A_MM_NORM_MID: (
1808 "audio.multi_modal_projector.ln_mid", # ultravox
1809 ),
1810
1811 MODEL_TENSOR.A_ENC_CONV_DW: (
1812 "conformer.layers.{bid}.conv.depthwise_conv", # lfm2
1813 "conformer.layers.{bid}.lconv1d.depthwise_conv1d", # gemma3n
1814 ),
1815
1816 MODEL_TENSOR.A_ENC_CONV_NORM: (
1817 "conformer.layers.{bid}.conv.batch_norm", # lfm2
1818 "conformer.layers.{bid}.lconv1d.pre_layer_norm", # gemma3n
1819 ),
1820
1821 MODEL_TENSOR.A_ENC_CONV_PW1: (
1822 "conformer.layers.{bid}.conv.pointwise_conv1", # lfm2
1823 "conformer.layers.{bid}.lconv1d.linear_start", # gemma3n
1824 ),
1825
1826 MODEL_TENSOR.A_ENC_CONV_PW2: (
1827 "conformer.layers.{bid}.conv.pointwise_conv2", # lfm2
1828 "conformer.layers.{bid}.lconv1d.linear_end", # gemma3n
1829 ),
1830
1831 MODEL_TENSOR.A_ENC_NORM_CONV: (
1832 "conformer.layers.{bid}.norm_conv", # lfm2
1833 "conformer.layers.{bid}.lconv1d.conv_norm", # gemma3n
1834 ),
1835
1836 MODEL_TENSOR.A_MM_EMBEDDING: (
1837 "model.embed_audio.embedding", # gemma3n
1838 ),
1839 MODEL_TENSOR.A_MM_HARD_EMB_NORM: (
1840 "model.embed_audio.hard_embedding_norm", # gemma3n
1841 ),
1842 MODEL_TENSOR.A_MM_INP_PROJ: (
1843 "model.embed_audio.embedding_projection", # gemma3n
1844 ),
1845 MODEL_TENSOR.A_MM_SOFT_EMB_NORM: (
1846 "model.embed_audio.soft_embedding_norm", # gemma3n
1847 ),
1848
1849 # NextN/MTP tensors
1850 MODEL_TENSOR.NEXTN_EH_PROJ: (
1851 "model.layers.{bid}.eh_proj",
1852 ),
1853
1854 MODEL_TENSOR.NEXTN_EMBED_TOKENS: (
1855 "model.layers.{bid}.embed_tokens",
1856 ),
1857
1858 MODEL_TENSOR.NEXTN_ENORM: (
1859 "model.layers.{bid}.enorm",
1860 ),
1861
1862 MODEL_TENSOR.NEXTN_HNORM: (
1863 "model.layers.{bid}.hnorm",
1864 ),
1865
1866 MODEL_TENSOR.NEXTN_SHARED_HEAD_HEAD: (
1867 "model.layers.{bid}.shared_head.head",
1868 ),
1869
1870 MODEL_TENSOR.NEXTN_SHARED_HEAD_NORM: (
1871 "model.layers.{bid}.shared_head.norm",
1872 ),
1873 }
1874
1875 # architecture-specific block mappings
1876 arch_block_mappings_cfg: dict[MODEL_ARCH, dict[MODEL_TENSOR, tuple[str, ...]]] = {
1877 MODEL_ARCH.ARCTIC: {
1878 MODEL_TENSOR.FFN_NORM: (
1879 "model.layers.{bid}.residual_layernorm",
1880 ),
1881 MODEL_TENSOR.FFN_NORM_EXP: (
1882 "model.layers.{bid}.post_attention_layernorm",
1883 ),
1884 },
1885 }
1886
1887 mapping: dict[str, tuple[MODEL_TENSOR, str]]
1888
1889 def __init__(self, arch: MODEL_ARCH, n_blocks: int):
1890 self.mapping = {}
1891 for tensor, keys in self.mappings_cfg.items():
1892 if tensor not in MODEL_TENSORS[arch]:
1893 continue
1894 tensor_name = TENSOR_NAMES[tensor]
1895 self.mapping[tensor_name] = (tensor, tensor_name)
1896 for key in keys:
1897 self.mapping[key] = (tensor, tensor_name)
1898 if arch in self.arch_block_mappings_cfg:
1899 self.block_mappings_cfg.update(self.arch_block_mappings_cfg[arch])
1900 for bid in range(n_blocks):
1901 for tensor, keys in self.block_mappings_cfg.items():
1902 if tensor not in MODEL_TENSORS[arch]:
1903 continue
1904
1905 tensor_name = TENSOR_NAMES[tensor].format(bid = bid)
1906 self.mapping[tensor_name] = (tensor, tensor_name)
1907 for key in keys:
1908 key = key.format(bid = bid)
1909 self.mapping[key] = (tensor, tensor_name)
1910
1911 def get_type_and_name(self, key: str, try_suffixes: Sequence[str] = ()) -> tuple[MODEL_TENSOR, str] | None:
1912 result = self.mapping.get(key)
1913 if result is not None:
1914 return result
1915 for suffix in try_suffixes:
1916 if key.endswith(suffix):
1917 result = self.mapping.get(key[:-len(suffix)])
1918 if result is not None:
1919 return result[0], result[1] + suffix
1920 return None
1921
1922 def get_name(self, key: str, try_suffixes: Sequence[str] = ()) -> str | None:
1923 result = self.get_type_and_name(key, try_suffixes = try_suffixes)
1924 if result is None:
1925 return None
1926 return result[1]
1927
1928 def get_type(self, key: str, try_suffixes: Sequence[str] = ()) -> MODEL_TENSOR | None:
1929 result = self.get_type_and_name(key, try_suffixes = try_suffixes)
1930 if result is None:
1931 return None
1932 return result[0]
1933
1934 def __getitem__(self, key: str) -> str:
1935 try:
1936 return self.mapping[key][1]
1937 except KeyError:
1938 raise KeyError(key)
1939
1940 def __contains__(self, key: str) -> bool:
1941 return key in self.mapping
1942
1943 def __repr__(self) -> str:
1944 return repr(self.mapping)
1945
1946
1947def get_tensor_name_map(arch: MODEL_ARCH, n_blocks: int) -> TensorNameMap:
1948 return TensorNameMap(arch, n_blocks)