1from __future__ import annotations
   2
   3from typing import Sequence
   4
   5from .constants import MODEL_ARCH, MODEL_TENSOR, MODEL_TENSORS, TENSOR_NAMES
   6
   7
   8class TensorNameMap:
   9    mappings_cfg: dict[MODEL_TENSOR, tuple[str, ...]] = {
  10        # Token embeddings
  11        MODEL_TENSOR.TOKEN_EMBD: (
  12            "gpt_neox.embed_in",                         # gptneox
  13            "transformer.wte",                           # gpt2 gpt-j mpt refact qwen dbrx jais exaone
  14            "transformer.word_embeddings",               # falcon
  15            "word_embeddings",                           # bloom
  16            "model.embed_tokens",                        # llama-hf nemotron olmoe olmo2 rwkv6qwen2 glm4-0414 plamo2 granite-hybrid
  17            "embed_tokens",                              # embeddinggemma
  18            "tok_embeddings",                            # llama-pth
  19            "embeddings.word_embeddings",                # bert nomic-bert
  20            "embeddings.tok_embeddings",                 # modern-bert
  21            "language_model.embedding.word_embeddings",  # persimmon
  22            "wte",                                       # gpt2
  23            "transformer.embd.wte",                      # phi2
  24            "model.tok_embeddings",                      # internlm2
  25            "model.embedding",                           # mamba-qbert
  26            "backbone.embedding",                        # mamba
  27            "backbone.embeddings",                       # mamba-hf
  28            "transformer.in_out_embed",                  # Grok
  29            "embedding.word_embeddings",                 # chatglm
  30            "transformer.token_embeddings",              # openelm
  31            "shared",                                    # t5
  32            "rwkv.embeddings",                           # rwkv6
  33            "model.embeddings",                          # rwkv7
  34            "model.word_embeddings",                     # bailingmoe
  35            "language_model.model.embed_tokens",         # llama4
  36            "encoder",                                   # neobert
  37            "model.transformer.wte",                     # llada
  38            "embed_tokens",                              # qwen3-embedding
  39        ),
  40
  41        # Token type embeddings
  42        MODEL_TENSOR.TOKEN_TYPES: (
  43            "embeddings.token_type_embeddings",  # bert nomic-bert
  44        ),
  45
  46        # Normalization of token embeddings
  47        MODEL_TENSOR.TOKEN_EMBD_NORM: (
  48            "word_embeddings_layernorm",  # bloom
  49            "embeddings.LayerNorm",       # bert
  50            "embeddings.norm",            # modern-bert
  51            "emb_ln",                     # nomic-bert
  52            "transformer.norm",           # openelm
  53            "rwkv.blocks.0.pre_ln",       # rwkv
  54            "rwkv.blocks.0.pre_ln",       # rwkv6
  55            "model.pre_ln",               # rwkv7
  56            "model.layers.0.pre_norm",    # rwkv7
  57            "backbone.norm",              # wavtokenizer
  58            "model.embedding_norm",       # lfm2
  59        ),
  60
  61        # Position embeddings
  62        MODEL_TENSOR.POS_EMBD: (
  63            "transformer.wpe",                 # gpt2
  64            "embeddings.position_embeddings",  # bert
  65            "wpe",                             # gpt2
  66        ),
  67
  68        # Output
  69        MODEL_TENSOR.OUTPUT: (
  70            "embed_out",                 # gptneox
  71            "lm_head",                   # gpt2 mpt falcon llama-hf baichuan qwen mamba dbrx jais nemotron exaone olmoe olmo2 phimoe plamo2
  72            "output",                    # llama-pth bloom internlm2
  73            "word_embeddings_for_head",  # persimmon
  74            "lm_head.linear",            # phi2
  75            "output_layer",              # chatglm
  76            "head",                      # rwkv
  77            "head.out",                  # wavtokenizer
  78            "lm_head",                   # llama4
  79            "model.transformer.ff_out",  # llada
  80            "head.decoder",              # modern-bert
  81        ),
  82        MODEL_TENSOR.DENSE_2_OUT: (
  83            "dense_2_out",  # embeddinggemma
  84        ),
  85        MODEL_TENSOR.DENSE_3_OUT: (
  86            "dense_3_out",  # embeddinggemma
  87        ),
  88        # Output norm
  89        MODEL_TENSOR.OUTPUT_NORM: (
  90            "gpt_neox.final_layer_norm",               # gptneox
  91            "transformer.ln_f",                        # gpt2 gpt-j falcon jais exaone
  92            "model.norm",                              # llama-hf baichuan internlm2 olmoe olmo2 phimoe plamo2
  93            "norm",                                    # llama-pth
  94            "transformer.norm_f",                      # mpt dbrx
  95            "ln_f",                                    # refact bloom qwen gpt2
  96            "language_model.encoder.final_layernorm",  # persimmon
  97            "model.final_layernorm",                   # persimmon
  98            "lm_head.ln",                              # phi2
  99            "model.norm_f",                            # mamba-qbert
 100            "backbone.norm_f",                         # mamba
 101            "transformer.rms_norm",                    # Grok
 102            "encoder.final_layernorm",                 # chatglm
 103            "transformer.norm",                        # openelm
 104            "model.norm",                              # nemotron
 105            "rwkv.ln_out",                             # rwkv6
 106            "model.ln_out",                            # rwkv7
 107            "backbone.final_layer_norm",               # wavtokenizer
 108            "model.norm",                              # llama4
 109            "model.transformer.ln_f",                  # llada
 110            "final_norm",                              # modern-bert
 111            "model.norm",                              # cogvlm
 112        ),
 113
 114        # Rope frequencies
 115        MODEL_TENSOR.ROPE_FREQS: (
 116            "rope.freqs",  # llama-pth
 117            "rotary_pos_emb.inv_freq",  # chatglm
 118        ),
 119
 120        MODEL_TENSOR.ROPE_FACTORS_LONG: (),
 121        MODEL_TENSOR.ROPE_FACTORS_SHORT: (),
 122
 123        MODEL_TENSOR.CONV1D: (
 124            "backbone.embed", # roberta
 125        ),
 126
 127        MODEL_TENSOR.V_MM_EMBEDDING: (
 128            "model.embed_vision.embedding", # gemma3n
 129        ),
 130        MODEL_TENSOR.V_MM_HARD_EMB_NORM: (
 131            "model.embed_vision.hard_embedding_norm", # gemma3n
 132        ),
 133        MODEL_TENSOR.V_MM_INP_PROJ: (
 134            "model.embed_vision.embedding_projection", # gemma3n
 135        ),
 136        MODEL_TENSOR.V_MM_SOFT_EMB_NORM: (
 137            "model.embed_vision.soft_embedding_norm", # gemma3n
 138        ),
 139        MODEL_TENSOR.V_ENC_CONV_STEM: (
 140            "model.vision_tower.timm_model.conv_stem.conv", # gemma3n
 141        ),
 142        MODEL_TENSOR.V_ENC_CONV_STEM_NORM: (
 143            "model.vision_tower.timm_model.conv_stem.bn", # gemma3n
 144        ),
 145        MODEL_TENSOR.V_ENC_MSFA_EXP: (
 146            "model.vision_tower.timm_model.msfa.ffn.pw_exp.conv", # gemma3n
 147        ),
 148        MODEL_TENSOR.V_ENC_MSFA_EXP_NORM: (
 149            "model.vision_tower.timm_model.msfa.ffn.pw_exp.bn", # gemma3n
 150        ),
 151        MODEL_TENSOR.V_ENC_MSFA_PROJ: (
 152            "model.vision_tower.timm_model.msfa.ffn.pw_proj.conv", # gemma3n
 153        ),
 154        MODEL_TENSOR.V_ENC_MSFA_PROJ_NORM: (
 155            "model.vision_tower.timm_model.msfa.ffn.pw_proj.bn", # gemma3n
 156        ),
 157        MODEL_TENSOR.V_ENC_MSFA_NORM: (
 158            "model.vision_tower.timm_model.msfa.norm", # gemma3n
 159        ),
 160    }
 161
 162    block_mappings_cfg: dict[MODEL_TENSOR, tuple[str, ...]] = {
 163        # Attention norm
 164        MODEL_TENSOR.ATTN_NORM: (
 165            "gpt_neox.layers.{bid}.input_layernorm",                # gptneox
 166            "transformer.h.{bid}.ln_1",                             # gpt2 gpt-j refact qwen jais exaone
 167            "transformer.blocks.{bid}.norm_1",                      # mpt
 168            "transformer.h.{bid}.input_layernorm",                  # falcon7b
 169            "h.{bid}.input_layernorm",                              # bloom
 170            "transformer.h.{bid}.ln_mlp",                           # falcon40b
 171            "model.layers.{bid}.input_layernorm",                   # llama-hf nemotron olmoe phimoe granite-hybrid
 172            "layers.{bid}.attention_norm",                          # llama-pth
 173            "language_model.encoder.layers.{bid}.input_layernorm",  # persimmon
 174            "model.layers.{bid}.ln1",                               # yi
 175            "h.{bid}.ln_1",                                         # gpt2
 176            "transformer.h.{bid}.ln",                               # phi2
 177            "model.layers.layers.{bid}.norm",                       # plamo
 178            "model.layers.layers.{bid}.pre_mixer_norm",             # plamo2
 179            "model.layers.{bid}.attention_norm",                    # internlm2
 180            "model.layers.{bid}.norm",                              # mamba-qbert
 181            "backbone.layers.{bid}.norm",                           # mamba
 182            "transformer.decoder_layer.{bid}.rms_norm",             # Grok
 183            "model.layers.{bid}.pre_attn_norm",                     # grok-2
 184            "transformer.blocks.{bid}.norm_attn_norm.norm_1",       # dbrx
 185            "encoder.layers.{bid}.input_layernorm",                 # chatglm
 186            "transformer.layers.{bid}.attn_norm",                   # openelm
 187            "rwkv.blocks.{bid}.ln1",                                # rwkv6
 188            "model.layers.{bid}.ln1",                               # rwkv7
 189            "model.layers.{bid}.input_layernorm",                   # llama4
 190            "layers.{bid}.input_layernorm",                         # embeddinggemma
 191            "transformer_encoder.{bid}.attention_norm",             # neobert
 192            "layers.{bid}.attn_norm",                               # modern-bert
 193            "model.layers.{bid}.operator_norm",                     # lfm2
 194            "model.transformer.blocks.{bid}.attn_norm",             # llada
 195            "layers.{bid}.input_layernorm",                         # qwen3-embedding
 196            "model.layers.{bid}.attention_layernorm",               # apertus
 197            "model.layers.{bid}.pre_attention_layernorm",           # kormo
 198        ),
 199
 200        # Attention norm 2
 201        MODEL_TENSOR.ATTN_NORM_2: (
 202            "transformer.h.{bid}.ln_attn",                  # falcon40b
 203            "encoder.layer.{bid}.layer_norm_1",             # jina-v2-code
 204            "rwkv.blocks.{bid}.ln2",                        # rwkv6
 205            "model.layers.{bid}.ln2",                       # rwkv7
 206            "model.layers.{bid}.post_attention_layernorm",  # cogvlm
 207        ),
 208
 209        # Attention query-key-value
 210        MODEL_TENSOR.ATTN_QKV: (
 211            "gpt_neox.layers.{bid}.attention.query_key_value",                     # gptneox
 212            "transformer.h.{bid}.attn.c_attn",                                     # gpt2 qwen jais
 213            "transformer.blocks.{bid}.attn.Wqkv",                                  # mpt
 214            "transformer.blocks.{bid}.norm_attn_norm.attn.Wqkv",                   # dbrx
 215            "transformer.h.{bid}.self_attention.query_key_value",                  # falcon
 216            "h.{bid}.self_attention.query_key_value",                              # bloom
 217            "language_model.encoder.layers.{bid}.self_attention.query_key_value",  # persimmon
 218            "model.layers.{bid}.self_attn.query_key_value",                        # persimmon
 219            "model.layers.{bid}.attention.query_key_value",                        # bailingmoe2
 220            "h.{bid}.attn.c_attn",                                                 # gpt2
 221            "transformer.h.{bid}.mixer.Wqkv",                                      # phi2
 222            "encoder.layers.{bid}.attn.Wqkv",                                      # nomic-bert
 223            "encoder.layers.{bid}.mixer.Wqkv",                                     # jina
 224            "model.layers.{bid}.self_attn.qkv_proj",                               # phi3
 225            "model.layers.layers.{bid}.mixer.qkv_proj",                            # plamo2
 226            "encoder.layers.{bid}.self_attention.query_key_value",                 # chatglm
 227            "transformer.layers.{bid}.attn.qkv_proj",                              # openelm
 228            "transformer_encoder.{bid}.qkv",                                       # neobert
 229            "layers.{bid}.attn.Wqkv",                                              # modern-bert
 230            "model.layers.{bid}.self_attn.language_expert_query_key_value",        # cogvlm
 231            "model.layers.{bid}.linear_attn.in_proj_qkv",                          # qwen3.5
 232        ),
 233
 234        # Attention query
 235        MODEL_TENSOR.ATTN_Q: (
 236            "model.layers.{bid}.self_attn.q_proj",                       # llama-hf nemotron olmoe olmo2 phimoe
 237            "layers.{bid}.self_attn.q_proj",                             # embeddinggemma
 238            "model.layers.{bid}.self_attn.q_proj_no_perm",               # llama-custom
 239            "layers.{bid}.attention.wq",                                 # llama-pth
 240            "encoder.layer.{bid}.attention.self.query",                  # bert
 241            "transformer.layer.{bid}.attention.q_lin",                   # distillbert
 242            "transformer.h.{bid}.attn.q_proj",                           # gpt-j
 243            "model.layers.layers.{bid}.self_attn.q_proj",                # plamo
 244            "model.layers.{bid}.attention.wq",                           # internlm2
 245            "transformer.decoder_layer.{bid}.multi_head_attention.query",# Grok
 246            "transformer.h.{bid}.attn.attention.q_proj",                 # exaone
 247            "model.layers.{bid}.self_attn.q_proj",                       # llama4
 248            "model.transformer.blocks.{bid}.q_proj",                     # llada
 249            "layers.{bid}.self_attn.q_proj",                             # qwen3-embedding
 250            "backbone.layers.{bid}.mixer.q_proj",                        # nemotron-h
 251        ),
 252
 253        # Attention key
 254        MODEL_TENSOR.ATTN_K: (
 255            "model.layers.{bid}.self_attn.k_proj",                     # llama-hf nemotron olmoe olmo2 phimoe
 256            "layers.{bid}.self_attn.k_proj",                           # embeddinggemma
 257            "model.layers.{bid}.self_attn.k_proj_no_perm",             # llama-custom
 258            "layers.{bid}.attention.wk",                               # llama-pth
 259            "encoder.layer.{bid}.attention.self.key",                  # bert
 260            "transformer.layer.{bid}.attention.k_lin",                 # distillbert
 261            "transformer.h.{bid}.attn.k_proj",                         # gpt-j
 262            "transformer.h.{bid}.attn.k",                              # refact
 263            "model.layers.layers.{bid}.self_attn.k_proj",              # plamo
 264            "model.layers.{bid}.attention.wk",                         # internlm2
 265            "transformer.decoder_layer.{bid}.multi_head_attention.key",# Grok
 266            "transformer.h.{bid}.attn.attention.k_proj",               # exaone
 267            "model.layers.{bid}.self_attn.k_proj",                     # llama4
 268            "model.transformer.blocks.{bid}.k_proj",                   # llada
 269            "layers.{bid}.self_attn.k_proj",                           # qwen3-embedding
 270            "backbone.layers.{bid}.mixer.k_proj",                      # nemotron-h
 271        ),
 272
 273        # Attention value
 274        MODEL_TENSOR.ATTN_V: (
 275            "model.layers.{bid}.self_attn.v_proj",                       # llama-hf nemotron olmoe olmo2 phimoe
 276            "layers.{bid}.self_attn.v_proj",                             # embeddinggemma
 277            "layers.{bid}.attention.wv",                                 # llama-pth
 278            "encoder.layer.{bid}.attention.self.value",                  # bert
 279            "transformer.layer.{bid}.attention.v_lin",                   # distillbert
 280            "transformer.h.{bid}.attn.v_proj",                           # gpt-j
 281            "transformer.h.{bid}.attn.v",                                # refact
 282            "model.layers.layers.{bid}.self_attn.v_proj",                # plamo
 283            "model.layers.{bid}.attention.wv",                           # internlm2
 284            "transformer.decoder_layer.{bid}.multi_head_attention.value",# Grok
 285            "transformer.h.{bid}.attn.attention.v_proj",                 # exaone
 286            "model.layers.{bid}.self_attn.v_proj",                       # llama4
 287            "model.transformer.blocks.{bid}.v_proj",                     # llada
 288            "layers.{bid}.self_attn.v_proj",                             # qwen3-embedding
 289            "backbone.layers.{bid}.mixer.v_proj",                        # nemotron-h
 290        ),
 291
 292        # Attention output
 293        MODEL_TENSOR.ATTN_OUT: (
 294            "gpt_neox.layers.{bid}.attention.dense",                        # gptneox
 295            "transformer.h.{bid}.attn.c_proj",                              # gpt2 refact qwen jais
 296            "transformer.blocks.{bid}.attn.out_proj",                       # mpt
 297            "transformer.h.{bid}.self_attention.dense",                     # falcon
 298            "h.{bid}.self_attention.dense",                                 # bloom
 299            "model.layers.{bid}.self_attn.o_proj",                          # llama-hf nemotron olmoe olmo2 phimoe
 300            "layers.{bid}.self_attn.o_proj",                                # embeddinggemma
 301            "model.layers.{bid}.self_attn.out_proj",                        # lfm2
 302            "model.layers.{bid}.self_attn.linear_attn",                     # deci
 303            "layers.{bid}.attention.wo",                                    # llama-pth
 304            "encoder.layer.{bid}.attention.output.dense",                   # bert
 305            "layers.{bid}.attn.Wo",                                         # modern-bert
 306            "transformer.layer.{bid}.attention.out_lin",                    # distillbert
 307            "transformer.h.{bid}.attn.out_proj",                            # gpt-j
 308            "language_model.encoder.layers.{bid}.self_attention.dense",     # persimmon
 309            "model.layers.{bid}.self_attn.dense",                           # persimmon
 310            "model.layers.{bid}.attention.dense",                           # bailingmoe2
 311            "h.{bid}.attn.c_proj",                                          # gpt2
 312            "transformer.h.{bid}.mixer.out_proj",                           # phi2
 313            "model.layers.layers.{bid}.self_attn.o_proj",                   # plamo
 314            "model.layers.layers.{bid}.mixer.o_proj",                       # plamo2
 315            "model.layers.{bid}.attention.wo",                              # internlm2
 316            "encoder.layers.{bid}.attn.out_proj",                           # nomic-bert
 317            "encoder.layers.{bid}.mixer.out_proj",                          # jina
 318            "transformer.decoder_layer.{bid}.multi_head_attention.linear",  # Grok
 319            "transformer.blocks.{bid}.norm_attn_norm.attn.out_proj",        # dbrx
 320            "encoder.layers.{bid}.self_attention.dense",                    # chatglm
 321            "transformer.layers.{bid}.attn.out_proj",                       # openelm
 322            "transformer.h.{bid}.attn.attention.out_proj",                  # exaone
 323            "model.layers.{bid}.self_attn.o_proj",                          # llama4
 324            "transformer_encoder.{bid}.wo",                                 # neobert
 325            "model.transformer.blocks.{bid}.attn_out",                      # llada
 326            "layers.{bid}.self_attn.o_proj",                                # qwen3-embedding
 327            "backbone.layers.{bid}.mixer.o_proj",                           # nemotron-h
 328            "model.layers.{bid}.self_attn.language_expert_dense",           # cogvlm
 329        ),
 330
 331        # Attention output norm
 332        MODEL_TENSOR.ATTN_OUT_NORM: (
 333            "encoder.layer.{bid}.attention.output.LayerNorm",  # bert
 334            "transformer.layer.{bid}.sa_layer_norm",           # distillbert
 335            "encoder.layers.{bid}.norm1",                      # nomic-bert
 336            "transformer.decoder_layer.{bid}.rms_norm_1",      # Grok
 337            "model.layers.{bid}.post_attn_norm",               # grok-2
 338            "transformer.blocks.{bid}.norm_attn_norm.norm_2",  # dbrx
 339        ),
 340
 341        MODEL_TENSOR.ATTN_POST_NORM: (
 342            "model.layers.{bid}.post_attention_layernorm",       # gemma2 olmo2    # ge
 343            "layers.{bid}.post_attention_layernorm",             # embeddinggemma
 344            "model.layers.{bid}.post_self_attn_layernorm",       # glm-4-0414
 345            "model.layers.layers.{bid}.post_mixer_norm.weight",  # plamo2
 346        ),
 347
 348        # Rotary embeddings
 349        MODEL_TENSOR.ATTN_ROT_EMBD: (
 350            "model.layers.{bid}.self_attn.rotary_emb.inv_freq",        # llama-hf
 351            "layers.{bid}.attention.inner_attention.rope.freqs",       # llama-pth
 352            "model.layers.layers.{bid}.self_attn.rotary_emb.inv_freq", # plamo
 353            "transformer.h.{bid}.attn.rotary_emb.inv_freq",            # codeshell
 354        ),
 355
 356        MODEL_TENSOR.ATTN_SINKS: (
 357            "model.layers.{bid}.self_attn.sinks", # openai-moe
 358            "model.layers.{bid}.self_attn.attention_sink_bias", # mimov2
 359        ),
 360
 361        MODEL_TENSOR.ATTN_GATE: (
 362            "model.layers.{bid}.self_attn.gate_proj", # afmoe
 363            "model.layers.{bid}.linear_attn.in_proj_z",  # qwen3.5
 364            "model.layers.{bid}.self_attn.g_proj",    # step3.5 head-wise attention gate
 365        ),
 366
 367        # Feed-forward norm
 368        MODEL_TENSOR.FFN_NORM: (
 369            "gpt_neox.layers.{bid}.post_attention_layernorm",                # gptneox
 370            "transformer.h.{bid}.ln_2",                                      # gpt2 refact qwen jais exaone
 371            "h.{bid}.post_attention_layernorm",                              # bloom
 372            "transformer.blocks.{bid}.norm_2",                               # mpt
 373            "model.layers.{bid}.post_attention_layernorm",                   # llama-hf nemotron olmoe phimoe
 374            "layers.{bid}.ffn_norm",                                         # llama-pth
 375            "language_model.encoder.layers.{bid}.post_attention_layernorm",  # persimmon
 376            "model.layers.{bid}.ln2",                                        # yi
 377            "h.{bid}.ln_2",                                                  # gpt2
 378            "model.layers.{bid}.ffn_norm",                                   # internlm2
 379            "transformer.decoder_layer.{bid}.rms_norm_2",                    # Grok
 380            "model.layers.{bid}.pre_moe_norm",                               # grok-2
 381            "encoder.layers.{bid}.post_attention_layernorm",                 # chatglm
 382            "transformer.layers.{bid}.ffn_norm",                             # openelm
 383            "model.layers.{bid}.pre_ff_layernorm",                           # jamba granite-hybrid
 384            "model.layers.{bid}.pre_moe_layernorm",                          # mini-jamba
 385            "model.layers.{bid}.post_attention_layernorm",                   # llama4
 386            "transformer_encoder.{bid}.ffn_norm",                            # neobert
 387            "model.layers.layers.{bid}.pre_mlp_norm",                        # plamo2
 388            "model.transformer.blocks.{bid}.ff_norm",                        # llada
 389            "layers.{bid}.post_attention_layernorm",                         # qwen3-embedding
 390            "model.layers.{bid}.feedforward_layernorm",                      # apertus
 391            "model.layers.{bid}.pre_mlp_layernorm",                          # kormo
 392            "layers.{bid}.mlp_norm"                                          # modern-bert
 393        ),
 394
 395        # Pre feed-forward norm
 396        MODEL_TENSOR.FFN_PRE_NORM: (
 397            "model.layers.{bid}.pre_feedforward_layernorm", # gemma2
 398            "layers.{bid}.pre_feedforward_layernorm",       # embeddinggemma
 399            "model.layers.{bid}.pre_ff_layernorm.weight",
 400            "model.layers.{bid}.pre_mlp_layernorm",        # afmoe
 401        ),
 402
 403        # Post feed-forward norm
 404        MODEL_TENSOR.FFN_POST_NORM: (
 405            "model.layers.{bid}.post_feedforward_layernorm",  # gemma2 olmo2
 406            "layers.{bid}.post_feedforward_layernorm",        # embeddinggemma
 407            "model.layers.{bid}.post_mlp_layernorm",          # glm-4-0414
 408            "model.layers.layers.{bid}.post_mlp_norm.weight", # plamo2
 409            "model.layers.{bid}.feed_forward.up_proj",
 410            "model.layers.{bid}.post_moe_norm",               # grok-2
 411        ),
 412
 413        MODEL_TENSOR.FFN_GATE_INP: (
 414            "layers.{bid}.feed_forward.gate",                   # mixtral
 415            "model.layers.{bid}.block_sparse_moe.gate",         # mixtral phimoe
 416            "model.layers.{bid}.mlp.gate",                      # qwen2moe olmoe
 417            "transformer.decoder_layer.{bid}.router",           # Grok
 418            "transformer.blocks.{bid}.ffn.router.layer",        # dbrx
 419            "model.layers.{bid}.block_sparse_moe.router.layer", # granitemoe
 420            "model.layers.{bid}.feed_forward.router",           # llama4 jamba
 421            "encoder.layers.{bid}.mlp.router.layer",            # nomic-bert-moe
 422            "model.layers.{bid}.mlp.router",                    # openai-moe
 423            "model.layers.{bid}.mlp.gate.wg",                   # hunyuan
 424            "model.layers.{bid}.block_sparse_moe.primary_router", # smallthinker
 425            "model.layers.{bid}.feed_forward.gate",               # lfm2moe
 426            "model.layers.{bid}.mlp.router.gate",               # afmoe
 427            "layers.{bid}.gate",                                # mistral-large
 428            "backbone.layers.{bid}.mixer.gate",                 # nemotron-h-moe
 429            "model.layers.{bid}.moe.gate",                      # step3.5
 430        ),
 431
 432        MODEL_TENSOR.FFN_GATE_INP_SHEXP: (
 433            "model.layers.{bid}.mlp.shared_expert_gate", # qwen2moe
 434        ),
 435
 436        MODEL_TENSOR.FFN_EXP_PROBS_B: (
 437            "model.layers.{bid}.mlp.gate.e_score_correction",               # deepseek-v3 dots1
 438            "model.layers.{bid}.mlp.moe_statics.e_score_correction",        # ernie4.5-moe
 439            "model.layers.{bid}.mlp.gate.expert_bias",                      # bailingmoe2
 440            "model.layers.{bid}.mlp.expert_bias",                           # afmoe
 441            "model.layers.{bid}.feed_forward.expert_bias",                  # lfm2moe
 442            "model.layers.{bid}.block_sparse_moe.e_score_correction",       # minimax-m2
 443            "backbone.layers.{bid}.mixer.gate.e_score_correction",          # nemotron-h-moe
 444            "model.layers.{bid}.mlp.e_score_correction",                    # exaone-moe
 445            "model.layers.{bid}.block_sparse_moe.gate.e_score_correction",  # kimi
 446            "model.layers.{bid}.moe.router_bias",                           # step3.5 expert selection bias
 447        ),
 448
 449        # Feed-forward up
 450        MODEL_TENSOR.FFN_UP: (
 451            "gpt_neox.layers.{bid}.mlp.dense_h_to_4h",                # gptneox
 452            "transformer.h.{bid}.mlp.c_fc",                           # gpt2 jais
 453            "transformer.blocks.{bid}.ffn.up_proj",                   # mpt
 454            "transformer.h.{bid}.mlp.dense_h_to_4h",                  # falcon
 455            "h.{bid}.mlp.dense_h_to_4h",                              # bloom
 456            "model.layers.{bid}.mlp.up_proj",                         # llama-hf refact nemotron olmo2
 457            "layers.{bid}.mlp.up_proj",                               # embeddinggemma
 458            "layers.{bid}.feed_forward.w3",                           # llama-pth
 459            "encoder.layer.{bid}.intermediate.dense",                 # bert
 460            "layers.{bid}.mlp.Wi",                                    # modern-bert
 461            "transformer.layer.{bid}.ffn.lin1",                       # distillbert
 462            "transformer.h.{bid}.mlp.fc_in",                          # gpt-j
 463            "transformer.h.{bid}.mlp.linear_3",                       # refact
 464            "language_model.encoder.layers.{bid}.mlp.dense_h_to_4h",  # persimmon
 465            "model.layers.{bid}.mlp.dense_h_to_4h",                   # persimmon
 466            "transformer.h.{bid}.mlp.w1",                             # qwen
 467            "h.{bid}.mlp.c_fc",                                       # gpt2
 468            "transformer.h.{bid}.mlp.fc1",                            # phi2
 469            "model.layers.{bid}.mlp.fc1",                             # phi2
 470            "model.layers.{bid}.mlp.gate_up_proj",                    # phi3 glm-4-0414
 471            "model.layers.layers.{bid}.mlp.up_proj",                  # plamo
 472            "model.layers.layers.{bid}.mlp.gate_up_proj",             # plamo2
 473            "model.layers.{bid}.feed_forward.w3",                     # internlm2
 474            "encoder.layers.{bid}.mlp.fc11",                          # nomic-bert
 475            "encoder.layers.{bid}.mlp.fc1",                           # nomic-bert-moe
 476            "model.layers.{bid}.mlp.c_fc",                            # starcoder2
 477            "encoder.layer.{bid}.mlp.gated_layers_v",                 # jina-bert-v2 (split up/gate, no longer used)
 478            "encoder.layer.{bid}.mlp.gated_layers",                   # jina-bert-v2 (GEGLU)
 479            "encoder.layer.{bid}.mlp.up_gated_layer",                 # jina-v2-code (GEGLU)
 480            "model.layers.{bid}.residual_mlp.w3",                     # arctic
 481            "encoder.layers.{bid}.mlp.dense_h_to_4h",                 # chatglm
 482            "transformer.h.{bid}.mlp.c_fc_1",                         # exaone
 483            "model.layers.{bid}.feed_forward.up_proj",                # llama4 jamba granite-hybrid
 484            "transformer_encoder.{bid}.ffn.w12",                      # neobert
 485            "model.layers.{bid}.block_sparse_moe.up",                 # smallthinker
 486            "model.transformer.blocks.{bid}.up_proj",                 # llada
 487            "layers.{bid}.mlp.up_proj",                               # qwen3-embedding
 488            "backbone.layers.{bid}.mixer.up_proj",                    # nemotron-h
 489            "model.layers.{bid}.mlp.language_mlp.up_proj",            # cogvlm
 490        ),
 491
 492        MODEL_TENSOR.FFN_UP_EXP: (
 493            "layers.{bid}.feed_forward.experts.w3",                 # mixtral (merged)
 494            "transformer.decoder_layer.{bid}.moe.linear_v",         # Grok (merged)
 495            "transformer.blocks.{bid}.ffn.experts.mlp.v1",          # dbrx
 496            "model.layers.{bid}.mlp.experts.up_proj",               # qwen2moe olmoe (merged) ernie4.5-moe, nemotron-h-moe (merged)
 497            "model.layers.{bid}.block_sparse_moe.experts.w3",       # phimoe (merged)
 498            "model.layers.{bid}.feed_forward.experts.up_proj",      # llama4
 499            "encoder.layers.{bid}.mlp.experts.mlp.w1",              # nomic-bert-moe
 500            "model.layers.{bid}.block_sparse_moe.experts.up", # smallthinker
 501            "model.layers.{bid}.moe.up_proj",                       # step3.5
 502        ),
 503
 504        MODEL_TENSOR.FFN_UP_SHEXP: (
 505            "model.layers.{bid}.mlp.shared_expert.up_proj",          # qwen2moe
 506            "model.layers.{bid}.mlp.shared_experts.up_proj",         # deepseek deepseek2
 507            "model.layers.{bid}.feed_forward.shared_expert.up_proj", # llama4
 508            "model.layers.{bid}.feed_forward.down_proj",
 509            "model.layers.{bid}.mlp.shared_mlp.up_proj",             # hunyuan
 510            "layers.{bid}.shared_experts.w3",                        # mistral-large
 511            "backbone.layers.{bid}.mixer.shared_experts.up_proj",    # nemotron-h-moe
 512            "model.layers.{bid}.block_sparse_moe.shared_experts.up_proj", # kimi
 513            "model.layers.{bid}.share_expert.up_proj",               # step3.5
 514        ),
 515
 516        MODEL_TENSOR.FFN_UP_CHEXP: (
 517            "model.layers.{bid}.mlp.chunk_experts.up_proj",           # grovemoe
 518        ),
 519
 520        # AWQ-activation gate
 521        MODEL_TENSOR.FFN_ACT: (
 522            "transformer.blocks.{bid}.ffn.act",  # mpt
 523        ),
 524
 525        # Feed-forward gate
 526        MODEL_TENSOR.FFN_GATE: (
 527            "model.layers.{bid}.mlp.gate_proj",               # llama-hf refact olmo2
 528            "layers.{bid}.mlp.gate_proj",                     # embeddinggemma
 529            "layers.{bid}.feed_forward.w1",                   # llama-pth
 530            "transformer.h.{bid}.mlp.w2",                     # qwen
 531            "transformer.h.{bid}.mlp.c_fc2",                  # jais
 532            "model.layers.layers.{bid}.mlp.gate_proj",        # plamo
 533            "model.layers.{bid}.feed_forward.w1",             # internlm2
 534            "encoder.layers.{bid}.mlp.fc12",                  # nomic-bert
 535            "encoder.layer.{bid}.mlp.gated_layers_w",         # jina-bert-v2 (split up/gate, no longer used)
 536            "transformer.h.{bid}.mlp.linear_1",               # refact
 537            "model.layers.{bid}.residual_mlp.w1",             # arctic
 538            "transformer.h.{bid}.mlp.c_fc_0",                 # exaone
 539            "model.layers.{bid}.feed_forward.gate_proj",      # llama4 jamba granite-hybrid
 540            "model.transformer.blocks.{bid}.ff_proj",         # llada
 541            "layers.{bid}.mlp.gate_proj",                     # qwen3-embedding
 542            "model.layers.{bid}.mlp.language_mlp.gate_proj",  # cogvlm
 543        ),
 544
 545        MODEL_TENSOR.FFN_GATE_EXP: (
 546            "layers.{bid}.feed_forward.experts.w1",                     # mixtral (merged)
 547            "transformer.decoder_layer.{bid}.moe.linear",               # Grok (merged)
 548            "transformer.blocks.{bid}.ffn.experts.mlp.w1",              # dbrx
 549            "model.layers.{bid}.mlp.experts.gate_proj",                 # qwen2moe olmoe (merged) ernie4.5-moe
 550            "model.layers.{bid}.block_sparse_moe.experts.w1",           # phimoe (merged)
 551            "model.layers.{bid}.feed_forward.experts.gate_proj",        # llama4
 552            "model.layers.{bid}.block_sparse_moe.experts.gate",         # smallthinker
 553            "model.layers.{bid}.moe.gate_proj",                         # step3.5
 554        ),
 555
 556        MODEL_TENSOR.FFN_GATE_SHEXP: (
 557            "model.layers.{bid}.mlp.shared_expert.gate_proj",          # qwen2moe
 558            "model.layers.{bid}.mlp.shared_experts.gate_proj",         # deepseek deepseek2
 559            "model.layers.{bid}.feed_forward.shared_expert.gate_proj", # llama4
 560            "model.layers.{bid}.mlp.shared_mlp.gate_proj",             # hunyuan
 561            "layers.{bid}.shared_experts.w1",                          # mistral-large
 562            "model.layers.{bid}.block_sparse_moe.shared_experts.gate_proj", # kimi
 563            "model.layers.{bid}.share_expert.gate_proj",               # step3.5
 564        ),
 565
 566        MODEL_TENSOR.FFN_GATE_CHEXP: (
 567            "model.layers.{bid}.mlp.chunk_experts.gate_proj",           # grovemoe
 568        ),
 569
 570        # Feed-forward down
 571        MODEL_TENSOR.FFN_DOWN: (
 572            "gpt_neox.layers.{bid}.mlp.dense_4h_to_h",                # gptneox
 573            "transformer.h.{bid}.mlp.c_proj",                         # gpt2 refact qwen jais
 574            "transformer.blocks.{bid}.ffn.down_proj",                 # mpt
 575            "transformer.h.{bid}.mlp.dense_4h_to_h",                  # falcon
 576            "h.{bid}.mlp.dense_4h_to_h",                              # bloom
 577            "model.layers.{bid}.mlp.down_proj",                       # llama-hf nemotron olmo2
 578            "layers.{bid}.mlp.down_proj",                             # embeddinggemma
 579            "layers.{bid}.feed_forward.w2",                           # llama-pth
 580            "encoder.layer.{bid}.output.dense",                       # bert
 581            "layers.{bid}.mlp.Wo",                                    # modern-bert
 582            "transformer.layer.{bid}.ffn.lin2",                       # distillbert
 583            "transformer.h.{bid}.mlp.fc_out",                         # gpt-j
 584            "language_model.encoder.layers.{bid}.mlp.dense_4h_to_h",  # persimmon
 585            "model.layers.{bid}.mlp.dense_4h_to_h",                   # persimmon
 586            "h.{bid}.mlp.c_proj",                                     # gpt2
 587            "transformer.h.{bid}.mlp.fc2",                            # phi2
 588            "model.layers.{bid}.mlp.fc2",                             # phi2
 589            "model.layers.layers.{bid}.mlp.down_proj",                # plamo
 590            "model.layers.{bid}.feed_forward.w2",                     # internlm2
 591            "encoder.layers.{bid}.mlp.fc2",                           # nomic-bert
 592            "model.layers.{bid}.mlp.c_proj",                          # starcoder2
 593            "encoder.layer.{bid}.mlp.wo",                             # jina-bert-v2
 594            "transformer.layers.{bid}.ffn.proj_2",                    # openelm
 595            "model.layers.{bid}.residual_mlp.w2",                     # arctic
 596            "encoder.layer.{bid}.mlp.down_layer",                     # jina-bert-v2
 597            "encoder.layers.{bid}.mlp.dense_4h_to_h",                 # chatglm
 598            "model.layers.h.{bid}.mlp.c_proj",                        # exaone
 599            "model.layers.{bid}.feed_forward.down_proj",              # llama4 jamba granite-hybrid
 600            "transformer_encoder.{bid}.ffn.w3",                       # neobert
 601            "model.layers.{bid}.block_sparse_moe.down",               # smallthinker
 602            "model.transformer.blocks.{bid}.ff_out",                  # llada
 603            "layers.{bid}.mlp.down_proj",                             # qwen3-embedding
 604            "backbone.layers.{bid}.mixer.down_proj",                  # nemotron-h
 605            "model.layers.{bid}.mlp.language_mlp.down_proj",          # cogvlm
 606        ),
 607
 608        MODEL_TENSOR.FFN_DOWN_EXP: (
 609            "layers.{bid}.feed_forward.experts.w2",                 # mixtral (merged)
 610            "transformer.decoder_layer.{bid}.moe.linear_1",         # Grok (merged)
 611            "transformer.blocks.{bid}.ffn.experts.mlp.w2",          # dbrx
 612            "model.layers.{bid}.mlp.experts.down_proj",             # qwen2moe olmoe (merged) ernie4.5-moe nemotron-h-moe (merged)
 613            "model.layers.{bid}.block_sparse_moe.output_linear",    # granitemoe
 614            "model.layers.{bid}.block_sparse_moe.experts.w2",       # phimoe (merged)
 615            "model.layers.{bid}.feed_forward.experts.down_proj",    # llama4
 616            "encoder.layers.{bid}.mlp.experts.mlp.w2",              # nomic-bert-moe
 617            "model.layers.{bid}.block_sparse_moe.experts.down",     # smallthinker
 618            "model.layers.{bid}.moe.down_proj",                     # step3.5
 619        ),
 620
 621        MODEL_TENSOR.FFN_DOWN_SHEXP: (
 622            "model.layers.{bid}.mlp.shared_expert.down_proj",          # qwen2moe
 623            "model.layers.{bid}.mlp.shared_experts.down_proj",         # deepseek deepseek2
 624            "model.layers.{bid}.feed_forward.shared_expert.down_proj", # llama4
 625            "model.layers.{bid}.shared_mlp.output_linear",             # granitemoe
 626            "model.layers.{bid}.mlp.shared_mlp.down_proj",             # hunyuan
 627            "layers.{bid}.shared_experts.w2",                          # mistral-large
 628            "backbone.layers.{bid}.mixer.shared_experts.down_proj",    # nemotron-h-moe
 629            "model.layers.{bid}.block_sparse_moe.shared_experts.down_proj", # kimi
 630            "model.layers.{bid}.share_expert.down_proj",               # step3.5
 631        ),
 632
 633        MODEL_TENSOR.FFN_DOWN_CHEXP: (
 634            "model.layers.{bid}.mlp.chunk_experts.down_proj",           # grovemoe
 635        ),
 636
 637        MODEL_TENSOR.ATTN_Q_NORM: (
 638            "language_model.encoder.layers.{bid}.self_attention.q_layernorm",
 639            "model.layers.{bid}.self_attn.q_layernorm",                       # persimmon
 640            "model.layers.{bid}.self_attn.query_layernorm",                   # hunyuan
 641            "model.layers.{bid}.attention.query_layernorm",                   # bailingmoe2
 642            "model.layers.{bid}.self_attn.q_norm",                            # cohere olmoe chameleon olmo2
 643            "layers.{bid}.self_attn.q_norm",                                  # embeddinggemma
 644            "transformer.blocks.{bid}.attn.q_ln",                             # sea-lion
 645            "encoder.layer.{bid}.attention.self.layer_norm_q",                # jina-bert-v2
 646            "transformer.layers.{bid}.attn.q_norm",                           # openelm
 647            "model.layers.layers.{bid}.mixer.q",                              # plamo2
 648            "model.layers.layers.{bid}.mixer.q_norm",                         # plamo3
 649            "layers.{bid}.self_attn.q_norm",                                  # qwen3-embedding
 650            "model.layers.{bid}.attention.query_layernorm",                   # apertus
 651        ),
 652
 653        MODEL_TENSOR.ATTN_K_NORM: (
 654            "language_model.encoder.layers.{bid}.self_attention.k_layernorm",
 655            "model.layers.{bid}.self_attn.k_layernorm",                       # persimmon
 656            "model.layers.{bid}.self_attn.key_layernorm",                     # hunyuan
 657            "model.layers.{bid}.attention.key_layernorm",                     # bailingmoe2
 658            "model.layers.{bid}.self_attn.k_norm",                            # cohere olmoe chameleon olmo2
 659            "layers.{bid}.self_attn.k_norm",                                  # embeddinggemma
 660            "transformer.blocks.{bid}.attn.k_ln",                             # sea-lion
 661            "encoder.layer.{bid}.attention.self.layer_norm_k",                # jina-bert-v2
 662            "transformer.layers.{bid}.attn.k_norm",                           # openelm
 663            "model.layers.layers.{bid}.mixer.k",                              # plamo2
 664            "model.layers.layers.{bid}.mixer.k_norm",                         # plamo3
 665            "layers.{bid}.self_attn.k_norm",                                  # qwen3-embedding
 666            "model.layers.{bid}.attention.key_layernorm",                     # apertus
 667        ),
 668
 669        MODEL_TENSOR.ROPE_FREQS: (
 670            "language_model.encoder.layers.{bid}.self_attention.rotary_emb.inv_freq",  # persimmon
 671        ),
 672
 673        MODEL_TENSOR.LAYER_OUT_NORM: (
 674            "encoder.layer.{bid}.output.LayerNorm",         # bert
 675            "transformer.layer.{bid}.output_layer_norm",    # distillbert
 676            "encoder.layers.{bid}.norm2",                   # nomic-bert
 677            "transformer.decoder_layer.{bid}.rms_norm_3",   # Grok
 678            "encoder.layer.{bid}.mlp.layernorm",            # jina-bert-v2
 679            "encoder.layer.{bid}.layer_norm_2",             # jina-v2-code
 680            "model.layers.{bid}.final_layernorm",           # bailingmoe2
 681        ),
 682
 683        MODEL_TENSOR.PER_LAYER_TOKEN_EMBD: (
 684            "model.embed_tokens_per_layer",  # gemma3n
 685        ),
 686
 687        MODEL_TENSOR.PER_LAYER_MODEL_PROJ: (
 688            "model.per_layer_model_projection",  # gemma3n
 689        ),
 690
 691        MODEL_TENSOR.PER_LAYER_PROJ_NORM: (
 692            "model.per_layer_projection_norm",  # gemma3n
 693        ),
 694
 695        MODEL_TENSOR.ALTUP_PROJ: (
 696            "model.altup_projections",  # gemma3n
 697        ),
 698
 699        MODEL_TENSOR.ALTUP_UNEMBD_PROJ: (
 700            "model.altup_unembed_projections",  # gemma3n
 701        ),
 702
 703        MODEL_TENSOR.PER_LAYER_INP_GATE: (
 704            "model.layers.{bid}.per_layer_input_gate",  # gemma3n
 705        ),
 706
 707        MODEL_TENSOR.PER_LAYER_PROJ: (
 708            "model.layers.{bid}.per_layer_projection",  # gemma3n
 709        ),
 710
 711        MODEL_TENSOR.PER_LAYER_POST_NORM: (
 712            "model.layers.{bid}.post_per_layer_input_norm",  # gemma3n
 713        ),
 714
 715        MODEL_TENSOR.ALTUP_CORRECT_COEF: (
 716            "model.layers.{bid}.altup.correction_coefs",  # gemma3n
 717        ),
 718
 719        MODEL_TENSOR.ALTUP_CORRECT_SCALE: (
 720            "model.layers.{bid}.altup.correct_output_scale",  # gemma3n
 721        ),
 722
 723        MODEL_TENSOR.ALTUP_PREDICT_COEF: (
 724            "model.layers.{bid}.altup.prediction_coefs",  # gemma3n
 725        ),
 726
 727        MODEL_TENSOR.ALTUP_ROUTER: (
 728            "model.layers.{bid}.altup.modality_router",  # gemma3n
 729        ),
 730
 731        MODEL_TENSOR.ALTUP_ROUTER_NORM: (
 732            "model.layers.{bid}.altup.router_norm",  # gemma3n
 733        ),
 734
 735        MODEL_TENSOR.LAUREL_L: (
 736            "model.layers.{bid}.laurel.linear_left",  # gemma3n
 737        ),
 738
 739        MODEL_TENSOR.LAUREL_R: (
 740            "model.layers.{bid}.laurel.linear_right",  # gemma3n
 741        ),
 742
 743        MODEL_TENSOR.LAUREL_POST_NORM: (
 744            "model.layers.{bid}.laurel.post_laurel_norm",  # gemma3n
 745        ),
 746
 747        MODEL_TENSOR.SSM_IN: (
 748            "model.layers.{bid}.in_proj",                   # mamba-hf
 749            "backbone.layers.{bid}.mixer.in_proj",          # mamba
 750            "model.layers.{bid}.mamba.in_proj",             # jamba falcon-h1 granite-hybrid
 751            "model.layers.layers.{bid}.mixer.in_proj",      # plamo2
 752            "model.layers.{bid}.linear_attn.in_proj_qkvz",  # qwen3next
 753        ),
 754
 755        MODEL_TENSOR.SSM_CONV1D: (
 756            "model.layers.{bid}.conv1d",               # mamba-hf
 757            "backbone.layers.{bid}.mixer.conv1d",      # mamba
 758            "model.layers.{bid}.mamba.conv1d",         # jamba falcon-h1 granite-hybrid
 759            "model.layers.layers.{bid}.mixer.conv1d",  # plamo2
 760            "model.layers.{bid}.linear_attn.conv1d",   # qwen3next
 761        ),
 762
 763        MODEL_TENSOR.SSM_X: (
 764            "model.layers.{bid}.x_proj",                  # mamba-hf
 765            "backbone.layers.{bid}.mixer.x_proj",         # mamba
 766            "model.layers.{bid}.mamba.x_proj",            # jamba
 767            "model.layers.layers.{bid}.mixer.bcdt_proj",  # plamo2
 768        ),
 769
 770        MODEL_TENSOR.SSM_DT: (
 771            "model.layers.{bid}.dt_proj",               # mamba-hf
 772            "backbone.layers.{bid}.mixer.dt_proj",      # mamba
 773            "model.layers.{bid}.mamba.dt_proj",         # jamba falcon-h1 granite-hybrid
 774            "model.layers.layers.{bid}.mixer.dt_proj",  # plamo2
 775            "model.layers.{bid}.linear_attn.dt_proj",   # qwen3next
 776            "backbone.layers.{bid}.mixer.dt",           # nemotron-h-moe
 777            "model.layers.{bid}.self_attn.dt_proj",     # kimi
 778        ),
 779
 780        MODEL_TENSOR.SSM_DT_NORM: (
 781            "model.layers.layers.{bid}.mixer.dt_norm.weight",  # plamo2
 782            "model.layers.{bid}.mamba.dt_layernorm",  # jamba
 783        ),
 784
 785        MODEL_TENSOR.SSM_A: (
 786            "model.layers.{bid}.A_log",               # mamba-hf
 787            "backbone.layers.{bid}.mixer.A_log",      # mamba
 788            "model.layers.{bid}.mamba.A_log",         # jamba falcon-h1 granite-hybrid
 789            "model.layers.layers.{bid}.mixer.A_log",  # plamo2
 790            "model.layers.{bid}.linear_attn.A_log",   # qwen3next
 791            "model.layers.{bid}.self_attn.A_log",     # kimi
 792        ),
 793
 794        MODEL_TENSOR.SSM_B_NORM: (
 795            "model.layers.{bid}.mamba.b_layernorm",           # jamba
 796            "model.layers.{bid}.mamba.B_layernorm",           # mini-jamba
 797            "model.layers.layers.{bid}.mixer.B_norm.weight",  # plamo2
 798        ),
 799
 800        MODEL_TENSOR.SSM_C_NORM: (
 801            "model.layers.{bid}.mamba.c_layernorm",           # jamba
 802            "model.layers.{bid}.mamba.C_layernorm",           # mini-jamba
 803            "model.layers.layers.{bid}.mixer.C_norm.weight",  # plamo2
 804        ),
 805
 806        MODEL_TENSOR.SSM_D: (
 807            "model.layers.{bid}.D",               # mamba-hf
 808            "backbone.layers.{bid}.mixer.D",      # mamba
 809            "model.layers.{bid}.mamba.D",         # jamba falcon-h1 granite-hybrid
 810            "model.layers.layers.{bid}.mixer.D",  # plamo2
 811        ),
 812
 813        MODEL_TENSOR.SSM_NORM: (
 814            "model.layers.{bid}.mamba.norm",        # falcon-h1 granite-hybrid
 815            "model.layers.{bid}.linear_attn.norm",  # qwen3next
 816            "backbone.layers.{bid}.mixer.norm",     # mamba2
 817            "model.layers.{bid}.self_attn.o_norm",  # kimi
 818        ),
 819
 820        MODEL_TENSOR.SSM_OUT: (
 821            "model.layers.{bid}.out_proj",               # mamba-hf
 822            "backbone.layers.{bid}.mixer.out_proj",      # mamba
 823            "model.layers.{bid}.mamba.out_proj",         # jamba falcon-h1 granite-hybrid
 824            "model.layers.{bid}.linear_attn.out_proj",   # qwen3next
 825            "model.layers.layers.{bid}.mixer.out_proj",  # plamo2
 826        ),
 827
 828        MODEL_TENSOR.SSM_ALPHA: (
 829            "model.layers.{bid}.linear_attn.in_proj_a",  # qwen3.5
 830        ),
 831
 832        MODEL_TENSOR.SSM_BETA_ALPHA: (
 833            "model.layers.{bid}.linear_attn.in_proj_ba",  # qwen3next
 834        ),
 835
 836        # Kimi Linear KDA (using SSM_ prefix for consistency)
 837        MODEL_TENSOR.SSM_CONV1D_Q: (
 838            "model.layers.{bid}.self_attn.q_conv1d",
 839        ),
 840        MODEL_TENSOR.SSM_CONV1D_K: (
 841            "model.layers.{bid}.self_attn.k_conv1d",
 842        ),
 843        MODEL_TENSOR.SSM_CONV1D_V: (
 844            "model.layers.{bid}.self_attn.v_conv1d",
 845        ),
 846        MODEL_TENSOR.SSM_F_A: (
 847            "model.layers.{bid}.self_attn.f_a_proj",
 848        ),
 849        MODEL_TENSOR.SSM_F_B: (
 850            "model.layers.{bid}.self_attn.f_b_proj",
 851        ),
 852        MODEL_TENSOR.SSM_BETA: (
 853            "model.layers.{bid}.linear_attn.in_proj_b",  # qwen3.5
 854            "model.layers.{bid}.self_attn.b_proj",       # Kimi Linear
 855        ),
 856        MODEL_TENSOR.SSM_G_A: (
 857            "model.layers.{bid}.self_attn.g_a_proj",
 858        ),
 859        MODEL_TENSOR.SSM_G_B: (
 860            "model.layers.{bid}.self_attn.g_b_proj",
 861        ),
 862        MODEL_TENSOR.TIME_MIX_W0: (
 863            "model.layers.{bid}.attention.w0",            # rwkv7
 864        ),
 865
 866        MODEL_TENSOR.TIME_MIX_W1: (
 867            "rwkv.blocks.{bid}.attention.time_maa_w1",    # rwkv6
 868            "model.layers.{bid}.self_attn.time_maa_w1",   # rwkv6qwen2
 869            "model.layers.{bid}.attention.w1",            # rwkv7
 870        ),
 871
 872        MODEL_TENSOR.TIME_MIX_W2: (
 873            "rwkv.blocks.{bid}.attention.time_maa_w2",    # rwkv6
 874            "model.layers.{bid}.self_attn.time_maa_w2",   # rwkv6qwen2
 875            "model.layers.{bid}.attention.w2",            # rwkv7
 876        ),
 877
 878        MODEL_TENSOR.TIME_MIX_A0: (
 879            "model.layers.{bid}.attention.a0",            # rwkv7
 880        ),
 881
 882        MODEL_TENSOR.TIME_MIX_A1: (
 883            "model.layers.{bid}.attention.a1",            # rwkv7
 884        ),
 885
 886        MODEL_TENSOR.TIME_MIX_A2: (
 887            "model.layers.{bid}.attention.a2",            # rwkv7
 888        ),
 889
 890        MODEL_TENSOR.TIME_MIX_V0: (
 891            "model.layers.{bid}.attention.v0",            # rwkv7
 892        ),
 893
 894        MODEL_TENSOR.TIME_MIX_V1: (
 895            "model.layers.{bid}.attention.v1",            # rwkv7
 896        ),
 897
 898        MODEL_TENSOR.TIME_MIX_V2: (
 899            "model.layers.{bid}.attention.v2",            # rwkv7
 900        ),
 901
 902        MODEL_TENSOR.TIME_MIX_G1: (
 903            "model.layers.{bid}.attention.g1",            # rwkv7
 904        ),
 905
 906        MODEL_TENSOR.TIME_MIX_G2: (
 907            "model.layers.{bid}.attention.g2",            # rwkv7
 908        ),
 909
 910        MODEL_TENSOR.TIME_MIX_K_K: (
 911            "model.layers.{bid}.attention.k_k",            # rwkv7
 912        ),
 913
 914        MODEL_TENSOR.TIME_MIX_K_A: (
 915            "model.layers.{bid}.attention.k_a",            # rwkv7
 916        ),
 917
 918        MODEL_TENSOR.TIME_MIX_R_K: (
 919            "model.layers.{bid}.attention.r_k",            # rwkv7
 920        ),
 921
 922        MODEL_TENSOR.TIME_MIX_LERP_X: (
 923            "rwkv.blocks.{bid}.attention.time_maa_x",   # rwkv6
 924            "model.layers.{bid}.self_attn.time_maa_x",  # rwkv6qwen2
 925        ),
 926
 927        MODEL_TENSOR.TIME_MIX_LERP_K: (
 928            "rwkv.blocks.{bid}.attention.time_maa_k",   # rwkv6
 929            "model.layers.{bid}.self_attn.time_maa_k",  # rwkv6qwen2
 930        ),
 931
 932        MODEL_TENSOR.TIME_MIX_LERP_V: (
 933            "rwkv.blocks.{bid}.attention.time_maa_v",   # rwkv6
 934            "model.layers.{bid}.self_attn.time_maa_v",  # rwkv6qwen2
 935        ),
 936
 937        MODEL_TENSOR.TIME_MIX_LERP_R: (
 938            "rwkv.blocks.{bid}.attention.time_maa_r",   # rwkv6
 939            "model.layers.{bid}.self_attn.time_maa_r",  # rwkv6qwen2
 940        ),
 941
 942        MODEL_TENSOR.TIME_MIX_LERP_G: (
 943            "rwkv.blocks.{bid}.attention.time_maa_g",   # rwkv6
 944            "model.layers.{bid}.self_attn.time_maa_g",  # rwkv6qwen2
 945        ),
 946
 947        MODEL_TENSOR.TIME_MIX_LERP_W: (
 948            "rwkv.blocks.{bid}.attention.time_maa_w",   # rwkv6
 949            "model.layers.{bid}.self_attn.time_maa_w",  # rwkv6qwen2
 950        ),
 951
 952        MODEL_TENSOR.TIME_MIX_FIRST: (
 953            "rwkv.blocks.{bid}.attention.time_faaaa",   # rwkv6
 954        ),
 955
 956        MODEL_TENSOR.TIME_MIX_DECAY: (
 957            "rwkv.blocks.{bid}.attention.time_decay",   # rwkv6
 958            "model.layers.{bid}.self_attn.time_decay",  # rwkv6qwen2
 959        ),
 960
 961        MODEL_TENSOR.TIME_MIX_DECAY_W1: (
 962            "rwkv.blocks.{bid}.attention.time_decay_w1",  # rwkv6
 963            "model.layers.{bid}.self_attn.time_decay_w1", # rwkv6qwen2
 964        ),
 965
 966        MODEL_TENSOR.TIME_MIX_DECAY_W2: (
 967            "rwkv.blocks.{bid}.attention.time_decay_w2",  # rwkv6
 968            "model.layers.{bid}.self_attn.time_decay_w2", # rwkv6qwen2
 969        ),
 970
 971        MODEL_TENSOR.TIME_MIX_KEY: (
 972            "rwkv.blocks.{bid}.attention.key",     # rwkv6
 973            "model.layers.{bid}.self_attn.k_proj", # rwkv6qwen2
 974            "model.layers.{bid}.attention.key",    # rwkv7
 975            "model.layers.{bid}.attention.k_proj", # rwkv7
 976        ),
 977
 978        MODEL_TENSOR.TIME_MIX_VALUE: (
 979            "rwkv.blocks.{bid}.attention.value",   # rwkv6
 980            "model.layers.{bid}.self_attn.v_proj", # rwkv6qwen2
 981            "model.layers.{bid}.attention.value",  # rwkv7
 982            "model.layers.{bid}.attention.v_proj", # rwkv7
 983        ),
 984
 985        MODEL_TENSOR.TIME_MIX_RECEPTANCE: (
 986            "rwkv.blocks.{bid}.attention.receptance",  # rwkv6
 987            "model.layers.{bid}.self_attn.q_proj",     # rwkv6qwen2
 988            "model.layers.{bid}.attention.receptance", # rwkv7
 989            "model.layers.{bid}.attention.r_proj",     # rwkv7
 990        ),
 991
 992        MODEL_TENSOR.TIME_MIX_GATE: (
 993            "rwkv.blocks.{bid}.attention.gate",        # rwkv6
 994            "model.layers.{bid}.self_attn.gate",       # rwkv6qwen2
 995        ),
 996
 997        MODEL_TENSOR.TIME_MIX_LN: (
 998            "rwkv.blocks.{bid}.attention.ln_x", # rwkv6
 999            "model.layers.{bid}.attention.ln_x" # rwkv7
1000        ),
1001
1002        MODEL_TENSOR.TIME_MIX_OUTPUT: (
1003            "rwkv.blocks.{bid}.attention.output",  # rwkv6
1004            "model.layers.{bid}.self_attn.o_proj", # rwkv6qwen2
1005            "model.layers.{bid}.attention.output", # rwkv7
1006            "model.layers.{bid}.attention.o_proj", # rwkv7
1007        ),
1008
1009        MODEL_TENSOR.CHANNEL_MIX_LERP_K: (
1010            "rwkv.blocks.{bid}.feed_forward.time_maa_k", # rwkv6
1011            "model.layers.{bid}.feed_forward.x_k",       # rwkv7
1012        ),
1013
1014        MODEL_TENSOR.CHANNEL_MIX_LERP_R: (
1015            "rwkv.blocks.{bid}.feed_forward.time_maa_r", # rwkv6
1016        ),
1017
1018        MODEL_TENSOR.CHANNEL_MIX_KEY: (
1019            "rwkv.blocks.{bid}.feed_forward.key",  # rwkv6
1020            "model.layers.{bid}.feed_forward.key", # rwkv7
1021        ),
1022
1023        MODEL_TENSOR.CHANNEL_MIX_RECEPTANCE: (
1024            "rwkv.blocks.{bid}.feed_forward.receptance", # rwkv6
1025        ),
1026
1027        MODEL_TENSOR.CHANNEL_MIX_VALUE: (
1028            "rwkv.blocks.{bid}.feed_forward.value",  # rwkv6
1029            "model.layers.{bid}.feed_forward.value", # rwkv7
1030        ),
1031
1032        MODEL_TENSOR.ATTN_Q_A: (
1033            "model.layers.{bid}.self_attn.q_a_proj", # deepseek2
1034            "layers.{bid}.attention.wq_a",           # mistral-large
1035        ),
1036
1037        MODEL_TENSOR.ATTN_Q_B: (
1038            "model.layers.{bid}.self_attn.q_b_proj", # deepseek2
1039            "layers.{bid}.attention.wq_b",           # mistral-large
1040        ),
1041
1042        MODEL_TENSOR.ATTN_KV_A_MQA: (
1043            "model.layers.{bid}.self_attn.kv_a_proj_with_mqa", # deepseek2
1044            "layers.{bid}.attention.wkv_a_with_mqa",           # mistral-large
1045        ),
1046
1047        MODEL_TENSOR.ATTN_KV_B: (
1048            "model.layers.{bid}.self_attn.kv_b_proj", # deepseek2
1049        ),
1050
1051        MODEL_TENSOR.ATTN_K_B: (
1052            "model.layers.{bid}.self_attn.k_b_proj",  # deepseek2
1053            "layers.{bid}.attention.k_b_proj",        # mistral-large
1054        ),
1055
1056        MODEL_TENSOR.ATTN_V_B: (
1057            "model.layers.{bid}.self_attn.v_b_proj",  # deepseek2
1058            "layers.{bid}.attention.v_b_proj",        # mistral-large
1059        ),
1060
1061        MODEL_TENSOR.ATTN_Q_A_NORM: (
1062            "model.layers.{bid}.self_attn.q_a_layernorm", # deepseek2
1063            "layers.{bid}.attention.q_a_norm",            # mistral-large
1064        ),
1065
1066        MODEL_TENSOR.ATTN_KV_A_NORM: (
1067            "model.layers.{bid}.self_attn.kv_a_layernorm", # deepseek2
1068            "layers.{bid}.attention.kv_a_norm",            # mistral-large
1069        ),
1070
1071        MODEL_TENSOR.ATTN_SUB_NORM: (
1072            "model.layers.{bid}.self_attn.inner_attn_ln",  # bitnet
1073        ),
1074
1075        MODEL_TENSOR.FFN_SUB_NORM: (
1076            "model.layers.{bid}.mlp.ffn_layernorm",  # bitnet
1077        ),
1078
1079        MODEL_TENSOR.DEC_ATTN_NORM: (
1080            "decoder.block.{bid}.layer.0.layer_norm", # t5
1081        ),
1082
1083        MODEL_TENSOR.DEC_ATTN_Q: (
1084            "decoder.block.{bid}.layer.0.SelfAttention.q", # t5
1085        ),
1086
1087        MODEL_TENSOR.DEC_ATTN_K: (
1088            "decoder.block.{bid}.layer.0.SelfAttention.k", # t5
1089        ),
1090
1091        MODEL_TENSOR.DEC_ATTN_V: (
1092            "decoder.block.{bid}.layer.0.SelfAttention.v", # t5
1093        ),
1094
1095        MODEL_TENSOR.DEC_ATTN_OUT: (
1096            "decoder.block.{bid}.layer.0.SelfAttention.o", # t5
1097        ),
1098
1099        MODEL_TENSOR.DEC_ATTN_REL_B: (
1100            "decoder.block.{bid}.layer.0.SelfAttention.relative_attention_bias", # t5
1101        ),
1102
1103        MODEL_TENSOR.DEC_CROSS_ATTN_NORM: (
1104            "decoder.block.{bid}.layer.1.layer_norm", # t5
1105        ),
1106
1107        MODEL_TENSOR.DEC_CROSS_ATTN_Q: (
1108            "decoder.block.{bid}.layer.1.EncDecAttention.q", # t5
1109        ),
1110
1111        MODEL_TENSOR.DEC_CROSS_ATTN_K: (
1112            "decoder.block.{bid}.layer.1.EncDecAttention.k", # t5
1113        ),
1114
1115        MODEL_TENSOR.DEC_CROSS_ATTN_V: (
1116            "decoder.block.{bid}.layer.1.EncDecAttention.v", # t5
1117        ),
1118
1119        MODEL_TENSOR.DEC_CROSS_ATTN_OUT: (
1120            "decoder.block.{bid}.layer.1.EncDecAttention.o", # t5
1121        ),
1122
1123        MODEL_TENSOR.DEC_CROSS_ATTN_REL_B: (
1124            "decoder.block.{bid}.layer.1.EncDecAttention.relative_attention_bias", # t5
1125        ),
1126
1127        MODEL_TENSOR.DEC_FFN_NORM: (
1128            "decoder.block.{bid}.layer.2.layer_norm", # t5
1129        ),
1130
1131        MODEL_TENSOR.DEC_FFN_GATE: (
1132            "decoder.block.{bid}.layer.2.DenseReluDense.wi_0", # flan-t5
1133        ),
1134
1135        MODEL_TENSOR.DEC_FFN_UP: (
1136            "decoder.block.{bid}.layer.2.DenseReluDense.wi",   # t5
1137            "decoder.block.{bid}.layer.2.DenseReluDense.wi_1", # flan-t5
1138        ),
1139
1140        MODEL_TENSOR.DEC_FFN_DOWN: (
1141            "decoder.block.{bid}.layer.2.DenseReluDense.wo", # t5
1142        ),
1143
1144        MODEL_TENSOR.DEC_OUTPUT_NORM: (
1145            "decoder.final_layer_norm", # t5
1146        ),
1147
1148        MODEL_TENSOR.ENC_ATTN_NORM: (
1149            "encoder.block.{bid}.layer.0.layer_norm", # t5
1150        ),
1151
1152        MODEL_TENSOR.ENC_ATTN_Q: (
1153            "encoder.block.{bid}.layer.0.SelfAttention.q", # t5
1154        ),
1155
1156        MODEL_TENSOR.ENC_ATTN_K: (
1157            "encoder.block.{bid}.layer.0.SelfAttention.k", # t5
1158        ),
1159
1160        MODEL_TENSOR.ENC_ATTN_V: (
1161            "encoder.block.{bid}.layer.0.SelfAttention.v", # t5
1162        ),
1163
1164        MODEL_TENSOR.ENC_ATTN_OUT: (
1165            "encoder.block.{bid}.layer.0.SelfAttention.o", # t5
1166        ),
1167
1168        MODEL_TENSOR.ENC_ATTN_REL_B: (
1169            "encoder.block.{bid}.layer.0.SelfAttention.relative_attention_bias", # t5
1170        ),
1171
1172        MODEL_TENSOR.ENC_FFN_NORM: (
1173            "encoder.block.{bid}.layer.1.layer_norm", # t5
1174        ),
1175
1176        MODEL_TENSOR.ENC_FFN_GATE: (
1177            "encoder.block.{bid}.layer.1.DenseReluDense.wi_0", # flan-t5
1178        ),
1179
1180        MODEL_TENSOR.ENC_FFN_UP: (
1181            "encoder.block.{bid}.layer.1.DenseReluDense.wi",   # t5
1182            "encoder.block.{bid}.layer.1.DenseReluDense.wi_1", # flan-t5
1183        ),
1184
1185        MODEL_TENSOR.ENC_FFN_DOWN: (
1186            "encoder.block.{bid}.layer.1.DenseReluDense.wo", # t5
1187        ),
1188
1189        MODEL_TENSOR.VISEXP_UP: (
1190            "model.layers.{bid}.mlp.vision_mlp.up_proj",  # cogvlm
1191        ),
1192
1193        MODEL_TENSOR.VISEXP_GATE: (
1194            "model.layers.{bid}.mlp.vision_mlp.gate_proj",  # cogvlm
1195        ),
1196
1197        MODEL_TENSOR.VISEXP_DOWN: (
1198            "model.layers.{bid}.mlp.vision_mlp.down_proj",  # cogvlm
1199        ),
1200
1201        MODEL_TENSOR.VISEXP_ATTN_OUT: (
1202            "model.layers.{bid}.self_attn.vision_expert_dense",  # cogvlm
1203        ),
1204
1205        MODEL_TENSOR.VISEXP_ATTN_QKV: (
1206            "model.layers.{bid}.self_attn.vision_expert_query_key_value",  # cogvlm
1207        ),
1208
1209        ############################################################################
1210        # TODO: these do not belong to block_mappings_cfg - move them to mappings_cfg
1211        MODEL_TENSOR.ENC_OUTPUT_NORM: (
1212            "encoder.final_layer_norm", # t5
1213            "layer_norm",               # neobert
1214        ),
1215
1216        MODEL_TENSOR.CLS: (
1217            "classifier",       # jina
1218            "classifier.dense", # roberta
1219            "pre_classifier",   # distillbert
1220            "dense",            # neobert
1221            "head.dense",       # modern-bert
1222        ),
1223
1224        MODEL_TENSOR.CLS_OUT: (
1225            "classifier.out_proj", # roberta
1226        ),
1227        #############################################################################
1228
1229        MODEL_TENSOR.CONVNEXT_DW: (
1230            "backbone.convnext.{bid}.dwconv", # wavtokenizer
1231        ),
1232
1233        MODEL_TENSOR.CONVNEXT_NORM: (
1234            "backbone.convnext.{bid}.norm", # wavtokenizer
1235        ),
1236
1237        MODEL_TENSOR.CONVNEXT_PW1: (
1238            "backbone.convnext.{bid}.pwconv1", # wavtokenizer
1239        ),
1240
1241        MODEL_TENSOR.CONVNEXT_PW2: (
1242            "backbone.convnext.{bid}.pwconv2", # wavtokenizer
1243        ),
1244
1245        MODEL_TENSOR.CONVNEXT_GAMMA: (
1246            "backbone.convnext.{bid}.gamma", # wavtokenizer
1247        ),
1248
1249        MODEL_TENSOR.POSNET_CONV1: (
1250            "backbone.posnet.{bid}.conv1", # wavtokenizer
1251        ),
1252
1253        MODEL_TENSOR.POSNET_CONV2: (
1254            "backbone.posnet.{bid}.conv2", # wavtokenizer
1255        ),
1256
1257        MODEL_TENSOR.POSNET_NORM: (
1258            "backbone.posnet.{bid}.norm", # wavtokenizer
1259        ),
1260
1261        MODEL_TENSOR.POSNET_NORM1: (
1262            "backbone.posnet.{bid}.norm1", # wavtokenizer
1263        ),
1264
1265        MODEL_TENSOR.POSNET_NORM2: (
1266            "backbone.posnet.{bid}.norm2", # wavtokenizer
1267        ),
1268
1269        MODEL_TENSOR.POSNET_ATTN_NORM: (
1270            "backbone.posnet.{bid}.norm", # wavtokenizer
1271        ),
1272
1273        MODEL_TENSOR.POSNET_ATTN_Q: (
1274            "backbone.posnet.{bid}.q", # wavtokenizer
1275        ),
1276
1277        MODEL_TENSOR.POSNET_ATTN_K: (
1278            "backbone.posnet.{bid}.k", # wavtokenizer
1279        ),
1280
1281        MODEL_TENSOR.POSNET_ATTN_V: (
1282            "backbone.posnet.{bid}.v", # wavtokenizer
1283        ),
1284
1285        MODEL_TENSOR.POSNET_ATTN_OUT: (
1286            "backbone.posnet.{bid}.proj_out", # wavtokenizer
1287        ),
1288
1289        MODEL_TENSOR.SHORTCONV_CONV: (
1290            "model.layers.{bid}.conv.conv",
1291        ),
1292
1293        MODEL_TENSOR.SHORTCONV_INPROJ: (
1294            "model.layers.{bid}.conv.in_proj",
1295        ),
1296
1297        MODEL_TENSOR.SHORTCONV_OUTPROJ: (
1298            "model.layers.{bid}.conv.out_proj",
1299        ),
1300
1301        #############################################################################
1302        ## Vision encoder
1303
1304        MODEL_TENSOR.V_MMPROJ: (
1305            "multi_modal_projector.linear_{bid}",
1306            "mm_projector.proj.linear_{bid}", # Kimi-K2.5
1307            "visual.merger.mlp.{bid}", # qwen2vl
1308            "merger.mlp.{bid}",
1309        ),
1310
1311        MODEL_TENSOR.V_MMPROJ_FC: (
1312            "model.connector.modality_projection.proj", # SmolVLM
1313            "model.vision.linear_proj.linear_proj", # cogvlm
1314            "visual.merger.proj", # glm4v
1315        ),
1316
1317        MODEL_TENSOR.V_MMPROJ_MLP: (
1318            "model.mm_projector.mlp.mlp.{bid}",
1319            "vision_model.vision_adapter.mlp.fc{bid}", # llama 4
1320            "mlp1.{bid}", # InternVL
1321            "model.aligner.fc1.hidden_layers.{bid}", # Janus Pro
1322        ),
1323
1324        MODEL_TENSOR.V_MMPROJ_PEG: (
1325            "model.mm_projector.peg.peg.{bid}",
1326        ),
1327
1328        MODEL_TENSOR.V_ENC_EMBD_CLS: (
1329            "vision_tower.vision_model.embeddings.class_embedding",
1330            "model.vision_tower.embeddings.cls_token", # Intern-S1
1331            "vision_model.class_embedding", # llama 4
1332            "model.vision.patch_embedding.cls_embedding", # cogvlm
1333        ),
1334
1335        MODEL_TENSOR.V_ENC_EMBD_PATCH: (
1336            "vision_tower.vision_model.embeddings.patch_embedding",
1337            "model.vision_tower.embeddings.patch_embeddings.projection", # Intern-S1
1338            "vpm.embeddings.patch_embedding",
1339            "model.vision_model.embeddings.patch_embedding", # SmolVLM
1340            "vision_tower.patch_conv", # pixtral-hf
1341            "vision_encoder.patch_conv", # pixtral
1342            "vision_model.patch_embedding.linear", # llama 4
1343            "visual.patch_embed.proj", # qwen2vl
1344            "vision_tower.patch_embed.proj", # kimi-vl
1345            "model.vision.patch_embedding.proj", # cogvlm
1346            "siglip2.vision_model.embeddings.patch_embedding",
1347        ),
1348
1349        MODEL_TENSOR.V_ENC_EMBD_NORM: (
1350            "visual.post_conv_layernorm", # glm4v
1351        ),
1352
1353        MODEL_TENSOR.V_ENC_EMBD_POS: (
1354            "vision_tower.vision_model.embeddings.position_embedding",
1355            "model.vision_tower.embeddings.position_embeddings", # Intern-S1
1356            "vpm.embeddings.position_embedding",
1357            "model.vision_model.embeddings.position_embedding", # SmolVLM
1358            "vision_model.positional_embedding_vlm", # llama 4
1359            "vision_tower.patch_embed.pos_emb", # kimi-vl
1360            "visual.pos_embed", # qwen3vl
1361            "model.vision.patch_embedding.position_embedding", # cogvlm
1362            "visual.embeddings.position_embedding", # glm4v
1363        ),
1364
1365        MODEL_TENSOR.V_ENC_ATTN_QKV: (
1366            "visual.blocks.{bid}.attn.qkv", # qwen3vl
1367            "model.vision.transformer.layers.{bid}.attention.query_key_value", # cogvlm
1368            "vision_tower.encoder.blocks.{bid}.wqkv" # Kimi-K2.5
1369        ),
1370
1371        MODEL_TENSOR.V_ENC_ATTN_Q: (
1372            "vision_tower.vision_model.encoder.layers.{bid}.self_attn.q_proj",
1373            "model.vision_tower.encoder.layer.{bid}.attention.q_proj", # Intern-S1
1374            "vpm.encoder.layers.{bid}.self_attn.q_proj",
1375            "model.vision_model.encoder.layers.{bid}.self_attn.q_proj", # SmolVLM
1376            "vision_model.model.layers.{bid}.self_attn.q_proj", # llama4
1377            "vision_tower.transformer.layers.{bid}.attention.q_proj", # pixtral-hf
1378            "vision_encoder.transformer.layers.{bid}.attention.wq", # pixtral
1379            "visual.blocks.{bid}.attn.q", # qwen2vl, generated
1380            "vision_tower.encoder.blocks.{bid}.wq", # kimi-vl, generated
1381            "siglip2.vision_model.encoder.layers.{bid}.self_attn.q_proj", # youtuvl
1382        ),
1383
1384        MODEL_TENSOR.V_ENC_ATTN_Q_NORM: (
1385            "vision_tower.vision_model.encoder.layers.{bid}.attn.q_norm", # InternVL
1386            "model.vision_tower.encoder.layer.{bid}.attention.q_norm", # Intern-S1
1387        ),
1388
1389        MODEL_TENSOR.V_ENC_ATTN_K: (
1390            "vision_tower.vision_model.encoder.layers.{bid}.self_attn.k_proj",
1391            "model.vision_tower.encoder.layer.{bid}.attention.k_proj", # Intern-S1
1392            "vpm.encoder.layers.{bid}.self_attn.k_proj",
1393            "model.vision_model.encoder.layers.{bid}.self_attn.k_proj", # SmolVLM
1394            "vision_model.model.layers.{bid}.self_attn.k_proj", # llama4
1395            "vision_tower.transformer.layers.{bid}.attention.k_proj", # pixtral-hf
1396            "vision_encoder.transformer.layers.{bid}.attention.wk", # pixtral
1397            "visual.blocks.{bid}.attn.k", # qwen2vl, generated
1398            "vision_tower.encoder.blocks.{bid}.wk", # kimi-vl, generated
1399            "siglip2.vision_model.encoder.layers.{bid}.self_attn.k_proj",
1400        ),
1401
1402        MODEL_TENSOR.V_ENC_ATTN_K_NORM: (
1403            "vision_tower.vision_model.encoder.layers.{bid}.attn.k_norm", # InternVL
1404            "model.vision_tower.encoder.layer.{bid}.attention.k_norm", # Intern-S1
1405        ),
1406
1407        MODEL_TENSOR.V_ENC_ATTN_V: (
1408            "vision_tower.vision_model.encoder.layers.{bid}.self_attn.v_proj",
1409            "model.vision_tower.encoder.layer.{bid}.attention.v_proj", # Intern-S1
1410            "vpm.encoder.layers.{bid}.self_attn.v_proj",
1411            "model.vision_model.encoder.layers.{bid}.self_attn.v_proj", # SmolVLM
1412            "vision_model.model.layers.{bid}.self_attn.v_proj", # llama4
1413            "vision_tower.transformer.layers.{bid}.attention.v_proj", # pixtral-hf
1414            "vision_encoder.transformer.layers.{bid}.attention.wv", # pixtral
1415            "visual.blocks.{bid}.attn.v", # qwen2vl, generated
1416            "vision_tower.encoder.blocks.{bid}.wv", # kimi-vl, generated
1417            "siglip2.vision_model.encoder.layers.{bid}.self_attn.v_proj",
1418        ),
1419
1420        MODEL_TENSOR.V_ENC_INPUT_NORM: (
1421            "vision_tower.vision_model.encoder.layers.{bid}.layer_norm1",
1422            "vision_tower.vision_model.encoder.layers.{bid}.norm1", # InternVL
1423            "model.vision_tower.encoder.layer.{bid}.layernorm_before", # Intern-S1
1424            "vpm.encoder.layers.{bid}.layer_norm1",
1425            "model.vision_model.encoder.layers.{bid}.layer_norm1", # SmolVLM
1426            "vision_tower.transformer.layers.{bid}.attention_norm", # pixtral-hf
1427            "vision_encoder.transformer.layers.{bid}.attention_norm", # pixtral
1428            "vision_model.model.layers.{bid}.input_layernorm", # llama4
1429            "visual.blocks.{bid}.norm1", # qwen2vl
1430            "vision_tower.encoder.blocks.{bid}.norm0", # kimi-vl (norm0/norm1)
1431            "model.vision.transformer.layers.{bid}.input_layernorm", # cogvlm
1432            "siglip2.vision_model.encoder.layers.{bid}.layer_norm1",
1433        ),
1434
1435        MODEL_TENSOR.V_ENC_ATTN_O: (
1436            "vision_tower.vision_model.encoder.layers.{bid}.self_attn.out_proj",
1437            "vision_tower.vision_model.encoder.layers.{bid}.attn.proj", # InternVL
1438            "model.vision_tower.encoder.layer.{bid}.attention.projection_layer", # Intern-S1
1439            "vpm.encoder.layers.{bid}.self_attn.out_proj",
1440            "model.vision_model.encoder.layers.{bid}.self_attn.out_proj", # SmolVLM
1441            "model.vision_model.encoder.layers.{bid}.self_attn.projection_layer", # Janus Pro
1442            "vision_model.model.layers.{bid}.self_attn.o_proj", # llama4
1443            "vision_tower.transformer.layers.{bid}.attention.o_proj", # pixtral-hf
1444            "vision_encoder.transformer.layers.{bid}.attention.wo", # pixtral
1445            "visual.blocks.{bid}.attn.proj", # qwen2vl
1446            "vision_tower.encoder.blocks.{bid}.wo", # kimi-vl
1447            "model.vision.transformer.layers.{bid}.attention.dense", # cogvlm
1448            "siglip2.vision_model.encoder.layers.{bid}.self_attn.out_proj", # youtuvl
1449        ),
1450
1451        MODEL_TENSOR.V_ENC_POST_ATTN_NORM: (
1452            "vision_tower.vision_model.encoder.layers.{bid}.layer_norm2",
1453            "vision_tower.vision_model.encoder.layers.{bid}.norm2", # InternVL
1454            "model.vision_tower.encoder.layer.{bid}.layernorm_after", # Intern-S1
1455            "vpm.encoder.layers.{bid}.layer_norm2",
1456            "model.vision_model.encoder.layers.{bid}.layer_norm2", # SmolVLM
1457            "vision_model.model.layers.{bid}.post_attention_layernorm", # llama4
1458            "vision_tower.transformer.layers.{bid}.ffn_norm", # pixtral-hf
1459            "vision_encoder.transformer.layers.{bid}.ffn_norm", # pixtral
1460            "visual.blocks.{bid}.norm2", # qwen2vl
1461            "vision_tower.encoder.blocks.{bid}.norm1", # kimi-vl (norm0/norm1)
1462            "model.vision.transformer.layers.{bid}.post_attention_layernorm", # cogvlm
1463            "siglip2.vision_model.encoder.layers.{bid}.layer_norm2",
1464        ),
1465
1466        MODEL_TENSOR.V_ENC_FFN_UP: (
1467            "vision_tower.vision_model.encoder.layers.{bid}.mlp.fc1",
1468            "model.vision_tower.encoder.layer.{bid}.mlp.fc1", # Intern-S1
1469            "vpm.encoder.layers.{bid}.mlp.fc1",
1470            "model.vision_model.encoder.layers.{bid}.mlp.fc1", # SmolVLM, gemma3
1471            "vision_tower.transformer.layers.{bid}.feed_forward.up_proj", # pixtral-hf
1472            "vision_encoder.transformer.layers.{bid}.feed_forward.w3", # pixtral
1473            "vision_model.model.layers.{bid}.mlp.fc1", # llama4
1474            "visual.blocks.{bid}.mlp.fc1", # qwen2vl
1475            "visual.blocks.{bid}.mlp.up_proj", # qwen2.5vl
1476            "visual.blocks.{bid}.mlp.linear_fc1", # qwen3vl
1477            "vision_tower.encoder.blocks.{bid}.mlp.fc0", # kimi-vl (fc0/fc1)
1478            "model.vision.transformer.layers.{bid}.mlp.fc1", # cogvlm
1479            "siglip2.vision_model.encoder.layers.{bid}.mlp.fc1",
1480        ),
1481
1482        MODEL_TENSOR.V_ENC_FFN_GATE: (
1483            "vision_tower.transformer.layers.{bid}.feed_forward.gate_proj", # pixtral-hf
1484            "vision_encoder.transformer.layers.{bid}.feed_forward.w1", # pixtral
1485            "visual.blocks.{bid}.mlp.gate_proj", # qwen2.5vl
1486        ),
1487
1488        MODEL_TENSOR.V_ENC_FFN_DOWN: (
1489            "vision_tower.vision_model.encoder.layers.{bid}.mlp.fc2",
1490            "model.vision_tower.encoder.layer.{bid}.mlp.fc2", # Intern-S1
1491            "vpm.encoder.layers.{bid}.mlp.fc2",
1492            "model.vision_model.encoder.layers.{bid}.mlp.fc2", # SmolVLM, gemma3
1493            "vision_tower.transformer.layers.{bid}.feed_forward.down_proj", # pixtral-hf
1494            "vision_encoder.transformer.layers.{bid}.feed_forward.w2", # pixtral
1495            "vision_model.model.layers.{bid}.mlp.fc2", # llama4
1496            "visual.blocks.{bid}.mlp.fc2", # qwen2vl
1497            "visual.blocks.{bid}.mlp.down_proj", # qwen2.5vl
1498            "visual.blocks.{bid}.mlp.linear_fc2", # qwen3vl
1499            "vision_tower.encoder.blocks.{bid}.mlp.fc1", # kimi-vl (fc0/fc1)
1500            "model.vision.transformer.layers.{bid}.mlp.fc2", # cogvlm
1501            "siglip2.vision_model.encoder.layers.{bid}.mlp.fc2",
1502        ),
1503
1504        MODEL_TENSOR.V_LAYER_SCALE_1: (
1505            "vision_tower.vision_model.encoder.layers.{bid}.ls1", # InternVL
1506            "model.vision_tower.encoder.layer.{bid}.lambda_1", # Intern-S1
1507        ),
1508
1509        MODEL_TENSOR.V_LAYER_SCALE_2: (
1510            "vision_tower.vision_model.encoder.layers.{bid}.ls2", # InternVL
1511            "model.vision_tower.encoder.layer.{bid}.lambda_2", # Intern-S1
1512        ),
1513
1514        MODEL_TENSOR.V_PRE_NORM: (
1515            "vision_tower.vision_model.pre_layrnorm",
1516            "vision_tower.ln_pre", # pixtral-hf
1517            "vision_encoder.ln_pre", # pixtral
1518            "vision_model.layernorm_pre", # llama4
1519        ),
1520
1521        MODEL_TENSOR.V_POST_NORM: (
1522            "vision_tower.vision_model.post_layernorm",
1523            "model.vision_model.post_layernorm", # SmolVLM
1524            "vision_model.layernorm_post", # llama4
1525            "visual.merger.ln_q", # qwen2vl
1526            "vision_tower.encoder.final_layernorm", # kimi-vl
1527            "visual.post_layernorm", # glm4v
1528            "siglip2.vision_model.post_layernorm",
1529        ),
1530
1531        MODEL_TENSOR.V_MM_POST_NORM: (
1532            "visual.merger.post_projection_norm", # glm4v
1533        ),
1534
1535        MODEL_TENSOR.V_MM_INP_PROJ: (
1536            "multi_modal_projector.mm_input_projection",
1537        ),
1538
1539        MODEL_TENSOR.V_MM_INP_NORM: (
1540            "multi_modal_projector.norm",
1541            "multi_modal_projector.layer_norm",
1542            "multi_modal_projector.pre_norm",
1543            "mm_projector.pre_norm", # Kimi-K2.5
1544            "pre_mm_projector_norm",
1545            "model.vision.linear_proj.norm1", # cogvlm
1546            "merger.ln_q",
1547        ),
1548
1549        MODEL_TENSOR.V_MM_SOFT_EMB_NORM: (
1550            "multi_modal_projector.mm_soft_emb_norm",
1551        ),
1552
1553        MODEL_TENSOR.V_RESMPL_POS_EMBD_K: (
1554            "resampler.pos_embed_k",
1555        ),
1556
1557        MODEL_TENSOR.V_RESMPL_ATTN_Q: (
1558            "resampler.attn.in_proj_q", # tensor generated from resampler.attn.in_proj
1559        ),
1560
1561        MODEL_TENSOR.V_RESMPL_ATTN_K: (
1562            "resampler.attn.in_proj_k", # tensor generated from resampler.attn.in_proj
1563        ),
1564
1565        MODEL_TENSOR.V_RESMPL_ATTN_V: (
1566            "resampler.attn.in_proj_v", # tensor generated from resampler.attn.in_proj
1567        ),
1568
1569        MODEL_TENSOR.V_RESMPL_ATTN_OUT: (
1570            "resampler.attn.out_proj",
1571        ),
1572
1573        MODEL_TENSOR.V_RESMPL_KV: (
1574            "resampler.kv_proj",
1575        ),
1576
1577        MODEL_TENSOR.V_RESMPL_POST_NORM: (
1578            "resampler.ln_post",
1579        ),
1580
1581        MODEL_TENSOR.V_RESMPL_KV_NORM: (
1582            "resampler.ln_kv",
1583        ),
1584
1585        MODEL_TENSOR.V_RESMPL_Q_NORM: (
1586            "resampler.ln_q",
1587        ),
1588
1589        MODEL_TENSOR.V_RESMPL_PROJ: (
1590            "resampler.proj",
1591        ),
1592
1593        MODEL_TENSOR.V_RESMPL_QUERY: (
1594            "resampler.query",
1595        ),
1596
1597        MODEL_TENSOR.V_TOK_EMBD_IMG_BREAK: (
1598            "v.token_embd.img_break", # for pixtral, this is a generated vector
1599        ),
1600
1601        MODEL_TENSOR.V_MM_PATCH_MERGER: (
1602            "multi_modal_projector.patch_merger.merging_layer", # mistral small 3.1 - hf
1603            "patch_merger.merging_layer", # mistral
1604            "visual.downsample", # glm4v
1605        ),
1606
1607        MODEL_TENSOR.V_DS_NORM: (
1608            "model.visual.deepstack_merger_list.{bid}.norm", # deepstack in qwen3vl
1609        ),
1610
1611        MODEL_TENSOR.V_DS_FC1: (
1612            "model.visual.deepstack_merger_list.{bid}.linear_fc1", # deepstack in qwen3vl
1613        ),
1614
1615        MODEL_TENSOR.V_DS_FC2: (
1616            "model.visual.deepstack_merger_list.{bid}.linear_fc2", # deepstack in qwen3vl
1617        ),
1618
1619        MODEL_TENSOR.V_MM_POST_FC_NORM: (
1620            "model.vision.linear_proj.norm1", # cogvlm
1621        ),
1622
1623        MODEL_TENSOR.V_MM_UP: (
1624            "model.vision.linear_proj.dense_h_to_4h", # cogvlm
1625            "visual.merger.up_proj", # glm4v
1626        ),
1627
1628        MODEL_TENSOR.V_MM_DOWN: (
1629            "model.vision.linear_proj.dense_4h_to_h", # cogvlm
1630            "visual.merger.down_proj", # glm4v
1631        ),
1632
1633        MODEL_TENSOR.V_MM_GATE: (
1634            "model.vision.linear_proj.gate_proj", # cogvlm
1635            "visual.merger.gate_proj", # glm4v
1636        ),
1637
1638        MODEL_TENSOR.V_TOK_BOI: (
1639            "model.vision.boi", # cogvlm
1640        ),
1641
1642        MODEL_TENSOR.V_TOK_EOI: (
1643            "model.vision.eoi", # cogvlm
1644        ),
1645
1646        # audio (mtmd)
1647
1648        MODEL_TENSOR.A_ENC_EMBD_POS: (
1649            "audio_tower.embed_positions", # ultravox
1650            "audio_embedding.embedding", # lfm2
1651        ),
1652
1653        MODEL_TENSOR.A_ENC_EMBD_NORM: (
1654            "audio_embedding.embedding_norm", # lfm2
1655        ),
1656
1657        MODEL_TENSOR.A_ENC_EMBD_TO_LOGITS: (
1658            "audio_embedding.to_logits", # lfm2
1659        ),
1660
1661        MODEL_TENSOR.A_ENC_CONV1D: (
1662            "audio_tower.conv{bid}", # ultravox
1663            "conformer.pre_encode.conv.{bid}", # lfm2
1664            "model.audio_tower.subsample_conv_projection.conv_{bid}.conv", # gemma3n
1665        ),
1666
1667        MODEL_TENSOR.A_ENC_CONV1D_NORM: (
1668            "model.audio_tower.subsample_conv_projection.conv_{bid}.norm", # gemma3n
1669        ),
1670
1671        MODEL_TENSOR.A_PRE_NORM: (),
1672
1673        MODEL_TENSOR.A_POST_NORM: (
1674            "audio_tower.layer_norm", # ultravox
1675            "audio_tower.ln_post", # qwen2omni
1676        ),
1677
1678        MODEL_TENSOR.A_ENC_ATTN_Q: (
1679            "audio_tower.layers.{bid}.self_attn.q_proj", # ultravox
1680            "conformer.layers.{bid}.self_attn.linear_q", # lfm2
1681            "conformer.layers.{bid}.attention.attn.q_proj", # gemma3n
1682        ),
1683
1684        MODEL_TENSOR.A_ENC_ATTN_K: (
1685            "audio_tower.layers.{bid}.self_attn.k_proj", # ultravox
1686            "conformer.layers.{bid}.self_attn.linear_k", # lfm2
1687            "conformer.layers.{bid}.attention.attn.k_proj", # gemma3n
1688        ),
1689
1690        MODEL_TENSOR.A_ENC_ATTN_V: (
1691            "audio_tower.layers.{bid}.self_attn.v_proj", # ultravox
1692            "conformer.layers.{bid}.self_attn.linear_v", # lfm2
1693            "conformer.layers.{bid}.attention.attn.v_proj", # gemma3n
1694        ),
1695
1696        MODEL_TENSOR.A_ENC_PER_DIM_SCALE: (
1697            "conformer.layers.{bid}.attention.attn.per_dim_scale", # gemma3n
1698        ),
1699
1700        MODEL_TENSOR.A_ENC_LAYER_PRE_NORM: (
1701            "conformer.layers.{bid}.norm", # gemma3n
1702        ),
1703
1704        MODEL_TENSOR.A_ENC_INPUT_NORM: (
1705            "audio_tower.layers.{bid}.self_attn_layer_norm", # ultravox
1706            "conformer.layers.{bid}.norm_self_att", # lfm2
1707            "conformer.layers.{bid}.attention.pre_attn_norm", # gemma3n
1708        ),
1709
1710        MODEL_TENSOR.A_ENC_OUTPUT: (
1711            "audio_tower.layers.{bid}.self_attn.out_proj", # ultravox
1712            "conformer.layers.{bid}.self_attn.linear_out", # lfm2
1713            "conformer.layers.{bid}.attention.post", # gemma3n
1714        ),
1715
1716        MODEL_TENSOR.A_ENC_OUTPUT_NORM: (
1717            "audio_tower.layers.{bid}.final_layer_norm", # ultravox
1718            "conformer.layers.{bid}.norm_out", # lfm2
1719            "conformer.layers.{bid}.attention.post_norm", # gemma3n
1720        ),
1721
1722        MODEL_TENSOR.A_ENC_FFN_NORM: (
1723            "conformer.layers.{bid}.norm_feed_forward1", # lfm2
1724            "conformer.layers.{bid}.ffw_layer_start.pre_layer_norm", # gemma3n
1725        ),
1726
1727        MODEL_TENSOR.A_ENC_FFN_POST_NORM: (
1728            "conformer.layers.{bid}.ffw_layer_start.post_layer_norm", # gemma3n
1729        ),
1730
1731        MODEL_TENSOR.A_ENC_FFN_SCALE: (
1732            "conformer.layers.{bid}.ffw_layer_start.post_layer_scale", # gemma3n
1733        ),
1734
1735        MODEL_TENSOR.A_ENC_FFN_UP: (
1736            "audio_tower.layers.{bid}.fc1", # ultravox
1737            "conformer.layers.{bid}.feed_forward1.linear1", # lfm2
1738            "conformer.layers.{bid}.ffw_layer_start.ffw_layer_1", # gemma3n
1739        ),
1740
1741        MODEL_TENSOR.A_ENC_FFN_GATE: (),
1742
1743        MODEL_TENSOR.A_ENC_FFN_DOWN: (
1744            "audio_tower.layers.{bid}.fc2", # ultravox
1745            "conformer.layers.{bid}.feed_forward1.linear2", # lfm2
1746            "conformer.layers.{bid}.ffw_layer_start.ffw_layer_2", # gemma3n
1747        ),
1748
1749        MODEL_TENSOR.A_ENC_FFN_UP_1: (
1750            "conformer.layers.{bid}.feed_forward2.linear1", # lfm2
1751            "conformer.layers.{bid}.ffw_layer_end.ffw_layer_1", # gemma3n
1752        ),
1753
1754        MODEL_TENSOR.A_ENC_FFN_DOWN_1: (
1755            "conformer.layers.{bid}.feed_forward2.linear2", # lfm2
1756            "conformer.layers.{bid}.ffw_layer_end.ffw_layer_2", # gemma3n
1757        ),
1758
1759        MODEL_TENSOR.A_ENC_FFN_NORM_1: (
1760            "conformer.layers.{bid}.norm_feed_forward2", # lfm2
1761            "conformer.layers.{bid}.ffw_layer_end.pre_layer_norm", # gemma3n
1762        ),
1763
1764        MODEL_TENSOR.A_ENC_FFN_POST_NORM_1: (
1765            "conformer.layers.{bid}.ffw_layer_end.post_layer_norm", # gemma3n
1766        ),
1767
1768        MODEL_TENSOR.A_ENC_FFN_SCALE_1: (
1769            "conformer.layers.{bid}.ffw_layer_end.post_layer_scale", # gemma3n
1770        ),
1771
1772        MODEL_TENSOR.A_ENC_LINEAR_POS: (
1773            "conformer.layers.{bid}.self_attn.linear_pos", # lfm2
1774            "conformer.layers.{bid}.attention.attn.relative_position_embedding.pos_proj", # gemma3n
1775        ),
1776
1777        MODEL_TENSOR.A_ENC_POS_BIAS_U: (
1778            "conformer.layers.{bid}.self_attn.pos_bias_u", # lfm2
1779        ),
1780
1781        MODEL_TENSOR.A_ENC_POS_BIAS_V: (
1782            "conformer.layers.{bid}.self_attn.pos_bias_v", # lfm2
1783        ),
1784
1785        MODEL_TENSOR.A_ENC_OUT: (
1786            "conformer.pre_encode.out", # lfm2
1787            "model.audio_tower.subsample_conv_projection.input_proj_linear", # gemma3n
1788        ),
1789
        # note: some tensors below have an "audio." pseudo-prefix to prevent conflicts with vision tensors;
        # this prefix is added by the conversion code in modify_tensors()
1792
1793        MODEL_TENSOR.A_MMPROJ: (
1794            "audio.multi_modal_projector.linear_{bid}", # ultravox
1795            "audio_adapter.model.{bid}" # lfm2
1796        ),
1797
1798        MODEL_TENSOR.A_MMPROJ_FC: (
1799            "audio.multi_modal_projector.linear", # qwen2audio
1800            "audio_tower.proj", # qwen2omni
1801        ),
1802
1803        MODEL_TENSOR.A_MM_NORM_PRE: (
1804            "audio.multi_modal_projector.ln_pre", # ultravox
1805        ),
1806
1807        MODEL_TENSOR.A_MM_NORM_MID: (
1808            "audio.multi_modal_projector.ln_mid", # ultravox
1809        ),
1810
1811        MODEL_TENSOR.A_ENC_CONV_DW: (
1812            "conformer.layers.{bid}.conv.depthwise_conv", # lfm2
1813            "conformer.layers.{bid}.lconv1d.depthwise_conv1d", # gemma3n
1814        ),
1815
1816        MODEL_TENSOR.A_ENC_CONV_NORM: (
1817            "conformer.layers.{bid}.conv.batch_norm", # lfm2
1818            "conformer.layers.{bid}.lconv1d.pre_layer_norm", # gemma3n
1819        ),
1820
1821        MODEL_TENSOR.A_ENC_CONV_PW1: (
1822            "conformer.layers.{bid}.conv.pointwise_conv1", # lfm2
1823            "conformer.layers.{bid}.lconv1d.linear_start", # gemma3n
1824        ),
1825
1826        MODEL_TENSOR.A_ENC_CONV_PW2: (
1827            "conformer.layers.{bid}.conv.pointwise_conv2", # lfm2
1828            "conformer.layers.{bid}.lconv1d.linear_end", # gemma3n
1829        ),
1830
1831        MODEL_TENSOR.A_ENC_NORM_CONV: (
1832            "conformer.layers.{bid}.norm_conv", # lfm2
1833            "conformer.layers.{bid}.lconv1d.conv_norm", # gemma3n
1834        ),
1835
        MODEL_TENSOR.A_MM_EMBEDDING: (
            "model.embed_audio.embedding", # gemma3n
        ),

        MODEL_TENSOR.A_MM_HARD_EMB_NORM: (
            "model.embed_audio.hard_embedding_norm", # gemma3n
        ),

        MODEL_TENSOR.A_MM_INP_PROJ: (
            "model.embed_audio.embedding_projection", # gemma3n
        ),

        MODEL_TENSOR.A_MM_SOFT_EMB_NORM: (
            "model.embed_audio.soft_embedding_norm", # gemma3n
        ),
1848
1849        # NextN/MTP tensors
1850        MODEL_TENSOR.NEXTN_EH_PROJ: (
1851            "model.layers.{bid}.eh_proj",
1852        ),
1853
1854        MODEL_TENSOR.NEXTN_EMBED_TOKENS: (
1855            "model.layers.{bid}.embed_tokens",
1856        ),
1857
1858        MODEL_TENSOR.NEXTN_ENORM: (
1859            "model.layers.{bid}.enorm",
1860        ),
1861
1862        MODEL_TENSOR.NEXTN_HNORM: (
1863            "model.layers.{bid}.hnorm",
1864        ),
1865
1866        MODEL_TENSOR.NEXTN_SHARED_HEAD_HEAD: (
1867            "model.layers.{bid}.shared_head.head",
1868        ),
1869
1870        MODEL_TENSOR.NEXTN_SHARED_HEAD_NORM: (
1871            "model.layers.{bid}.shared_head.norm",
1872        ),
1873    }
1874
    # architecture-specific block mappings: entries here override the generic
    # block mappings above for the given architecture (applied in __init__)
1876    arch_block_mappings_cfg: dict[MODEL_ARCH, dict[MODEL_TENSOR, tuple[str, ...]]] = {
1877        MODEL_ARCH.ARCTIC: {
1878            MODEL_TENSOR.FFN_NORM: (
1879                "model.layers.{bid}.residual_layernorm",
1880            ),
1881            MODEL_TENSOR.FFN_NORM_EXP: (
1882                "model.layers.{bid}.post_attention_layernorm",
1883            ),
1884        },
1885    }
1886
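    # populated in __init__: maps every known source tensor name (and the canonical
    # GGUF name itself) to a (tensor type, canonical GGUF name) tuple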
1887    mapping: dict[str, tuple[MODEL_TENSOR, str]]
1888
1889    def __init__(self, arch: MODEL_ARCH, n_blocks: int):
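        """Build the name lookup table for `arch`, expanding "{bid}" placeholders for `n_blocks` blocks."""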
1890        self.mapping = {}
1891        for tensor, keys in self.mappings_cfg.items():
1892            if tensor not in MODEL_TENSORS[arch]:
1893                continue
1894            tensor_name = TENSOR_NAMES[tensor]
1895            self.mapping[tensor_name] = (tensor, tensor_name)
1896            for key in keys:
1897                self.mapping[key] = (tensor, tensor_name)
        block_mappings = self.block_mappings_cfg
        if arch in self.arch_block_mappings_cfg:
            # merge into a copy so that the class-level block_mappings_cfg is not mutated across instances
            block_mappings = {**block_mappings, **self.arch_block_mappings_cfg[arch]}
        for bid in range(n_blocks):
            for tensor, keys in block_mappings.items():
1902                if tensor not in MODEL_TENSORS[arch]:
1903                    continue
1904
1905                tensor_name = TENSOR_NAMES[tensor].format(bid = bid)
1906                self.mapping[tensor_name] = (tensor, tensor_name)
1907                for key in keys:
1908                    key = key.format(bid = bid)
1909                    self.mapping[key] = (tensor, tensor_name)
1910
1911    def get_type_and_name(self, key: str, try_suffixes: Sequence[str] = ()) -> tuple[MODEL_TENSOR, str] | None:
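        """Resolve a source tensor name to (tensor type, canonical GGUF name).

        If `key` has no exact match, each suffix in `try_suffixes` (e.g. ".weight", ".bias")
        is stripped and the lookup retried; a matched suffix is re-appended to the returned name.
        Returns None if the key cannot be resolved.
        """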
1912        result = self.mapping.get(key)
1913        if result is not None:
1914            return result
1915        for suffix in try_suffixes:
1916            if key.endswith(suffix):
1917                result = self.mapping.get(key[:-len(suffix)])
1918                if result is not None:
1919                    return result[0], result[1] + suffix
1920        return None
1921
1922    def get_name(self, key: str, try_suffixes: Sequence[str] = ()) -> str | None:
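        """Return only the canonical GGUF name for `key`, or None if it cannot be resolved."""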
1923        result = self.get_type_and_name(key, try_suffixes = try_suffixes)
1924        if result is None:
1925            return None
1926        return result[1]
1927
1928    def get_type(self, key: str, try_suffixes: Sequence[str] = ()) -> MODEL_TENSOR | None:
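        """Return only the tensor type for `key`, or None if it cannot be resolved."""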
1929        result = self.get_type_and_name(key, try_suffixes = try_suffixes)
1930        if result is None:
1931            return None
1932        return result[0]
1933
    def __getitem__(self, key: str) -> str:
        return self.mapping[key][1]
1939
1940    def __contains__(self, key: str) -> bool:
1941        return key in self.mapping
1942
1943    def __repr__(self) -> str:
1944        return repr(self.mapping)
1945
1946
1947def get_tensor_name_map(arch: MODEL_ARCH, n_blocks: int) -> TensorNameMap:
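    """Convenience wrapper around the TensorNameMap constructor.

    Illustrative usage (the exact names are only an example; the canonical GGUF
    names come from TENSOR_NAMES in constants.py):

        tmap = get_tensor_name_map(MODEL_ARCH.LLAMA, n_blocks = 32)
        tmap.get_name("model.layers.0.self_attn.q_proj.weight", try_suffixes = (".weight", ".bias"))
        # -> typically "blk.0.attn_q.weight" for a LLaMA-style checkpoint
    """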
1948    return TensorNameMap(arch, n_blocks)