Diffstat (limited to 'llama.cpp/src/llama-arch.h')
-rw-r--r--  llama.cpp/src/llama-arch.h  606
1 file changed, 606 insertions(+), 0 deletions(-)
diff --git a/llama.cpp/src/llama-arch.h b/llama.cpp/src/llama-arch.h
new file mode 100644
index 0000000..4f7b51e
--- /dev/null
+++ b/llama.cpp/src/llama-arch.h
@@ -0,0 +1,606 @@
+#pragma once
+
+#include "ggml.h" // ggml_op
+
+#include <string>
+#include <set>
+
+//
+// gguf constants (sync with gguf.py)
+//
+
+enum llm_arch {
+ LLM_ARCH_CLIP,
+ LLM_ARCH_LLAMA,
+ LLM_ARCH_LLAMA4,
+ LLM_ARCH_DECI,
+ LLM_ARCH_FALCON,
+ LLM_ARCH_BAICHUAN,
+ LLM_ARCH_GROK,
+ LLM_ARCH_GPT2,
+ LLM_ARCH_GPTJ,
+ LLM_ARCH_GPTNEOX,
+ LLM_ARCH_MPT,
+ LLM_ARCH_STARCODER,
+ LLM_ARCH_REFACT,
+ LLM_ARCH_BERT,
+ LLM_ARCH_MODERN_BERT,
+ LLM_ARCH_NOMIC_BERT,
+ LLM_ARCH_NOMIC_BERT_MOE,
+ LLM_ARCH_NEO_BERT,
+ LLM_ARCH_JINA_BERT_V2,
+ LLM_ARCH_JINA_BERT_V3,
+ LLM_ARCH_BLOOM,
+ LLM_ARCH_STABLELM,
+ LLM_ARCH_QWEN,
+ LLM_ARCH_QWEN2,
+ LLM_ARCH_QWEN2MOE,
+ LLM_ARCH_QWEN2VL,
+ LLM_ARCH_QWEN3,
+ LLM_ARCH_QWEN3MOE,
+ LLM_ARCH_QWEN3NEXT,
+ LLM_ARCH_QWEN3VL,
+ LLM_ARCH_QWEN3VLMOE,
+ LLM_ARCH_QWEN35,
+ LLM_ARCH_QWEN35MOE,
+ LLM_ARCH_PHI2,
+ LLM_ARCH_PHI3,
+ LLM_ARCH_PHIMOE,
+ LLM_ARCH_PLAMO,
+ LLM_ARCH_PLAMO2,
+ LLM_ARCH_PLAMO3,
+ LLM_ARCH_CODESHELL,
+ LLM_ARCH_ORION,
+ LLM_ARCH_INTERNLM2,
+ LLM_ARCH_MINICPM,
+ LLM_ARCH_MINICPM3,
+ LLM_ARCH_GEMMA,
+ LLM_ARCH_GEMMA2,
+ LLM_ARCH_GEMMA3,
+ LLM_ARCH_GEMMA3N,
+ LLM_ARCH_GEMMA_EMBEDDING,
+ LLM_ARCH_STARCODER2,
+ LLM_ARCH_MAMBA,
+ LLM_ARCH_MAMBA2,
+ LLM_ARCH_JAMBA,
+ LLM_ARCH_FALCON_H1,
+ LLM_ARCH_XVERSE,
+ LLM_ARCH_COMMAND_R,
+ LLM_ARCH_COHERE2,
+ LLM_ARCH_DBRX,
+ LLM_ARCH_OLMO,
+ LLM_ARCH_OLMO2,
+ LLM_ARCH_OLMOE,
+ LLM_ARCH_OPENELM,
+ LLM_ARCH_ARCTIC,
+ LLM_ARCH_DEEPSEEK,
+ LLM_ARCH_DEEPSEEK2,
+ LLM_ARCH_CHATGLM,
+ LLM_ARCH_GLM4,
+ LLM_ARCH_GLM4_MOE,
+ LLM_ARCH_BITNET,
+ LLM_ARCH_T5,
+ LLM_ARCH_T5ENCODER,
+ LLM_ARCH_JAIS,
+ LLM_ARCH_NEMOTRON,
+ LLM_ARCH_NEMOTRON_H,
+ LLM_ARCH_NEMOTRON_H_MOE,
+ LLM_ARCH_EXAONE,
+ LLM_ARCH_EXAONE4,
+ LLM_ARCH_EXAONE_MOE,
+ LLM_ARCH_RWKV6,
+ LLM_ARCH_RWKV6QWEN2,
+ LLM_ARCH_RWKV7,
+ LLM_ARCH_ARWKV7,
+ LLM_ARCH_GRANITE,
+ LLM_ARCH_GRANITE_MOE,
+ LLM_ARCH_GRANITE_HYBRID,
+ LLM_ARCH_CHAMELEON,
+ LLM_ARCH_WAVTOKENIZER_DEC,
+ LLM_ARCH_PLM,
+ LLM_ARCH_BAILINGMOE,
+ LLM_ARCH_BAILINGMOE2,
+ LLM_ARCH_DOTS1,
+ LLM_ARCH_ARCEE,
+ LLM_ARCH_AFMOE,
+ LLM_ARCH_ERNIE4_5,
+ LLM_ARCH_ERNIE4_5_MOE,
+ LLM_ARCH_HUNYUAN_MOE,
+ LLM_ARCH_HUNYUAN_DENSE,
+ LLM_ARCH_SMOLLM3,
+ LLM_ARCH_OPENAI_MOE,
+ LLM_ARCH_LFM2,
+ LLM_ARCH_LFM2MOE,
+ LLM_ARCH_DREAM,
+ LLM_ARCH_SMALLTHINKER,
+ LLM_ARCH_LLADA,
+ LLM_ARCH_LLADA_MOE,
+ LLM_ARCH_SEED_OSS,
+ LLM_ARCH_GROVEMOE,
+ LLM_ARCH_APERTUS,
+ LLM_ARCH_MINIMAX_M2,
+ LLM_ARCH_COGVLM,
+ LLM_ARCH_RND1,
+ LLM_ARCH_PANGU_EMBED,
+ LLM_ARCH_MISTRAL3,
+ LLM_ARCH_MIMO2,
+ LLM_ARCH_STEP35,
+ LLM_ARCH_LLAMA_EMBED,
+ LLM_ARCH_MAINCODER,
+ LLM_ARCH_KIMI_LINEAR,
+ LLM_ARCH_UNKNOWN,
+};
+
+enum llm_kv {
+ LLM_KV_GENERAL_TYPE,
+ LLM_KV_GENERAL_ARCHITECTURE,
+ LLM_KV_GENERAL_QUANTIZATION_VERSION,
+ LLM_KV_GENERAL_ALIGNMENT,
+ LLM_KV_GENERAL_FILE_TYPE,
+ LLM_KV_GENERAL_SAMPLING_SEQUENCE,
+ LLM_KV_GENERAL_SAMPLING_TOP_K,
+ LLM_KV_GENERAL_SAMPLING_TOP_P,
+ LLM_KV_GENERAL_SAMPLING_MIN_P,
+ LLM_KV_GENERAL_SAMPLING_XTC_PROBABILITY,
+ LLM_KV_GENERAL_SAMPLING_XTC_THRESHOLD,
+ LLM_KV_GENERAL_SAMPLING_TEMP,
+ LLM_KV_GENERAL_SAMPLING_PENALTY_LAST_N,
+ LLM_KV_GENERAL_SAMPLING_PENALTY_REPEAT,
+ LLM_KV_GENERAL_SAMPLING_MIROSTAT,
+ LLM_KV_GENERAL_SAMPLING_MIROSTAT_TAU,
+ LLM_KV_GENERAL_SAMPLING_MIROSTAT_ETA,
+ LLM_KV_GENERAL_NAME,
+ LLM_KV_GENERAL_AUTHOR,
+ LLM_KV_GENERAL_VERSION,
+ LLM_KV_GENERAL_URL,
+ LLM_KV_GENERAL_DESCRIPTION,
+ LLM_KV_GENERAL_LICENSE,
+ LLM_KV_GENERAL_SOURCE_URL,
+ LLM_KV_GENERAL_SOURCE_HF_REPO,
+
+ LLM_KV_VOCAB_SIZE,
+ LLM_KV_CONTEXT_LENGTH,
+ LLM_KV_EMBEDDING_LENGTH,
+ LLM_KV_EMBEDDING_LENGTH_OUT,
+ LLM_KV_FEATURES_LENGTH,
+ LLM_KV_BLOCK_COUNT,
+ LLM_KV_LEADING_DENSE_BLOCK_COUNT,
+ LLM_KV_FEED_FORWARD_LENGTH,
+ LLM_KV_EXPERT_FEED_FORWARD_LENGTH,
+ LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH,
+ LLM_KV_EXPERT_CHUNK_FEED_FORWARD_LENGTH,
+ LLM_KV_SWIGLU_CLAMP_EXP,
+ LLM_KV_SWIGLU_CLAMP_SHEXP,
+ LLM_KV_USE_PARALLEL_RESIDUAL,
+ LLM_KV_TENSOR_DATA_LAYOUT,
+ LLM_KV_EXPERT_COUNT,
+ LLM_KV_EXPERT_USED_COUNT,
+ LLM_KV_EXPERT_SHARED_COUNT,
+ LLM_KV_EXPERT_GROUP_COUNT,
+ LLM_KV_EXPERT_GROUP_USED_COUNT,
+ LLM_KV_EXPERT_WEIGHTS_SCALE,
+ LLM_KV_EXPERT_WEIGHTS_NORM,
+ LLM_KV_EXPERT_GATING_FUNC,
+ LLM_KV_EXPERT_GROUP_SCALE,
+ LLM_KV_EXPERTS_PER_GROUP,
+ LLM_KV_MOE_EVERY_N_LAYERS,
+ LLM_KV_NEXTN_PREDICT_LAYERS,
+ LLM_KV_NUM_DEEPSTACK_LAYERS,
+ LLM_KV_POOLING_TYPE,
+ LLM_KV_LOGIT_SCALE,
+ LLM_KV_DECODER_START_TOKEN_ID,
+ LLM_KV_DECODER_BLOCK_COUNT,
+ LLM_KV_ATTN_LOGIT_SOFTCAPPING,
+ LLM_KV_ROUTER_LOGIT_SOFTCAPPING,
+ LLM_KV_FINAL_LOGIT_SOFTCAPPING,
+ LLM_KV_SWIN_NORM,
+ LLM_KV_RESCALE_EVERY_N_LAYERS,
+ LLM_KV_TIME_MIX_EXTRA_DIM,
+ LLM_KV_TIME_DECAY_EXTRA_DIM,
+ LLM_KV_RESIDUAL_SCALE,
+ LLM_KV_EMBEDDING_SCALE,
+ LLM_KV_TOKEN_SHIFT_COUNT,
+ LLM_KV_INTERLEAVE_MOE_LAYER_STEP,
+ LLM_KV_FULL_ATTENTION_INTERVAL,
+
+ LLM_KV_ATTENTION_HEAD_COUNT,
+ LLM_KV_ATTENTION_HEAD_COUNT_KV,
+ LLM_KV_ATTENTION_MAX_ALIBI_BIAS,
+ LLM_KV_ATTENTION_CLAMP_KQV,
+ LLM_KV_ATTENTION_KEY_LENGTH,
+ LLM_KV_ATTENTION_VALUE_LENGTH,
+ LLM_KV_ATTENTION_LAYERNORM_EPS,
+ LLM_KV_ATTENTION_LAYERNORM_RMS_EPS,
+ LLM_KV_ATTENTION_GROUPNORM_EPS,
+ LLM_KV_ATTENTION_GROUPNORM_GROUPS,
+ LLM_KV_ATTENTION_CAUSAL,
+ LLM_KV_ATTENTION_Q_LORA_RANK,
+ LLM_KV_ATTENTION_KV_LORA_RANK,
+ LLM_KV_ATTENTION_DECAY_LORA_RANK,
+ LLM_KV_ATTENTION_ICLR_LORA_RANK,
+ LLM_KV_ATTENTION_VALUE_RESIDUAL_MIX_LORA_RANK,
+ LLM_KV_ATTENTION_GATE_LORA_RANK,
+ LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT,
+ LLM_KV_ATTENTION_SLIDING_WINDOW,
+ LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN,
+ LLM_KV_ATTENTION_SCALE,
+ LLM_KV_ATTENTION_OUTPUT_SCALE,
+ LLM_KV_ATTENTION_TEMPERATURE_LENGTH,
+ LLM_KV_ATTENTION_TEMPERATURE_SCALE,
+ LLM_KV_ATTENTION_KEY_LENGTH_MLA,
+ LLM_KV_ATTENTION_VALUE_LENGTH_MLA,
+
+ LLM_KV_ROPE_DIMENSION_COUNT,
+ LLM_KV_ROPE_DIMENSION_SECTIONS,
+ LLM_KV_ROPE_FREQ_BASE,
+ LLM_KV_ROPE_FREQ_BASE_SWA,
+ LLM_KV_ROPE_SCALE_LINEAR,
+ LLM_KV_ROPE_SCALING_TYPE,
+ LLM_KV_ROPE_SCALING_FACTOR,
+ LLM_KV_ROPE_SCALING_ATTN_FACTOR,
+ LLM_KV_ROPE_SCALING_ORIG_CTX_LEN,
+ LLM_KV_ROPE_SCALING_FINETUNED,
+ LLM_KV_ROPE_SCALING_YARN_LOG_MUL,
+ LLM_KV_ROPE_SCALING_YARN_EXT_FACTOR,
+ LLM_KV_ROPE_SCALING_YARN_ATTN_FACTOR,
+ LLM_KV_ROPE_SCALING_YARN_BETA_FAST,
+ LLM_KV_ROPE_SCALING_YARN_BETA_SLOW,
+
+ LLM_KV_SPLIT_NO,
+ LLM_KV_SPLIT_COUNT,
+ LLM_KV_SPLIT_TENSORS_COUNT,
+
+ LLM_KV_SSM_INNER_SIZE,
+ LLM_KV_SSM_CONV_KERNEL,
+ LLM_KV_SSM_STATE_SIZE,
+ LLM_KV_SSM_TIME_STEP_RANK,
+ LLM_KV_SSM_GROUP_COUNT,
+ LLM_KV_SSM_DT_B_C_RMS,
+
+ LLM_KV_KDA_HEAD_DIM,
+
+ LLM_KV_WKV_HEAD_SIZE,
+
+ LLM_KV_TOKENIZER_MODEL,
+ LLM_KV_TOKENIZER_PRE,
+ LLM_KV_TOKENIZER_LIST,
+ LLM_KV_TOKENIZER_TOKEN_TYPE,
+ LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT,
+ LLM_KV_TOKENIZER_SCORES,
+ LLM_KV_TOKENIZER_MERGES,
+ LLM_KV_TOKENIZER_BOS_ID,
+ LLM_KV_TOKENIZER_EOS_ID,
+ LLM_KV_TOKENIZER_EOT_ID,
+ LLM_KV_TOKENIZER_EOM_ID,
+ LLM_KV_TOKENIZER_UNK_ID,
+ LLM_KV_TOKENIZER_SEP_ID,
+ LLM_KV_TOKENIZER_PAD_ID,
+ LLM_KV_TOKENIZER_CLS_ID,
+ LLM_KV_TOKENIZER_MASK_ID,
+ LLM_KV_TOKENIZER_ADD_BOS,
+ LLM_KV_TOKENIZER_ADD_EOS,
+ LLM_KV_TOKENIZER_ADD_SEP,
+ LLM_KV_TOKENIZER_ADD_PREFIX,
+ LLM_KV_TOKENIZER_REMOVE_EXTRA_WS,
+ LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP,
+ LLM_KV_TOKENIZER_HF_JSON,
+ LLM_KV_TOKENIZER_RWKV,
+ LLM_KV_TOKENIZER_CHAT_TEMPLATE,
+ LLM_KV_TOKENIZER_FIM_PRE_ID,
+ LLM_KV_TOKENIZER_FIM_SUF_ID,
+ LLM_KV_TOKENIZER_FIM_MID_ID,
+ LLM_KV_TOKENIZER_FIM_PAD_ID,
+ LLM_KV_TOKENIZER_FIM_REP_ID,
+ LLM_KV_TOKENIZER_FIM_SEP_ID,
+
+ LLM_KV_ADAPTER_TYPE,
+ LLM_KV_ADAPTER_LORA_ALPHA,
+ LLM_KV_ADAPTER_LORA_TASK_NAME,
+ LLM_KV_ADAPTER_LORA_PROMPT_PREFIX,
+ LLM_KV_ADAPTER_ALORA_INVOCATION_TOKENS,
+
+ LLM_KV_POSNET_EMBEDDING_LENGTH,
+ LLM_KV_POSNET_BLOCK_COUNT,
+
+ LLM_KV_CONVNEXT_EMBEDDING_LENGTH,
+ LLM_KV_CONVNEXT_BLOCK_COUNT,
+
+ LLM_KV_CLASSIFIER_OUTPUT_LABELS,
+
+ LLM_KV_SHORTCONV_L_CACHE,
+
+ LLM_KV_XIELU_ALPHA_N,
+ LLM_KV_XIELU_ALPHA_P,
+ LLM_KV_XIELU_BETA,
+ LLM_KV_XIELU_EPS,
+
+ // deprecated:
+ LLM_KV_TOKENIZER_PREFIX_ID,
+ LLM_KV_TOKENIZER_SUFFIX_ID,
+ LLM_KV_TOKENIZER_MIDDLE_ID,
+
+ // sentence-transformers dense layers in and out features
+ LLM_KV_DENSE_2_FEAT_IN,
+ LLM_KV_DENSE_2_FEAT_OUT,
+ LLM_KV_DENSE_3_FEAT_IN,
+ LLM_KV_DENSE_3_FEAT_OUT,
+};
+
+enum llm_tensor {
+ LLM_TENSOR_TOKEN_EMBD,
+ LLM_TENSOR_TOKEN_EMBD_NORM,
+ LLM_TENSOR_TOKEN_TYPES,
+ LLM_TENSOR_POS_EMBD,
+ LLM_TENSOR_DENSE_2_OUT,
+ LLM_TENSOR_DENSE_3_OUT,
+ LLM_TENSOR_OUTPUT,
+ LLM_TENSOR_OUTPUT_NORM,
+    LLM_TENSOR_OUTPUT_NORM_LFM2, // workaround for a wrong tensor name
+ LLM_TENSOR_ROPE_FREQS,
+ LLM_TENSOR_ROPE_FACTORS_LONG,
+ LLM_TENSOR_ROPE_FACTORS_SHORT,
+ LLM_TENSOR_ATTN_Q,
+ LLM_TENSOR_ATTN_K,
+ LLM_TENSOR_ATTN_V,
+ LLM_TENSOR_ATTN_QKV,
+ LLM_TENSOR_ATTN_OUT,
+ LLM_TENSOR_ATTN_NORM,
+ LLM_TENSOR_ATTN_NORM_2,
+ LLM_TENSOR_ATTN_OUT_NORM,
+ LLM_TENSOR_ATTN_POST_NORM,
+ LLM_TENSOR_ATTN_ROT_EMBD,
+ LLM_TENSOR_ATTN_SINKS,
+ LLM_TENSOR_ATTN_GATE,
+ LLM_TENSOR_FFN_GATE_INP,
+ LLM_TENSOR_FFN_GATE_INP_SHEXP,
+ LLM_TENSOR_FFN_NORM,
+ LLM_TENSOR_FFN_POST_NORM,
+ LLM_TENSOR_FFN_GATE,
+ LLM_TENSOR_FFN_DOWN,
+ LLM_TENSOR_FFN_UP,
+ LLM_TENSOR_FFN_ACT,
+ LLM_TENSOR_FFN_DOWN_EXP, // split experts for backward compatibility
+ LLM_TENSOR_FFN_GATE_EXP,
+ LLM_TENSOR_FFN_UP_EXP,
+ LLM_TENSOR_FFN_NORM_EXPS,
+ LLM_TENSOR_FFN_DOWN_EXPS, // merged experts
+ LLM_TENSOR_FFN_GATE_EXPS,
+ LLM_TENSOR_FFN_UP_EXPS,
+ LLM_TENSOR_FFN_DOWN_SHEXP,
+ LLM_TENSOR_FFN_GATE_SHEXP,
+ LLM_TENSOR_FFN_UP_SHEXP,
+ LLM_TENSOR_FFN_DOWN_CHEXPS,
+ LLM_TENSOR_FFN_GATE_CHEXPS,
+ LLM_TENSOR_FFN_UP_CHEXPS,
+ LLM_TENSOR_FFN_EXP_PROBS_B,
+ LLM_TENSOR_ATTN_Q_NORM,
+ LLM_TENSOR_ATTN_K_NORM,
+ LLM_TENSOR_LAYER_OUT_NORM,
+ LLM_TENSOR_POST_ATTN_NORM,
+ LLM_TENSOR_POST_MLP_NORM,
+ LLM_TENSOR_PER_LAYER_TOKEN_EMBD, // gemma3n
+ LLM_TENSOR_PER_LAYER_MODEL_PROJ, // gemma3n
+ LLM_TENSOR_PER_LAYER_INP_GATE, // gemma3n
+ LLM_TENSOR_PER_LAYER_PROJ, // gemma3n
+ LLM_TENSOR_PER_LAYER_PROJ_NORM, // gemma3n
+ LLM_TENSOR_PER_LAYER_POST_NORM, // gemma3n
+ LLM_TENSOR_ALTUP_PROJ, // gemma3n
+ LLM_TENSOR_ALTUP_UNEMBD_PROJ, // gemma3n
+ LLM_TENSOR_ALTUP_CORRECT_COEF, // gemma3n
+ LLM_TENSOR_ALTUP_CORRECT_SCALE, // gemma3n
+ LLM_TENSOR_ALTUP_PREDICT_COEF, // gemma3n
+ LLM_TENSOR_ALTUP_ROUTER, // gemma3n
+ LLM_TENSOR_ALTUP_ROUTER_NORM, // gemma3n
+ LLM_TENSOR_LAUREL_L, // gemma3n
+ LLM_TENSOR_LAUREL_R, // gemma3n
+ LLM_TENSOR_LAUREL_POST_NORM, // gemma3n
+ LLM_TENSOR_SSM_IN,
+ LLM_TENSOR_SSM_CONV1D,
+ LLM_TENSOR_SSM_X,
+ LLM_TENSOR_SSM_DT,
+ LLM_TENSOR_SSM_DT_NORM,
+ LLM_TENSOR_SSM_A,
+ LLM_TENSOR_SSM_A_NOSCAN, // qwen3next special case with MUL instead of SSM_SCAN
+ LLM_TENSOR_SSM_B_NORM,
+ LLM_TENSOR_SSM_C_NORM,
+ LLM_TENSOR_SSM_D,
+ LLM_TENSOR_SSM_NORM,
+ LLM_TENSOR_SSM_OUT,
+ LLM_TENSOR_SSM_BETA_ALPHA, // qwen3next
+ LLM_TENSOR_SSM_ALPHA, // qwen3.5
+ // Kimi Linear KDA (using SSM_ prefix for consistency)
+ LLM_TENSOR_SSM_CONV1D_Q, // kimi: Q conv1d weight
+ LLM_TENSOR_SSM_CONV1D_K, // kimi: K conv1d weight
+ LLM_TENSOR_SSM_CONV1D_V, // kimi: V conv1d weight
+ LLM_TENSOR_SSM_F_A, // kimi: forget gate projection A
+ LLM_TENSOR_SSM_F_B, // kimi: forget gate projection B
+    LLM_TENSOR_SSM_BETA,         // kimi: beta mixing coefficient (also used by qwen3.5)
+ LLM_TENSOR_SSM_G_A, // kimi: output gate projection A
+ LLM_TENSOR_SSM_G_B, // kimi: output gate projection B
+ LLM_TENSOR_TIME_MIX_W0,
+ LLM_TENSOR_TIME_MIX_W1,
+ LLM_TENSOR_TIME_MIX_W2,
+ LLM_TENSOR_TIME_MIX_A0,
+ LLM_TENSOR_TIME_MIX_A1,
+ LLM_TENSOR_TIME_MIX_A2,
+ LLM_TENSOR_TIME_MIX_V0,
+ LLM_TENSOR_TIME_MIX_V1,
+ LLM_TENSOR_TIME_MIX_V2,
+ LLM_TENSOR_TIME_MIX_G1,
+ LLM_TENSOR_TIME_MIX_G2,
+ LLM_TENSOR_TIME_MIX_K_K,
+ LLM_TENSOR_TIME_MIX_K_A,
+ LLM_TENSOR_TIME_MIX_R_K,
+ LLM_TENSOR_TIME_MIX_LERP_X,
+ LLM_TENSOR_TIME_MIX_LERP_W,
+ LLM_TENSOR_TIME_MIX_LERP_K,
+ LLM_TENSOR_TIME_MIX_LERP_V,
+ LLM_TENSOR_TIME_MIX_LERP_R,
+ LLM_TENSOR_TIME_MIX_LERP_G,
+ LLM_TENSOR_TIME_MIX_LERP_FUSED,
+ LLM_TENSOR_TIME_MIX_FIRST,
+ LLM_TENSOR_TIME_MIX_DECAY,
+ LLM_TENSOR_TIME_MIX_DECAY_W1,
+ LLM_TENSOR_TIME_MIX_DECAY_W2,
+ LLM_TENSOR_TIME_MIX_KEY,
+ LLM_TENSOR_TIME_MIX_VALUE,
+ LLM_TENSOR_TIME_MIX_RECEPTANCE,
+ LLM_TENSOR_TIME_MIX_GATE,
+ LLM_TENSOR_TIME_MIX_LN,
+ LLM_TENSOR_TIME_MIX_OUTPUT,
+ LLM_TENSOR_CHANNEL_MIX_LERP_K,
+ LLM_TENSOR_CHANNEL_MIX_LERP_R,
+ LLM_TENSOR_CHANNEL_MIX_KEY,
+ LLM_TENSOR_CHANNEL_MIX_RECEPTANCE,
+ LLM_TENSOR_CHANNEL_MIX_VALUE,
+ LLM_TENSOR_ATTN_Q_A,
+ LLM_TENSOR_ATTN_Q_B,
+ LLM_TENSOR_ATTN_KV_A_MQA,
+ LLM_TENSOR_ATTN_KV_B,
+ LLM_TENSOR_ATTN_K_B,
+ LLM_TENSOR_ATTN_V_B,
+ LLM_TENSOR_ATTN_Q_A_NORM,
+ LLM_TENSOR_ATTN_KV_A_NORM,
+ LLM_TENSOR_ATTN_SUB_NORM,
+ LLM_TENSOR_FFN_SUB_NORM,
+ LLM_TENSOR_DEC_ATTN_NORM,
+ LLM_TENSOR_DEC_ATTN_Q,
+ LLM_TENSOR_DEC_ATTN_K,
+ LLM_TENSOR_DEC_ATTN_V,
+ LLM_TENSOR_DEC_ATTN_OUT,
+ LLM_TENSOR_DEC_ATTN_REL_B,
+ LLM_TENSOR_DEC_CROSS_ATTN_NORM,
+ LLM_TENSOR_DEC_CROSS_ATTN_Q,
+ LLM_TENSOR_DEC_CROSS_ATTN_K,
+ LLM_TENSOR_DEC_CROSS_ATTN_V,
+ LLM_TENSOR_DEC_CROSS_ATTN_OUT,
+ LLM_TENSOR_DEC_CROSS_ATTN_REL_B,
+ LLM_TENSOR_DEC_FFN_NORM,
+ LLM_TENSOR_DEC_FFN_GATE,
+ LLM_TENSOR_DEC_FFN_DOWN,
+ LLM_TENSOR_DEC_FFN_UP,
+ LLM_TENSOR_DEC_OUTPUT_NORM,
+ LLM_TENSOR_ENC_ATTN_NORM,
+ LLM_TENSOR_ENC_ATTN_Q,
+ LLM_TENSOR_ENC_ATTN_K,
+ LLM_TENSOR_ENC_ATTN_V,
+ LLM_TENSOR_ENC_ATTN_OUT,
+ LLM_TENSOR_ENC_ATTN_REL_B,
+ LLM_TENSOR_ENC_FFN_NORM,
+ LLM_TENSOR_ENC_FFN_GATE,
+ LLM_TENSOR_ENC_FFN_DOWN,
+ LLM_TENSOR_ENC_FFN_UP,
+ LLM_TENSOR_ENC_OUTPUT_NORM,
+ LLM_TENSOR_CLS,
+ LLM_TENSOR_CLS_OUT,
+ LLM_TENSOR_CONV1D,
+ LLM_TENSOR_CONVNEXT_DW,
+ LLM_TENSOR_CONVNEXT_NORM,
+ LLM_TENSOR_CONVNEXT_PW1,
+ LLM_TENSOR_CONVNEXT_PW2,
+ LLM_TENSOR_CONVNEXT_GAMMA,
+ LLM_TENSOR_POS_NET_CONV1,
+ LLM_TENSOR_POS_NET_CONV2,
+ LLM_TENSOR_POS_NET_NORM,
+ LLM_TENSOR_POS_NET_NORM1,
+ LLM_TENSOR_POS_NET_NORM2,
+ LLM_TENSOR_POS_NET_ATTN_NORM,
+ LLM_TENSOR_POS_NET_ATTN_Q,
+ LLM_TENSOR_POS_NET_ATTN_K,
+ LLM_TENSOR_POS_NET_ATTN_V,
+ LLM_TENSOR_POS_NET_ATTN_OUT,
+ LLM_TENSOR_SHORTCONV_CONV,
+ LLM_TENSOR_SHORTCONV_INPROJ,
+ LLM_TENSOR_SHORTCONV_OUTPROJ,
+ LLM_TENSOR_VISEXP_ATTN_QKV,
+ LLM_TENSOR_VISEXP_ATTN_OUT,
+ LLM_TENSOR_VISEXP_FFN_GATE,
+ LLM_TENSOR_VISEXP_FFN_DOWN,
+ LLM_TENSOR_VISEXP_FFN_UP,
+ LLM_TENSOR_NEXTN_EH_PROJ,
+ LLM_TENSOR_NEXTN_EMBED_TOKENS,
+ LLM_TENSOR_NEXTN_ENORM,
+ LLM_TENSOR_NEXTN_HNORM,
+ LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD,
+ LLM_TENSOR_NEXTN_SHARED_HEAD_NORM,
+};
+
+enum llm_tensor_layer {
+ LLM_TENSOR_LAYER_INPUT,
+ LLM_TENSOR_LAYER_REPEATING,
+ LLM_TENSOR_LAYER_OUTPUT,
+};
+
+struct LLM_KV {
+ LLM_KV(llm_arch arch, const char * suffix = nullptr);
+
+ llm_arch arch;
+ const char * suffix;
+
+ std::string operator()(llm_kv kv) const;
+};
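+
+// usage sketch (illustrative, not part of the original header): key strings
+// follow the gguf.py convention that architecture-scoped keys are prefixed
+// with the arch name while "general.*" keys are not:
+//
+//   LLM_KV kv(LLM_ARCH_LLAMA);
+//
+//   std::string n_ctx = kv(LLM_KV_CONTEXT_LENGTH); // -> "llama.context_length"
+//   std::string name  = kv(LLM_KV_GENERAL_NAME);   // -> "general.name"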
+
+// helper to handle gguf constants
+// usage:
+//
+// const auto tn = LLM_TN(LLM_ARCH_LLAMA);
+//
+// std::string name = tn(LLM_TENSOR_OUTPUT); -> "output"
+// std::string name = tn(LLM_TENSOR_TOKEN_EMBD, "bias"); -> "token_embd.bias"
+// std::string name = tn(LLM_TENSOR_ATTN_NORM, "weight", 3); -> "blk.3.attn_norm.weight"
+//
+struct LLM_TN_IMPL {
+ const llm_arch arch;
+ const llm_tensor tensor;
+ const char * const suffix;
+ const int bid;
+ const int xid;
+
+ const std::set<llm_tensor> model_tensors;
+
+ LLM_TN_IMPL(llm_arch arch, llm_tensor tensor, const char * suffix, int bid, int xid);
+
+ std::string str() const;
+
+ operator std::string() const {
+ return str();
+ }
+
+ friend bool operator==(const std::string & str, const LLM_TN_IMPL & tn) {
+ return str == tn.str();
+ }
+
+ friend bool operator!=(const std::string & str, const LLM_TN_IMPL & tn) {
+ return str != tn.str();
+ }
+};
+
+struct LLM_TN {
+ LLM_TN(llm_arch arch) : arch(arch) {}
+
+ llm_arch arch;
+
+ LLM_TN_IMPL operator()(llm_tensor tensor, const char * suffix, int bid = -1, int xid = -1) const {
+ return LLM_TN_IMPL(arch, tensor, suffix, bid, xid);
+ }
+
+ LLM_TN_IMPL operator()(llm_tensor tensor, int bid = -1, int xid = -1) const {
+ return LLM_TN_IMPL(arch, tensor, nullptr, bid, xid);
+ }
+};
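+
+// usage sketch (illustrative; the "attn_q" base name is an assumption that
+// follows the documented "blk.3.attn_norm.weight" pattern above):
+//
+//   const auto tn = LLM_TN(LLM_ARCH_LLAMA);
+//
+//   // per-layer tensor names for a hypothetical 3-layer model
+//   for (int il = 0; il < 3; ++il) {
+//       std::string wq = tn(LLM_TENSOR_ATTN_Q, "weight", il); // "blk.<il>.attn_q.weight"
+//   }
+//
+//   // LLM_TN_IMPL converts to std::string on demand and can be compared
+//   // directly against a tensor name read from a gguf file:
+//   if (name == tn(LLM_TENSOR_OUTPUT, "weight")) { /* found the output head */ }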
+
+struct llm_tensor_info {
+ llm_tensor_layer layer;
+ ggml_op op;
+};
+
+const char * llm_arch_name(llm_arch arch);
+
+llm_arch llm_arch_from_string(const std::string & name);
+
+const llm_tensor_info & llm_tensor_info_for(llm_tensor tensor);
+
+bool llm_arch_is_recurrent(const llm_arch & arch);
+bool llm_arch_is_hybrid (const llm_arch & arch);
+bool llm_arch_is_diffusion(const llm_arch & arch);
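+
+// usage sketch (illustrative; the UNKNOWN fallback and the REPEATING/MUL_MAT
+// results shown are assumptions consistent with the declarations above):
+//
+//   llm_arch arch = llm_arch_from_string("llama");
+//   if (arch == LLM_ARCH_UNKNOWN) { /* unsupported architecture string */ }
+//
+//   const char * name = llm_arch_name(arch); // inverse mapping, e.g. "llama"
+//
+//   const auto & info = llm_tensor_info_for(LLM_TENSOR_ATTN_Q);
+//   // info.layer -> LLM_TENSOR_LAYER_REPEATING (a per-block tensor)
+//   // info.op    -> the ggml op that consumes this tensor, e.g. GGML_OP_MUL_MAT,
+//   //               useful when choosing a backend buffer type that supports it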