llama.cpp
.devops
nix
apps.nix devshells.nix docker.nix jetson-support.nix nixpkgs-instances.nix package-gguf-py.nix package.nix python-scripts.nix scope.nix sif.nix.github
ISSUE_TEMPLATE
010-bug-compilation.yml 011-bug-results.yml 019-bug-misc.yml 020-enhancement.yml 030-research.yml 040-refactor.yml config.ymlworkflows
bench.yml.disabled build-cache.yml build-cmake-pkg.yml build-linux-cross.yml build.yml check-vendor.yml close-issue.yml copilot-setup-steps.yml docker.yml editorconfig.yml gguf-publish.yml labeler.yml pre-tokenizer-hashes.yml python-check-requirements.yml python-lint.yml python-type-check.yml release.yml server-metal.yml server-webui.yml server.yml update-ops-docs.yml winget.ymlbenches
cmake
arm64-apple-clang.cmake arm64-windows-llvm.cmake build-info.cmake common.cmake download-models.cmake git-vars.cmake license.cmake llama-config.cmake.in llama.pc.in riscv64-spacemit-linux-gnu-gcc.cmake x64-windows-llvm.cmakecommon
jinja
README.md caps.cpp caps.h lexer.cpp lexer.h parser.cpp parser.h runtime.cpp runtime.h string.cpp string.h utils.h value.cpp value.hdocs
multimodal
MobileVLM.md gemma3.md glmedge.md granitevision.md llava.md minicpmo2.6.md minicpmo4.0.md minicpmv2.5.md minicpmv2.6.md minicpmv4.0.md minicpmv4.5.mdops
BLAS.csv CANN.csv CPU.csv CUDA.csv Metal.csv OpenCL.csv SYCL.csv Vulkan.csv WebGPU.csv ZenDNN.csv zDNN.csvexamples
llama.android
app
src
lib
.gitignore build.gradle.kts consumer-rules.pro proguard-rules.promodel-conversion
scripts
causal
compare-embeddings-logits.sh compare-logits.py convert-model.sh modelcard.template run-casual-gen-embeddings-org.py run-converted-model-embeddings-logits.sh run-converted-model.sh run-org-model.pyembedding
compare-embeddings-logits.sh convert-model.sh modelcard.template run-converted-model.sh run-original-model.pyutils
__init__.py check-nmse.py common.py compare_tokens.py create-collection-add-model.sh curl-embedding-server.sh hf-add-model-to-collection.py hf-create-collection.py hf-create-model.py hf-upload-gguf-model.py inspect-converted-model.sh inspect-org-model.py perplexity-gen.sh perplexity-run-simple.sh perplexity-run.sh quantize.sh run-embedding-server.sh semantic_check.py tensor-info.pysycl
CMakeLists.txt README.md build.sh ls-sycl-device.cpp run-llama2.sh test.sh win-build-sycl.bat win-run-llama2.bat win-test.batggml
include
ggml-alloc.h ggml-backend.h ggml-blas.h ggml-cann.h ggml-cpp.h ggml-cpu.h ggml-cuda.h ggml-hexagon.h ggml-metal.h ggml-opencl.h ggml-opt.h ggml-rpc.h ggml-sycl.h ggml-virtgpu.h ggml-vulkan.h ggml-webgpu.h ggml-zdnn.h ggml-zendnn.h ggml.h gguf.hsrc
ggml-cann
CMakeLists.txt acl_tensor.cpp acl_tensor.h aclnn_ops.cpp aclnn_ops.h common.h ggml-cann.cppggml-cpu
CMakeLists.txt arch-fallback.h binary-ops.cpp binary-ops.h common.h ggml-cpu-impl.h ggml-cpu.c ggml-cpu.cpp hbm.cpp hbm.h ops.cpp ops.h quants.c quants.h repack.cpp repack.h simd-mappings.h traits.cpp traits.h unary-ops.cpp unary-ops.h vec.cpp vec.hggml-cuda
template-instances
fattn-mma-f16-instance-ncols1_1-ncols2_16.cu fattn-mma-f16-instance-ncols1_1-ncols2_32.cu fattn-mma-f16-instance-ncols1_1-ncols2_8.cu fattn-mma-f16-instance-ncols1_16-ncols2_1.cu fattn-mma-f16-instance-ncols1_16-ncols2_2.cu fattn-mma-f16-instance-ncols1_16-ncols2_4.cu fattn-mma-f16-instance-ncols1_2-ncols2_16.cu fattn-mma-f16-instance-ncols1_2-ncols2_32.cu fattn-mma-f16-instance-ncols1_2-ncols2_4.cu fattn-mma-f16-instance-ncols1_2-ncols2_8.cu fattn-mma-f16-instance-ncols1_32-ncols2_1.cu fattn-mma-f16-instance-ncols1_32-ncols2_2.cu fattn-mma-f16-instance-ncols1_4-ncols2_16.cu fattn-mma-f16-instance-ncols1_4-ncols2_2.cu fattn-mma-f16-instance-ncols1_4-ncols2_4.cu fattn-mma-f16-instance-ncols1_4-ncols2_8.cu fattn-mma-f16-instance-ncols1_64-ncols2_1.cu fattn-mma-f16-instance-ncols1_8-ncols2_1.cu fattn-mma-f16-instance-ncols1_8-ncols2_2.cu fattn-mma-f16-instance-ncols1_8-ncols2_4.cu fattn-mma-f16-instance-ncols1_8-ncols2_8.cu fattn-tile-instance-dkq112-dv112.cu fattn-tile-instance-dkq128-dv128.cu fattn-tile-instance-dkq256-dv256.cu fattn-tile-instance-dkq40-dv40.cu fattn-tile-instance-dkq576-dv512.cu fattn-tile-instance-dkq64-dv64.cu fattn-tile-instance-dkq72-dv72.cu fattn-tile-instance-dkq80-dv80.cu fattn-tile-instance-dkq96-dv96.cu fattn-vec-instance-f16-f16.cu fattn-vec-instance-f16-q4_0.cu fattn-vec-instance-f16-q4_1.cu fattn-vec-instance-f16-q5_0.cu fattn-vec-instance-f16-q5_1.cu fattn-vec-instance-f16-q8_0.cu fattn-vec-instance-q4_0-f16.cu fattn-vec-instance-q4_0-q4_0.cu fattn-vec-instance-q4_0-q4_1.cu fattn-vec-instance-q4_0-q5_0.cu fattn-vec-instance-q4_0-q5_1.cu fattn-vec-instance-q4_0-q8_0.cu fattn-vec-instance-q4_1-f16.cu fattn-vec-instance-q4_1-q4_0.cu fattn-vec-instance-q4_1-q4_1.cu fattn-vec-instance-q4_1-q5_0.cu fattn-vec-instance-q4_1-q5_1.cu fattn-vec-instance-q4_1-q8_0.cu fattn-vec-instance-q5_0-f16.cu fattn-vec-instance-q5_0-q4_0.cu fattn-vec-instance-q5_0-q4_1.cu fattn-vec-instance-q5_0-q5_0.cu fattn-vec-instance-q5_0-q5_1.cu 
fattn-vec-instance-q5_0-q8_0.cu fattn-vec-instance-q5_1-f16.cu fattn-vec-instance-q5_1-q4_0.cu fattn-vec-instance-q5_1-q4_1.cu fattn-vec-instance-q5_1-q5_0.cu fattn-vec-instance-q5_1-q5_1.cu fattn-vec-instance-q5_1-q8_0.cu fattn-vec-instance-q8_0-f16.cu fattn-vec-instance-q8_0-q4_0.cu fattn-vec-instance-q8_0-q4_1.cu fattn-vec-instance-q8_0-q5_0.cu fattn-vec-instance-q8_0-q5_1.cu fattn-vec-instance-q8_0-q8_0.cu generate_cu_files.py mmf-instance-ncols_1.cu mmf-instance-ncols_10.cu mmf-instance-ncols_11.cu mmf-instance-ncols_12.cu mmf-instance-ncols_13.cu mmf-instance-ncols_14.cu mmf-instance-ncols_15.cu mmf-instance-ncols_16.cu mmf-instance-ncols_2.cu mmf-instance-ncols_3.cu mmf-instance-ncols_4.cu mmf-instance-ncols_5.cu mmf-instance-ncols_6.cu mmf-instance-ncols_7.cu mmf-instance-ncols_8.cu mmf-instance-ncols_9.cu mmq-instance-iq1_s.cu mmq-instance-iq2_s.cu mmq-instance-iq2_xs.cu mmq-instance-iq2_xxs.cu mmq-instance-iq3_s.cu mmq-instance-iq3_xxs.cu mmq-instance-iq4_nl.cu mmq-instance-iq4_xs.cu mmq-instance-mxfp4.cu mmq-instance-q2_k.cu mmq-instance-q3_k.cu mmq-instance-q4_0.cu mmq-instance-q4_1.cu mmq-instance-q4_k.cu mmq-instance-q5_0.cu mmq-instance-q5_1.cu mmq-instance-q5_k.cu mmq-instance-q6_k.cu mmq-instance-q8_0.cuggml-hexagon
htp
CMakeLists.txt act-ops.c argsort-ops.c binary-ops.c cmake-toolchain.cmake cpy-ops.c flash-attn-ops.c get-rows-ops.c hex-dma.c hex-dma.h hex-dump.h hex-fastdiv.h hex-utils.h htp-ctx.h htp-msg.h htp-ops.h htp_iface.idl hvx-arith.h hvx-base.h hvx-copy.h hvx-div.h hvx-dump.h hvx-exp.h hvx-floor.h hvx-inverse.h hvx-reduce.h hvx-scale.h hvx-sigmoid.h hvx-sqrt.h hvx-types.h hvx-utils.h main.c matmul-ops.c rope-ops.c set-rows-ops.c softmax-ops.c sum-rows-ops.c unary-ops.c worker-pool.c worker-pool.hggml-metal
CMakeLists.txt ggml-metal-common.cpp ggml-metal-common.h ggml-metal-context.h ggml-metal-context.m ggml-metal-device.cpp ggml-metal-device.h ggml-metal-device.m ggml-metal-impl.h ggml-metal-ops.cpp ggml-metal-ops.h ggml-metal.cpp ggml-metal.metalggml-opencl
kernels
add.cl add_id.cl argsort.cl clamp.cl concat.cl conv2d.cl conv2d_f16_f32.cl cpy.cl cvt.cl diag_mask_inf.cl div.cl embed_kernel.py expm1.cl fill.cl flash_attn_f16.cl flash_attn_f32.cl flash_attn_f32_f16.cl gelu.cl gemm_moe_mxfp4_f32.cl gemv_moe_mxfp4_f32.cl gemv_noshuffle.cl gemv_noshuffle_general.cl gemv_noshuffle_general_q8_0_f32.cl get_rows.cl glu.cl group_norm.cl im2col_f16.cl im2col_f32.cl mean.cl mul.cl mul_mat_Ab_Bi_8x4.cl mul_mat_f16_f32.cl mul_mm_f16_f32_kq_kqv.cl mul_mm_f16_f32_l4_lm.cl mul_mm_f32_f32_l4_lm.cl mul_mm_q6_k_f32_l4_lm.cl mul_mm_q8_0_f32_8x4.cl mul_mm_q8_0_f32_l4_lm.cl mul_mv_f16_f16.cl mul_mv_f16_f32.cl mul_mv_f16_f32_1row.cl mul_mv_f16_f32_l4.cl mul_mv_f32_f32.cl mul_mv_id_mxfp4_f32.cl mul_mv_id_mxfp4_f32_flat.cl mul_mv_id_q4_0_f32_8x_flat.cl mul_mv_id_q8_0_f32.cl mul_mv_id_q8_0_f32_flat.cl mul_mv_mxfp4_f32.cl mul_mv_mxfp4_f32_flat.cl mul_mv_q4_0_f32.cl mul_mv_q4_0_f32_1d_16x_flat.cl mul_mv_q4_0_f32_1d_8x_flat.cl mul_mv_q4_0_f32_8x_flat.cl mul_mv_q4_0_f32_v.cl mul_mv_q4_k_f32.cl mul_mv_q6_k_f32.cl mul_mv_q6_k_f32_flat.cl mul_mv_q8_0_f32.cl mul_mv_q8_0_f32_flat.cl norm.cl pad.cl relu.cl repeat.cl rms_norm.cl rope.cl scale.cl set_rows.cl sigmoid.cl silu.cl softmax_4_f16.cl softmax_4_f32.cl softmax_f16.cl softmax_f32.cl softplus.cl solve_tri.cl sqr.cl sqrt.cl ssm_conv.cl sub.cl sum_rows.cl tanh.cl transpose.cl tri.cl tsembd.cl upscale.clggml-sycl
CMakeLists.txt add-id.cpp add-id.hpp backend.hpp binbcast.cpp binbcast.hpp common.cpp common.hpp concat.cpp concat.hpp conv.cpp conv.hpp convert.cpp convert.hpp count-equal.cpp count-equal.hpp cpy.cpp cpy.hpp dequantize.hpp dmmv.cpp dmmv.hpp element_wise.cpp element_wise.hpp gemm.hpp getrows.cpp getrows.hpp ggml-sycl.cpp gla.cpp gla.hpp im2col.cpp im2col.hpp mmq.cpp mmq.hpp mmvq.cpp mmvq.hpp norm.cpp norm.hpp outprod.cpp outprod.hpp pad.cpp pad.hpp pad_reflect_1d.cpp pad_reflect_1d.hpp presets.hpp quantize.hpp quants.hpp repeat_back.cpp repeat_back.hpp roll.cpp roll.hpp rope.cpp rope.hpp set.cpp set.hpp set_rows.cpp set_rows.hpp softmax.cpp softmax.hpp ssm_conv.cpp ssm_conv.hpp sycl_hw.cpp sycl_hw.hpp tsembd.cpp tsembd.hpp vecdotq.hpp wkv.cpp wkv.hppggml-virtgpu
backend
CMakeLists.txt apir_cs_ggml-rpc-back.cpp backend-convert.h backend-dispatched-backend.cpp backend-dispatched-buffer-type.cpp backend-dispatched-buffer.cpp backend-dispatched-device.cpp backend-dispatched.cpp backend-dispatched.gen.h backend-dispatched.h backend-virgl-apir.h backend.cppggml-vulkan
vulkan-shaders
CMakeLists.txt abs.comp acc.comp add.comp add1.comp add_id.comp arange.comp argmax.comp argsort.comp argsort_large.comp ceil.comp clamp.comp concat.comp contig_copy.comp conv2d_dw.comp conv2d_mm.comp conv_transpose_1d.comp copy.comp copy_from_quant.comp copy_to_quant.comp copy_transpose.comp cos.comp count_equal.comp count_experts.comp cumsum.comp cumsum_multipass1.comp cumsum_multipass2.comp dequant_f32.comp dequant_funcs.glsl dequant_funcs_cm2.glsl dequant_head.glsl dequant_iq1_m.comp dequant_iq1_s.comp dequant_iq2_s.comp dequant_iq2_xs.comp dequant_iq2_xxs.comp dequant_iq3_s.comp dequant_iq3_xxs.comp dequant_iq4_nl.comp dequant_iq4_xs.comp dequant_mxfp4.comp dequant_q2_k.comp dequant_q3_k.comp dequant_q4_0.comp dequant_q4_1.comp dequant_q4_k.comp dequant_q5_0.comp dequant_q5_1.comp dequant_q5_k.comp dequant_q6_k.comp dequant_q8_0.comp diag.comp diag_mask_inf.comp div.comp exp.comp fill.comp flash_attn.comp flash_attn_base.glsl flash_attn_cm1.comp flash_attn_cm2.comp flash_attn_mask_opt.comp flash_attn_split_k_reduce.comp floor.comp geglu.comp geglu_erf.comp geglu_quick.comp gelu.comp gelu_erf.comp gelu_quick.comp generic_binary_head.glsl generic_head.glsl generic_unary_head.glsl get_rows.comp get_rows_quant.comp glu_head.glsl glu_main.glsl group_norm.comp hardsigmoid.comp hardswish.comp im2col.comp im2col_3d.comp l2_norm.comp leaky_relu.comp log.comp mul.comp mul_mat_split_k_reduce.comp mul_mat_vec.comp mul_mat_vec_base.glsl mul_mat_vec_iface.glsl mul_mat_vec_iq1_m.comp mul_mat_vec_iq1_s.comp mul_mat_vec_iq2_s.comp mul_mat_vec_iq2_xs.comp mul_mat_vec_iq2_xxs.comp mul_mat_vec_iq3_s.comp mul_mat_vec_iq3_xxs.comp mul_mat_vec_nc.comp mul_mat_vec_p021.comp mul_mat_vec_q2_k.comp mul_mat_vec_q3_k.comp mul_mat_vec_q4_k.comp mul_mat_vec_q5_k.comp mul_mat_vec_q6_k.comp mul_mat_vecq.comp mul_mat_vecq_funcs.glsl mul_mm.comp mul_mm_cm2.comp mul_mm_funcs.glsl mul_mm_id_funcs.glsl mul_mmq.comp mul_mmq_funcs.glsl mul_mmq_shmem_types.glsl multi_add.comp neg.comp norm.comp 
opt_step_adamw.comp opt_step_sgd.comp pad.comp pool2d.comp quantize_q8_1.comp reglu.comp relu.comp repeat.comp repeat_back.comp rms_norm.comp rms_norm_back.comp rms_norm_partials.comp roll.comp rope_funcs.glsl rope_head.glsl rope_multi.comp rope_neox.comp rope_norm.comp rope_params.glsl rope_vision.comp round.comp rte.glsl scale.comp sigmoid.comp silu.comp silu_back.comp sin.comp soft_max.comp soft_max_back.comp soft_max_large1.comp soft_max_large2.comp soft_max_large3.comp soft_max_large_common.glsl softplus.comp solve_tri.comp sqrt.comp square.comp ssm_conv.comp ssm_scan.comp step.comp sub.comp sum_rows.comp sum_rows.glsl swiglu.comp swiglu_oai.comp tanh.comp timestep_embedding.comp topk_argsort.comp topk_moe.comp topk_nary_search.comp tri.comp trunc.comp types.glsl upscale.comp utils.glsl vulkan-shaders-gen.cpp wkv6.comp wkv7.comp xielu.compggml-webgpu
wgsl-shaders
argmax.wgsl argsort.wgsl argsort_merge.wgsl binary.wgsl common_decls.tmpl cpy.tmpl.wgsl cumsum.wgsl embed_wgsl.py flash_attn.wgsl get_rows.tmpl.wgsl glu.tmpl.wgsl memset.wgsl mul_mat.tmpl.wgsl mul_mat_decls.tmpl mul_mat_reg_tile.tmpl.wgsl mul_mat_subgroup_matrix.tmpl.wgsl mul_mat_vec.tmpl.wgsl pad.wgsl rms_norm.wgsl rope.tmpl.wgsl scale.tmpl.wgsl set_rows.wgsl soft_max.tmpl.wgsl sum_rows.wgsl unary.wgslgguf-py
gguf
scripts
gguf_convert_endian.py gguf_dump.py gguf_editor_gui.py gguf_hash.py gguf_new_metadata.py gguf_set_metadata.pygrammars
README.md arithmetic.gbnf c.gbnf chess.gbnf english.gbnf japanese.gbnf json.gbnf json_arr.gbnf list.gbnfmedia
llama0-banner.png llama0-logo.png llama1-banner.png llama1-icon-transparent.png llama1-icon-transparent.svg llama1-icon.png llama1-icon.svg llama1-logo.png llama1-logo.svg matmul.png matmul.svgmodels
templates
Apertus-8B-Instruct.jinja ByteDance-Seed-OSS.jinja CohereForAI-c4ai-command-r-plus-tool_use.jinja CohereForAI-c4ai-command-r7b-12-2024-tool_use.jinja GLM-4.6.jinja Kimi-K2-Instruct.jinja Kimi-K2-Thinking.jinja MiMo-VL.jinja MiniMax-M2.jinja Mistral-Small-3.2-24B-Instruct-2506.jinja NVIDIA-Nemotron-3-Nano-30B-A3B-BF16.jinja NVIDIA-Nemotron-Nano-v2.jinja NousResearch-Hermes-2-Pro-Llama-3-8B-tool_use.jinja NousResearch-Hermes-3-Llama-3.1-8B-tool_use.jinja Qwen-QwQ-32B.jinja Qwen-Qwen2.5-7B-Instruct.jinja Qwen-Qwen3-0.6B.jinja Qwen3-Coder.jinja README.md deepseek-ai-DeepSeek-R1-Distill-Llama-8B.jinja deepseek-ai-DeepSeek-R1-Distill-Qwen-32B.jinja deepseek-ai-DeepSeek-V3.1.jinja fireworks-ai-llama-3-firefunction-v2.jinja google-gemma-2-2b-it.jinja ibm-granite-granite-3.3-2B-Instruct.jinja llama-cpp-deepseek-r1.jinja llama-cpp-lfm2.jinja llama-cpp-rwkv-world.jinja meetkai-functionary-medium-v3.1.jinja meetkai-functionary-medium-v3.2.jinja meta-llama-Llama-3.1-8B-Instruct.jinja meta-llama-Llama-3.2-3B-Instruct.jinja meta-llama-Llama-3.3-70B-Instruct.jinja microsoft-Phi-3.5-mini-instruct.jinja mistralai-Ministral-3-14B-Reasoning-2512.jinja mistralai-Mistral-Nemo-Instruct-2407.jinja moonshotai-Kimi-K2.jinja openai-gpt-oss-120b.jinja unsloth-Apriel-1.5.jinja unsloth-mistral-Devstral-Small-2507.jinja upstage-Solar-Open-100B.jinjarequirements
requirements-all.txt requirements-compare-llama-bench.txt requirements-convert_hf_to_gguf.txt requirements-convert_hf_to_gguf_update.txt requirements-convert_legacy_llama.txt requirements-convert_llama_ggml_to_gguf.txt requirements-convert_lora_to_gguf.txt requirements-gguf_editor_gui.txt requirements-pydantic.txt requirements-server-bench.txt requirements-test-tokenizer-random.txt requirements-tool_bench.txtscripts
bench-models.sh build-info.sh check-requirements.sh compare-commits.sh compare-llama-bench.py compare-logprobs.py create_ops_docs.py debug-test.sh fetch_server_test_models.py gen-authors.sh gen-unicode-data.py get-flags.mk get-hellaswag.sh get-pg.sh get-wikitext-103.sh get-wikitext-2.sh get-winogrande.sh get_chat_template.py hf.sh install-oneapi.bat pr2wt.sh serve-static.js server-bench.py sync-ggml-am.sh sync-ggml.last sync-ggml.sh sync_vendor.py tool_bench.py tool_bench.sh verify-checksum-models.py xxd.cmakesrc
models
afmoe.cpp apertus.cpp arcee.cpp arctic.cpp arwkv7.cpp baichuan.cpp bailingmoe.cpp bailingmoe2.cpp bert.cpp bitnet.cpp bloom.cpp chameleon.cpp chatglm.cpp codeshell.cpp cogvlm.cpp cohere2-iswa.cpp command-r.cpp dbrx.cpp deci.cpp deepseek.cpp deepseek2.cpp dots1.cpp dream.cpp ernie4-5-moe.cpp ernie4-5.cpp exaone-moe.cpp exaone.cpp exaone4.cpp falcon-h1.cpp falcon.cpp gemma-embedding.cpp gemma.cpp gemma2-iswa.cpp gemma3.cpp gemma3n-iswa.cpp glm4-moe.cpp glm4.cpp gpt2.cpp gptneox.cpp granite-hybrid.cpp granite.cpp graph-context-mamba.cpp grok.cpp grovemoe.cpp hunyuan-dense.cpp hunyuan-moe.cpp internlm2.cpp jais.cpp jamba.cpp kimi-linear.cpp lfm2.cpp llada-moe.cpp llada.cpp llama-iswa.cpp llama.cpp maincoder.cpp mamba.cpp mimo2-iswa.cpp minicpm3.cpp minimax-m2.cpp mistral3.cpp models.h modern-bert.cpp mpt.cpp nemotron-h.cpp nemotron.cpp neo-bert.cpp olmo.cpp olmo2.cpp olmoe.cpp openai-moe-iswa.cpp openelm.cpp orion.cpp pangu-embedded.cpp phi2.cpp phi3.cpp plamo.cpp plamo2.cpp plamo3.cpp plm.cpp qwen.cpp qwen2.cpp qwen2moe.cpp qwen2vl.cpp qwen3.cpp qwen35.cpp qwen35moe.cpp qwen3moe.cpp qwen3next.cpp qwen3vl-moe.cpp qwen3vl.cpp refact.cpp rnd1.cpp rwkv6-base.cpp rwkv6.cpp rwkv6qwen2.cpp rwkv7-base.cpp rwkv7.cpp seed-oss.cpp smallthinker.cpp smollm3.cpp stablelm.cpp starcoder.cpp starcoder2.cpp step35-iswa.cpp t5-dec.cpp t5-enc.cpp wavtokenizer-dec.cpp xverse.cpptests
peg-parser
simple-tokenize.cpp simple-tokenize.h test-basic.cpp test-gbnf-generation.cpp test-json-parser.cpp test-json-serialization.cpp test-unicode.cpp tests.htools
cvector-generator
CMakeLists.txt README.md completions.txt cvector-generator.cpp mean.hpp negative.txt pca.hpp positive.txtmtmd
legacy-models
convert_image_encoder_to_gguf.py glmedge-convert-image-encoder-to-gguf.py glmedge-surgery.py llava_surgery.py llava_surgery_v2.py minicpmv-convert-image-encoder-to-gguf.py minicpmv-surgery.pymodels
cogvlm.cpp conformer.cpp glm4v.cpp internvl.cpp kimik25.cpp kimivl.cpp llama4.cpp llava.cpp minicpmv.cpp mobilenetv5.cpp models.h pixtral.cpp qwen2vl.cpp qwen3vl.cpp siglip.cpp whisper-enc.cpp youtuvl.cppserver
public_legacy
colorthemes.css completion.js favicon.ico index-new.html index.html index.js json-schema-to-grammar.mjs loading.html prompt-formats.js style.css system-prompts.js theme-beeninorder.css theme-ketivah.css theme-mangotango.css theme-playground.css theme-polarnight.css theme-snowstorm.csspublic_simplechat
datautils.mjs index.html readme.md simplechat.css simplechat.js simplechat_screens.webp ui.mjstests
unit
test_basic.py test_chat_completion.py test_compat_anthropic.py test_compat_oai_responses.py test_completion.py test_ctx_shift.py test_embedding.py test_infill.py test_lora.py test_rerank.py test_router.py test_security.py test_sleep.py test_slot_save.py test_speculative.py test_template.py test_tokenize.py test_tool_call.py test_vision_api.pywebui
.storybook
ModeWatcherDecorator.svelte TooltipProviderDecorator.svelte main.ts preview.ts vitest.setup.tssrc
lib
components
app
chat
ChatAttachments
ChatAttachmentPreview.svelte ChatAttachmentThumbnailFile.svelte ChatAttachmentThumbnailImage.svelte ChatAttachmentsList.svelte ChatAttachmentsViewAll.svelteChatForm
ChatFormActions
ChatFormActionFileAttachments.svelte ChatFormActionRecord.svelte ChatFormActionSubmit.svelte ChatFormActions.svelteChatMessages
ChatMessage.svelte ChatMessageActions.svelte ChatMessageAssistant.svelte ChatMessageBranchingControls.svelte ChatMessageEditForm.svelte ChatMessageStatistics.svelte ChatMessageSystem.svelte ChatMessageThinkingBlock.svelte ChatMessageUser.svelte ChatMessages.svelteChatScreen
ChatScreen.svelte ChatScreenDragOverlay.svelte ChatScreenHeader.svelte ChatScreenProcessingInfo.sveltedialogs
DialogChatAttachmentPreview.svelte DialogChatAttachmentsViewAll.svelte DialogChatError.svelte DialogChatSettings.svelte DialogConfirmation.svelte DialogConversationSelection.svelte DialogConversationTitleUpdate.svelte DialogEmptyFileAlert.svelte DialogModelInformation.svelte DialogModelNotAvailable.sveltemisc
ActionButton.svelte ActionDropdown.svelte BadgeChatStatistic.svelte BadgeInfo.svelte BadgeModality.svelte CodePreviewDialog.svelte ConversationSelection.svelte CopyToClipboardIcon.svelte KeyboardShortcutInfo.svelte MarkdownContent.svelte RemoveButton.svelte SearchInput.svelte SyntaxHighlightedCode.svelteui
alert-dialog
alert-dialog-action.svelte alert-dialog-cancel.svelte alert-dialog-content.svelte alert-dialog-description.svelte alert-dialog-footer.svelte alert-dialog-header.svelte alert-dialog-overlay.svelte alert-dialog-title.svelte alert-dialog-trigger.svelte index.tscard
card-action.svelte card-content.svelte card-description.svelte card-footer.svelte card-header.svelte card-title.svelte card.svelte index.tsdialog
dialog-close.svelte dialog-content.svelte dialog-description.svelte dialog-footer.svelte dialog-header.svelte dialog-overlay.svelte dialog-title.svelte dialog-trigger.svelte index.tsdropdown-menu
dropdown-menu-checkbox-item.svelte dropdown-menu-content.svelte dropdown-menu-group-heading.svelte dropdown-menu-group.svelte dropdown-menu-item.svelte dropdown-menu-label.svelte dropdown-menu-radio-group.svelte dropdown-menu-radio-item.svelte dropdown-menu-separator.svelte dropdown-menu-shortcut.svelte dropdown-menu-sub-content.svelte dropdown-menu-sub-trigger.svelte dropdown-menu-trigger.svelte index.tspopover
index.ts popover-close.svelte popover-content.svelte popover-portal.svelte popover-trigger.svelte popover.svelteselect
index.ts select-content.svelte select-group-heading.svelte select-group.svelte select-item.svelte select-label.svelte select-scroll-down-button.svelte select-scroll-up-button.svelte select-separator.svelte select-trigger.sveltesheet
index.ts sheet-close.svelte sheet-content.svelte sheet-description.svelte sheet-footer.svelte sheet-header.svelte sheet-overlay.svelte sheet-title.svelte sheet-trigger.sveltesidebar
constants.ts context.svelte.ts index.ts sidebar-content.svelte sidebar-footer.svelte sidebar-group-action.svelte sidebar-group-content.svelte sidebar-group-label.svelte sidebar-group.svelte sidebar-header.svelte sidebar-input.svelte sidebar-inset.svelte sidebar-menu-action.svelte sidebar-menu-badge.svelte sidebar-menu-button.svelte sidebar-menu-item.svelte sidebar-menu-skeleton.svelte sidebar-menu-sub-button.svelte sidebar-menu-sub-item.svelte sidebar-menu-sub.svelte sidebar-menu.svelte sidebar-provider.svelte sidebar-rail.svelte sidebar-separator.svelte sidebar-trigger.svelte sidebar.sveltetable
index.ts table-body.svelte table-caption.svelte table-cell.svelte table-footer.svelte table-head.svelte table-header.svelte table-row.svelte table.svelteconstants
auto-scroll.ts binary-detection.ts default-context.ts floating-ui-constraints.ts icons.ts input-classes.ts latex-protection.ts literal-html.ts localstorage-keys.ts max-bundle-size.ts precision.ts processing-info.ts settings-config.ts supported-file-types.ts table-html-restorer.ts tooltip-config.ts viewport.tsstores
chat.svelte.ts conversations.svelte.ts models.svelte.ts persisted.svelte.ts server.svelte.ts settings.svelte.tsutils
api-headers.ts api-key-validation.ts attachment-display.ts attachment-type.ts audio-recording.ts autoresize-textarea.ts branching.ts browser-only.ts clipboard.ts config-helpers.ts conversation-utils.ts convert-files-to-extra.ts file-preview.ts file-type.ts formatters.ts index.ts is-ime-composing.ts latex-protection.ts modality-file-validation.ts model-names.ts pdf-processing.ts portal-to-body.ts precision.ts process-uploaded-files.ts svg-to-png.ts syntax-highlight-language.ts text-files.ts text.ts webp-to-png.tstests
llama.cpp/src/llama-model.cpp
raw
1#include "llama-model.h"
2
3#include "llama-impl.h"
4#include "llama-mmap.h"
5#include "llama-cparams.h"
6#include "llama-model-loader.h"
7
8#include "llama-kv-cache.h"
9#include "llama-kv-cache-iswa.h"
10#include "llama-memory-hybrid.h"
11#include "llama-memory-hybrid-iswa.h"
12#include "llama-memory-recurrent.h"
13
14#include "ggml-cpp.h"
15
16#include "models/models.h"
17
18#include <algorithm>
19#include <cassert>
20#include <cfloat>
21#include <cstring>
22#include <cmath>
23#include <functional>
24#include <map>
25#include <regex>
26#include <sstream>
27#include <stdexcept>
28
// Human-readable name for a model size/class identifier (llm_type),
// e.g. "7B", "8x22B", "30B.A3B". The returned pointer is a static string
// literal and must not be freed. Unknown types map to the sentinel "?B".
const char * llm_type_name(llm_type type) {
    switch (type) {
        case LLM_TYPE_14M: return "14M";
        case LLM_TYPE_17M: return "17M";
        case LLM_TYPE_22M: return "22M";
        case LLM_TYPE_33M: return "33M";
        case LLM_TYPE_47M: return "47M";
        case LLM_TYPE_60M: return "60M";
        case LLM_TYPE_70M: return "70M";
        case LLM_TYPE_80M: return "80M";
        case LLM_TYPE_109M: return "109M";
        case LLM_TYPE_137M: return "137M";
        case LLM_TYPE_140M: return "140M";
        case LLM_TYPE_149M: return "149M";
        case LLM_TYPE_160M: return "160M";
        case LLM_TYPE_190M: return "190M";
        case LLM_TYPE_220M: return "220M";
        case LLM_TYPE_250M: return "250M";
        case LLM_TYPE_256M: return "256M";
        case LLM_TYPE_270M: return "270M";
        case LLM_TYPE_335M: return "335M";
        case LLM_TYPE_350M: return "350M";
        case LLM_TYPE_360M: return "360M";
        case LLM_TYPE_395M: return "395M";
        case LLM_TYPE_410M: return "410M";
        case LLM_TYPE_450M: return "450M";
        case LLM_TYPE_475M: return "475M";
        case LLM_TYPE_558M: return "558M";
        case LLM_TYPE_700M: return "700M";
        case LLM_TYPE_770M: return "770M";
        case LLM_TYPE_780M: return "780M";
        case LLM_TYPE_950M: return "950M";
        case LLM_TYPE_0_3B: return "0.3B";
        case LLM_TYPE_0_5B: return "0.5B";
        case LLM_TYPE_0_6B: return "0.6B";
        case LLM_TYPE_1B: return "1B";
        case LLM_TYPE_1_2B: return "1.2B";
        case LLM_TYPE_1_3B: return "1.3B";
        case LLM_TYPE_1_4B: return "1.4B";
        case LLM_TYPE_1_5B: return "1.5B";
        case LLM_TYPE_1_6B: return "1.6B";
        case LLM_TYPE_1_7B: return "1.7B";
        case LLM_TYPE_1_8B: return "1.8B";
        case LLM_TYPE_2B: return "2B";
        case LLM_TYPE_2_6B: return "2.6B";
        case LLM_TYPE_2_8B: return "2.8B";
        case LLM_TYPE_2_9B: return "2.9B";
        case LLM_TYPE_3B: return "3B";
        case LLM_TYPE_4B: return "4B";
        case LLM_TYPE_6B: return "6B";
        case LLM_TYPE_6_9B: return "6.9B";
        case LLM_TYPE_7B: return "7B";
        case LLM_TYPE_8B: return "8B";
        case LLM_TYPE_9B: return "9B";
        case LLM_TYPE_11B: return "11B";
        case LLM_TYPE_12B: return "12B";
        case LLM_TYPE_13B: return "13B";
        case LLM_TYPE_14B: return "14B";
        case LLM_TYPE_15B: return "15B";
        case LLM_TYPE_16B: return "16B";
        case LLM_TYPE_20B: return "20B";
        case LLM_TYPE_26B: return "26B";
        case LLM_TYPE_27B: return "27B";
        case LLM_TYPE_30B: return "30B";
        case LLM_TYPE_32B: return "32B";
        case LLM_TYPE_34B: return "34B";
        case LLM_TYPE_35B: return "35B";
        case LLM_TYPE_36B: return "36B";
        case LLM_TYPE_40B: return "40B";
        case LLM_TYPE_65B: return "65B";
        case LLM_TYPE_70B: return "70B";
        case LLM_TYPE_120B: return "120B";
        case LLM_TYPE_142B: return "142B";
        case LLM_TYPE_236B: return "236B";
        case LLM_TYPE_290B: return "290B";
        case LLM_TYPE_314B: return "314B";
        case LLM_TYPE_405B: return "405B";
        case LLM_TYPE_671B: return "671B";
        // named size classes with approximate parameter counts
        case LLM_TYPE_SMALL: return "0.1B";
        case LLM_TYPE_MEDIUM: return "0.4B";
        case LLM_TYPE_LARGE: return "0.8B";
        case LLM_TYPE_XL: return "1.5B";
        // "A<n>" denotes active parameters of a mixture-of-experts model
        case LLM_TYPE_A1_7B: return "A1.7B";
        case LLM_TYPE_A2_7B: return "A2.7B";
        // "<n>x<m>" denotes expert-count x per-expert size
        case LLM_TYPE_8x7B: return "8x7B";
        case LLM_TYPE_8x22B: return "8x22B";
        case LLM_TYPE_16x12B: return "16x12B";
        case LLM_TYPE_16x3_8B: return "16x3.8B";
        case LLM_TYPE_10B_128x3_66B: return "10B+128x3.66B";
        case LLM_TYPE_57B_A14B: return "57B.A14B";
        case LLM_TYPE_17B_16E: return "17Bx16E (Scout)";
        case LLM_TYPE_17B_128E: return "17Bx128E (Maverick)";
        case LLM_TYPE_A13B: return "A13B";
        // "<total>B.A<active>B" denotes total/active parameters of a MoE model
        case LLM_TYPE_7B_A1B: return "7B.A1B";
        case LLM_TYPE_8B_A1B: return "8B.A1B";
        case LLM_TYPE_16B_A1B: return "16B.A1B";
        case LLM_TYPE_21B_A3B: return "21B.A3B";
        case LLM_TYPE_30B_A3B: return "30B.A3B";
        case LLM_TYPE_31B_A3_5B: return "31B.A3.5B";
        case LLM_TYPE_35B_A3B: return "35B.A3B";
        case LLM_TYPE_48B_A3B: return "48B.A3B";
        case LLM_TYPE_80B_A3B: return "80B.A3B";
        case LLM_TYPE_100B_A6B: return "100B.A6B";
        case LLM_TYPE_102B_A12B: return "102B.A12B";
        case LLM_TYPE_106B_A12B: return "106B.A12B";
        case LLM_TYPE_196B_A11B: return "196B.A11B";
        case LLM_TYPE_230B_A10B: return "230B.A10B";
        case LLM_TYPE_235B_A22B: return "235B.A22B";
        case LLM_TYPE_300B_A47B: return "300B.A47B";
        case LLM_TYPE_310B_A15B: return "310B.A15B";
        case LLM_TYPE_355B_A32B: return "355B.A32B";
        case LLM_TYPE_E2B: return "E2B";
        case LLM_TYPE_E4B: return "E4B";
        default: return "?B";
    }
}
145
146static const char * llama_expert_gating_func_name(llama_expert_gating_func_type type) {
147 switch (type) {
148 case LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX: return "softmax";
149 case LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID: return "sigmoid";
150 default: return "unknown";
151 }
152}
153
// Canonical string names for the RoPE scaling types, keyed by enum value.
// NOTE: LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED deliberately has no entry here —
// callers that look up a name must account for missing keys, and the reverse
// lookup (llama_rope_scaling_type_from_string) falls back to UNSPECIFIED.
static const std::map<llama_rope_scaling_type, const char *> LLAMA_ROPE_SCALING_TYPES = {
    { LLAMA_ROPE_SCALING_TYPE_NONE, "none" },
    { LLAMA_ROPE_SCALING_TYPE_LINEAR, "linear" },
    { LLAMA_ROPE_SCALING_TYPE_YARN, "yarn" },
    { LLAMA_ROPE_SCALING_TYPE_LONGROPE, "longrope" },
};
160
161std::string llama_rope_scaling_type_name(llama_rope_scaling_type rope_scaling_type) {
162 return LLAMA_ROPE_SCALING_TYPES.at(rope_scaling_type);
163}
164
165static llama_rope_scaling_type llama_rope_scaling_type_from_string(const std::string & name) {
166 for (const auto & kv : LLAMA_ROPE_SCALING_TYPES) {
167 if (kv.second == name) {
168 return (llama_rope_scaling_type) kv.first;
169 }
170 }
171
172 return LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED;
173}
174
175// checks if the weight tensor can be used with the specified buffer type and device
176static bool weight_buft_supported(const llama_hparams & hparams, ggml_tensor * w, ggml_op op, ggml_backend_buffer_type_t buft, ggml_backend_dev_t dev) {
177 GGML_ASSERT(w != nullptr);
178
179 if (op == GGML_OP_NONE) {
180 return true;
181 }
182
183 ggml_init_params params = {
184 /*.mem_size =*/ ggml_tensor_overhead()*8,
185 /*.mem_buffer =*/ NULL,
186 /*.no_alloc =*/ true,
187 };
188 ggml_context_ptr ctx_ptr { ggml_init(params) };
189 if (!ctx_ptr) {
190 throw std::runtime_error(format("failed to create ggml context"));
191 }
192 ggml_context * ctx = ctx_ptr.get();
193
194 ggml_tensor * op_tensor = nullptr;
195
196 switch (op) {
197 case GGML_OP_GET_ROWS:
198 {
199 ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 512);
200 op_tensor = ggml_get_rows(ctx, w, b);
201 } break;
202 case GGML_OP_MUL_MAT:
203 {
204 ggml_tensor * b = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, w->ne[0], 512, w->ne[2], w->ne[3]);
205 op_tensor = ggml_mul_mat(ctx, w, b);
206 } break;
207 case GGML_OP_MUL_MAT_ID:
208 {
209 int n_expert_used = hparams.n_expert_used;
210 ggml_tensor * b = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, w->ne[0], n_expert_used, 512);
211 ggml_tensor * ids = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, n_expert_used, 512);
212 op_tensor = ggml_mul_mat_id(ctx, w, b, ids);
213 } break;
214 case GGML_OP_ADD:
215 {
216 ggml_tensor * a = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, w->ne[0], w->ne[1], w->ne[2], w->ne[3]);
217 op_tensor = ggml_add(ctx, a, w);
218 } break;
219 case GGML_OP_ADD_ID:
220 {
221 int n_expert_used = hparams.n_expert_used;
222 ggml_tensor * a = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, w->ne[0], n_expert_used, 512);
223 ggml_tensor * c = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, n_expert_used, 512);
224 op_tensor = ggml_add_id(ctx, a, w, c);
225 } break;
226 case GGML_OP_MUL:
227 {
228 ggml_tensor * a = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, w->ne[0], w->ne[1], w->ne[2], w->ne[3]);
229 op_tensor = ggml_mul(ctx, a, w);
230 } break;
231 case GGML_OP_DIV:
232 {
233 ggml_tensor * a = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, w->ne[0]);
234 op_tensor = ggml_div(ctx, a, w);
235 } break;
236 case GGML_OP_ROPE:
237 {
238 int n_embd_head = hparams.n_embd_head_v;
239 int n_head = hparams.n_head();
240 ggml_tensor * a = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, n_embd_head, n_head, 512);
241 ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 512);
242 op_tensor = ggml_rope_ext(
243 ctx, a, b, w,
244 0, 0, 0, 0, 0,
245 0, 0, 0, 0
246 );
247
248 } break;
249 case GGML_OP_SSM_CONV:
250 {
251 const int64_t n_seq_tokens = 512;
252 const int64_t n_seqs = 3;
253 ggml_tensor * conv_x = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, w->ne[0] - 1 + n_seq_tokens, w->ne[1], n_seqs);
254 op_tensor = ggml_ssm_conv(ctx, conv_x, w);
255 } break;
256 case GGML_OP_SSM_SCAN:
257 {
258 // w is ssm_a, which is used to distinguish Mamba-1 and Mamba-2
259 const int64_t d_state = w->ne[0] == 1 ? hparams.ssm_d_state : w->ne[0];
260 const int64_t n_head = w->ne[1];
261 const int64_t head_dim = hparams.ssm_d_inner / n_head;
262 const int64_t n_group = hparams.ssm_n_group ? hparams.ssm_n_group : 1;
263 const int64_t n_seq_tokens = 512;
264 const int64_t n_seqs = 3;
265 ggml_tensor * s = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, d_state, head_dim, n_head, n_seqs);
266 ggml_tensor * x = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, head_dim, n_head, n_seq_tokens, n_seqs);
267 ggml_tensor * dt = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, n_head, n_seq_tokens, n_seqs);
268 ggml_tensor * B = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, d_state, n_group, n_seq_tokens, n_seqs);
269 ggml_tensor * C = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, d_state, n_group, n_seq_tokens, n_seqs);
270 ggml_tensor * ids = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n_seqs);
271 op_tensor = ggml_ssm_scan(ctx, s, x, dt, w, B, C, ids);
272 } break;
273 case GGML_OP_RWKV_WKV6:
274 {
275 // FIXME
276 const int64_t S = 123;
277 const int64_t H = 123;
278 const int64_t n_tokens = 123;
279 const int64_t n_seqs = 123;
280 ggml_tensor * k = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, S, H, n_tokens);
281 ggml_tensor * v = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, S, H, n_tokens);
282 ggml_tensor * r = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, S, H, n_tokens);
283 ggml_tensor * tf = w;
284 ggml_tensor * td = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, S, H, n_tokens);
285 ggml_tensor * state = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, S, n_seqs, S, H);
286 op_tensor = ggml_rwkv_wkv6(ctx, k, v, r, tf, td, state);
287 } break;
288 case GGML_OP_IM2COL:
289 {
290 const int n_embd_inp = hparams.n_embd_inp();
291 ggml_tensor * b = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, n_embd_inp, w->ne[1], 1, 1);
292 op_tensor = ggml_im2col(ctx, w, b, 1, 0, 0, 0, 1, 0, false, GGML_TYPE_F16);
293 } break;
294 case GGML_OP_SCALE:
295 {
296 op_tensor = ggml_scale(ctx, w, 1.0f);
297 } break;
298 default:
299 GGML_ABORT("%s: missing test for op %s for tensor %s", __func__, ggml_op_name(op), w->name);
300 }
301
302 // create a temporary dummy buffer for the weight so that supports_op can check the buffer type
303 GGML_ASSERT(w->buffer == nullptr);
304 w->buffer = ggml_backend_buft_alloc_buffer(buft, 0);
305 bool op_supported = ggml_backend_dev_supports_op(dev, op_tensor);
306 ggml_backend_buffer_free(w->buffer);
307 w->buffer = nullptr;
308
309 return op_supported;
310}
311
// lists of buffer types used for each layer
// each entry pairs a device with one of its buffer types; entries are ordered by
// preference, so the first supported entry is the one that gets used (see select_weight_buft)
using buft_list_t = std::vector<std::pair<ggml_backend_dev_t, ggml_backend_buffer_type_t>>;
314
315// find the first buffer type in the list that can use the tensor
316static ggml_backend_buffer_type_t select_weight_buft(const llama_hparams & hparams, ggml_tensor * tensor, ggml_op op, const buft_list_t & buft_list) {
317 GGML_ASSERT(!buft_list.empty());
318 for (const auto & cur : buft_list) {
319 ggml_backend_dev_t cur_dev = cur.first;
320 ggml_backend_buffer_type_t cur_buft = cur.second;
321 if (weight_buft_supported(hparams, tensor, op, cur_buft, cur_dev)) {
322 return cur_buft;
323 }
324 }
325
326 return nullptr;
327}
328
329// CPU: ACCEL -> GPU host -> CPU extra -> CPU
330static buft_list_t make_cpu_buft_list(const std::vector<ggml_backend_dev_t> & devices, bool use_extra_bufts, bool no_host) {
331 buft_list_t buft_list;
332
333 // add ACCEL buffer types
334 for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
335 ggml_backend_dev_t dev = ggml_backend_dev_get(i);
336 if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_ACCEL) {
337 auto * buft = ggml_backend_dev_buffer_type(dev);
338 // skip
339 if (buft != ggml_backend_cpu_buffer_type()) {
340 buft_list.emplace_back(dev, buft);
341 }
342 }
343 }
344
345 // add a host buffer type
346 // storing the tensors in a host buffer is useful when the processing of large batches
347 // is offloaded to a GPU device, since it reduces the time spent on data transfers
348 // generally, this will be done using the first device in the list
349 // a better approach would be to handle this on a weight-by-weight basis using the offload_op
350 // function of the device to determine if it would benefit from being stored in a host buffer
351 if (!no_host) {
352 for (auto * dev : devices) {
353 ggml_backend_buffer_type_t buft = ggml_backend_dev_host_buffer_type(dev);
354 if (buft) {
355 buft_list.emplace_back(dev, buft);
356 break;
357 }
358 }
359 }
360
361 // add extra buffer types
362 if (use_extra_bufts) {
363 auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
364 if (cpu_dev == nullptr) {
365 throw std::runtime_error(format("%s: no CPU backend found", __func__));
366 }
367
368 auto * cpu_reg = ggml_backend_dev_backend_reg(cpu_dev);
369 auto ggml_backend_dev_get_extra_bufts_fn = (ggml_backend_dev_get_extra_bufts_t)
370 ggml_backend_reg_get_proc_address(cpu_reg, "ggml_backend_dev_get_extra_bufts");
371 if (ggml_backend_dev_get_extra_bufts_fn) {
372 ggml_backend_buffer_type_t * extra_bufts = ggml_backend_dev_get_extra_bufts_fn(cpu_dev);
373 while (extra_bufts && *extra_bufts) {
374 buft_list.emplace_back(cpu_dev, *extra_bufts);
375 ++extra_bufts;
376 }
377 }
378 }
379
380 // add the CPU buffer type
381 for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
382 ggml_backend_dev_t dev = ggml_backend_dev_get(i);
383 if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_CPU) {
384 buft_list.emplace_back(dev, ggml_backend_dev_buffer_type(dev));
385 }
386 }
387
388 return buft_list;
389}
390
391// GPU: split if LLAMA_SPLIT_MODE_ROW -> GPU
392static buft_list_t make_gpu_buft_list(ggml_backend_dev_t dev, llama_split_mode split_mode, const float * tensor_split) {
393 buft_list_t buft_list;
394
395 // add the device split buffer type if requested and available
396 if (split_mode == LLAMA_SPLIT_MODE_ROW) {
397 ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(dev);
398 auto ggml_backend_split_buffer_type_fn = (ggml_backend_split_buffer_type_t)
399 ggml_backend_reg_get_proc_address(reg, "ggml_backend_split_buffer_type");
400 if (ggml_backend_split_buffer_type_fn) {
401 size_t dev_index = [&]() {
402 auto * reg = ggml_backend_dev_backend_reg(dev);
403 for (size_t i = 0; i < ggml_backend_reg_dev_count(reg); ++i) {
404 if (ggml_backend_reg_dev_get(reg, i) == dev) {
405 return i;
406 }
407 }
408 throw std::runtime_error(format("device %s not found in its backend reg", ggml_backend_dev_name(dev)));
409 }();
410 auto * buft = ggml_backend_split_buffer_type_fn(dev_index, tensor_split);
411 if (buft != nullptr) {
412 buft_list.emplace_back(dev, buft);
413 }
414 }
415 }
416
417 // add the device default buffer type
418 buft_list.emplace_back(dev, ggml_backend_dev_buffer_type(dev));
419
420 // add the device extra buffer type (if any)
421 ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(dev);
422 auto ggml_backend_dev_get_extra_bufts_fn = (ggml_backend_dev_get_extra_bufts_t)
423 ggml_backend_reg_get_proc_address(reg, "ggml_backend_dev_get_extra_bufts");
424
425 if (ggml_backend_dev_get_extra_bufts_fn) {
426 ggml_backend_buffer_type_t * extra_bufts = ggml_backend_dev_get_extra_bufts_fn(dev);
427 while (extra_bufts && *extra_bufts) {
428 buft_list.emplace_back(dev, *extra_bufts);
429 ++extra_bufts;
430 }
431 }
432
433 return buft_list;
434}
435
// private implementation state of llama_model (pimpl idiom)
struct llama_model::impl {
    impl() = default;
    ~impl() = default;

    // total number of tensor elements, copied from the loader in load_stats()
    uint64_t n_elements = 0;

    // total number of tensor bytes, copied from the loader in load_stats()
    size_t n_bytes = 0;

    // human-readable model description (presumably built elsewhere in this file - not set in this chunk)
    std::string desc_str;

    // model memory mapped files
    llama_mmaps mappings;

    // objects representing data potentially being locked in memory
    llama_mlocks mlock_bufs;
    llama_mlocks mlock_mmaps;

    // contexts where the model tensors metadata is stored as well as the corresponding buffers:
    std::vector<std::pair<ggml_context_ptr, std::vector<ggml_backend_buffer_ptr>>> ctxs_bufs;

    // buffer-type candidate lists built by make_cpu_buft_list / make_gpu_buft_list
    buft_list_t cpu_buft_list;
    std::map<ggml_backend_dev_t, buft_list_t> gpu_buft_list;

    // device assignment for a layer: the device and the buffer-type list to pick weights from
    struct layer_dev {
        ggml_backend_dev_t dev;
        buft_list_t * buft_list;
    };

    layer_dev dev_input = {};
    layer_dev dev_output = {};
    std::vector<layer_dev> dev_layer;

    // true when the model params provide at least one tensor buffer-type override (set in the ctor)
    bool has_tensor_overrides;
};
470
471llama_model::llama_model(const llama_model_params & params) : params(params), pimpl(std::make_unique<impl>()) {
472 pimpl->has_tensor_overrides = params.tensor_buft_overrides && params.tensor_buft_overrides[0].pattern;
473}
474
475llama_model::~llama_model() {
476 for (auto * lora : loras) {
477 delete lora;
478 }
479}
480
481void llama_model::load_stats(llama_model_loader & ml) {
482 pimpl->n_elements = ml.n_elements;
483 pimpl->n_bytes = ml.n_bytes;
484}
485
486void llama_model::load_arch(llama_model_loader & ml) {
487 arch = ml.get_arch();
488 if (arch == LLM_ARCH_UNKNOWN) {
489 throw std::runtime_error("unknown model architecture: '" + ml.get_arch_name() + "'");
490 }
491}
492
493void llama_model::load_hparams(llama_model_loader & ml) {
494 const gguf_context * ctx = ml.meta.get();
495
496 // get metadata as string
497 for (int i = 0; i < gguf_get_n_kv(ctx); i++) {
498 gguf_type type = gguf_get_kv_type(ctx, i);
499 if (type == GGUF_TYPE_ARRAY) {
500 continue;
501 }
502 const char * name = gguf_get_key(ctx, i);
503 const std::string value = gguf_kv_to_str(ctx, i);
504 gguf_kv.emplace(name, value);
505 }
506
507 // get general kv
508 ml.get_key(LLM_KV_GENERAL_NAME, name, false);
509
510 // everything past this point is not vocab-related
511 // for CLIP models, we only need to load tensors, no hparams
512 if (hparams.vocab_only || ml.get_arch() == LLM_ARCH_CLIP) {
513 return;
514 }
515
516 ml.get_key(LLM_KV_CONTEXT_LENGTH, hparams.n_ctx_train);
517 ml.get_key(LLM_KV_EMBEDDING_LENGTH, hparams.n_embd);
518 ml.get_key(LLM_KV_EMBEDDING_LENGTH_OUT, hparams.n_embd_out_impl, false);
519 ml.get_key(LLM_KV_BLOCK_COUNT, hparams.n_layer);
520 ml.get_key(LLM_KV_EXPERT_COUNT, hparams.n_expert, false);
521 ml.get_key(LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used, false);
522 ml.get_key(LLM_KV_EXPERT_GROUP_COUNT, hparams.n_expert_groups, false);
523 ml.get_key(LLM_KV_EXPERT_GROUP_USED_COUNT, hparams.n_group_used, false);
524
525 if (arch == LLM_ARCH_WAVTOKENIZER_DEC) {
526 ml.get_key(LLM_KV_FEATURES_LENGTH, hparams.n_embd);
527 ml.get_key(LLM_KV_EMBEDDING_LENGTH, hparams.n_embd_out_impl);
528
529 ml.get_key(LLM_KV_POSNET_EMBEDDING_LENGTH, hparams.posnet.n_embd);
530 ml.get_key(LLM_KV_POSNET_BLOCK_COUNT, hparams.posnet.n_layer);
531
532 ml.get_key(LLM_KV_CONVNEXT_EMBEDDING_LENGTH, hparams.convnext.n_embd);
533 ml.get_key(LLM_KV_CONVNEXT_BLOCK_COUNT, hparams.convnext.n_layer);
534 }
535
536 GGML_ASSERT(hparams.n_expert <= LLAMA_MAX_EXPERTS);
537 GGML_ASSERT(hparams.n_expert_used <= hparams.n_expert);
538 if (hparams.n_expert > 0) {
539 GGML_ASSERT(hparams.n_expert_used > 0);
540 GGML_ASSERT(hparams.n_expert_groups < hparams.n_expert);
541 if (hparams.n_expert_groups > 1) {
542 GGML_ASSERT(hparams.n_expert % hparams.n_expert_groups == 0);
543 GGML_ASSERT(hparams.n_group_used > 0);
544 GGML_ASSERT(hparams.n_group_used < hparams.n_expert_groups);
545 }
546 } else {
547 GGML_ASSERT(hparams.n_expert_used == 0);
548 GGML_ASSERT(hparams.n_expert_groups == 0);
549 }
550
551 std::fill(hparams.n_head_arr.begin(), hparams.n_head_arr.end(), 0);
552 std::fill(hparams.n_head_kv_arr.begin(), hparams.n_head_kv_arr.end(), 0);
553 std::fill(hparams.n_ff_arr.begin(), hparams.n_ff_arr.end(), 0);
554 std::fill(
555 hparams.recurrent_layer_arr.begin(),
556 hparams.recurrent_layer_arr.end(),
557 llm_arch_is_recurrent(ml.get_arch()));
558
559 std::fill(hparams.rope_sections.begin(), hparams.rope_sections.end(), 0);
560 std::fill(hparams.swa_layers.begin(), hparams.swa_layers.end(), 0);
561
562 std::fill(hparams.xielu_alpha_n.begin(), hparams.xielu_alpha_n.end(), 0.0f);
563 std::fill(hparams.xielu_alpha_p.begin(), hparams.xielu_alpha_p.end(), 0.0f);
564 std::fill(hparams.xielu_beta.begin(), hparams.xielu_beta.end(), 0.0f);
565 std::fill(hparams.xielu_eps.begin(), hparams.xielu_eps.end(), 0.0f);
566 std::fill(hparams.swiglu_clamp_exp.begin(), hparams.swiglu_clamp_exp.end(), 0.0f);
567 std::fill(hparams.swiglu_clamp_shexp.begin(), hparams.swiglu_clamp_shexp.end(), 0.0f);
568
569 ml.get_key_or_arr(LLM_KV_FEED_FORWARD_LENGTH, hparams.n_ff_arr, hparams.n_layer, false);
570 ml.get_key_or_arr(LLM_KV_ATTENTION_HEAD_COUNT, hparams.n_head_arr, hparams.n_layer, false);
571
572 // n_head_kv is optional, default to n_head
573 hparams.n_head_kv_arr = hparams.n_head_arr;
574
575 ml.get_key_or_arr(LLM_KV_ATTENTION_HEAD_COUNT_KV, hparams.n_head_kv_arr, hparams.n_layer, false);
576
577 bool rope_finetuned = false;
578 ml.get_key(LLM_KV_ROPE_SCALING_FINETUNED, rope_finetuned, false);
579 hparams.rope_finetuned = rope_finetuned;
580
581 hparams.n_ctx_orig_yarn = hparams.n_ctx_train;
582 ml.get_key(LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, hparams.n_ctx_orig_yarn, false);
583
584 // rope_freq_base (optional)
585 hparams.rope_freq_base_train = 10000.0f;
586 ml.get_key(LLM_KV_ROPE_FREQ_BASE, hparams.rope_freq_base_train, false);
587
588 std::string rope_scaling("linear");
589 ml.get_key(LLM_KV_ROPE_SCALING_TYPE, rope_scaling, false);
590 hparams.rope_scaling_type_train = llama_rope_scaling_type_from_string(rope_scaling);
591 GGML_ASSERT(hparams.rope_scaling_type_train != LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED);
592
593 // TODO: Handle SWA metadata similarly when models start implementing it
594 // rope_freq_scale (inverse of the kv) is optional
595 float ropescale = 0.0f;
596 if (!ml.get_key(LLM_KV_ROPE_SCALING_FACTOR, ropescale, false)) {
597 // try the old key name
598 ml.get_key(LLM_KV_ROPE_SCALE_LINEAR, ropescale, false);
599 }
600 hparams.rope_freq_scale_train = ropescale == 0.0f ? 1.0f : 1.0f/ropescale;
601
602 ml.get_key(LLM_KV_ROPE_SCALING_ATTN_FACTOR, hparams.rope_attn_factor, false);
603
604 // non-transformer models do not have attention heads
605 if (hparams.n_head() > 0) {
606 // gpt-neox n_rot = rotary_pct * (n_embd / n_head)
607 // gpt-j n_rot = rotary_dim
608
609 hparams.n_embd_head_k = hparams.n_embd / hparams.n_head();
610 ml.get_key(LLM_KV_ATTENTION_KEY_LENGTH, hparams.n_embd_head_k, false);
611
612 hparams.n_embd_head_v = hparams.n_embd / hparams.n_head();
613 ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH, hparams.n_embd_head_v, false);
614
615 // sanity check for n_rot (optional)
616 hparams.n_rot = hparams.n_embd_head_k;
617
618 ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot, false);
619
620 if (arch == LLM_ARCH_LLAMA || arch == LLM_ARCH_DECI || arch == LLM_ARCH_FALCON || arch == LLM_ARCH_LLAMA_EMBED) {
621 if (hparams.n_rot != hparams.n_embd_head_k) {
622 throw std::runtime_error(format("invalid n_rot: %u, expected %u", hparams.n_rot, hparams.n_embd_head_k));
623 }
624 }
625 } else {
626 hparams.n_rot = 0;
627 hparams.n_embd_head_k = 0;
628 hparams.n_embd_head_v = 0;
629 }
630
631 // for differentiating model types
632 uint32_t n_vocab = 0;
633 ml.get_key(LLM_KV_VOCAB_SIZE, n_vocab, false) || ml.get_arr_n(LLM_KV_TOKENIZER_LIST, n_vocab, false);
634
635 // for classifier models
636 ml.get_arr(LLM_KV_CLASSIFIER_OUTPUT_LABELS, classifier_labels, false);
637 if (!classifier_labels.empty()) {
638 hparams.n_cls_out = classifier_labels.size();
639 }
640
641 // arch-specific KVs
642 switch (arch) {
643 case LLM_ARCH_LLAMA:
644 case LLM_ARCH_LLAMA_EMBED:
645 {
646 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
647
648 if (hparams.n_expert == 8) {
649 switch (hparams.n_layer) {
650 case 32: type = LLM_TYPE_8x7B; break;
651 case 56: type = LLM_TYPE_8x22B; break;
652 default: type = LLM_TYPE_UNKNOWN;
653 }
654 } else {
655 switch (hparams.n_layer) {
656 case 16: type = LLM_TYPE_1B; break; // Llama 3.2 1B
657 case 22: type = LLM_TYPE_1B; break;
658 case 26: type = LLM_TYPE_3B; break;
659 case 28: type = LLM_TYPE_3B; break; // Llama 3.2 3B
660 case 30: type = LLM_TYPE_256M; break; // smoldocling 256M
661 // granite uses a vocab with len 49152
662 case 32: type = n_vocab == 49152 ? LLM_TYPE_3B : (n_vocab < 40000 ? LLM_TYPE_7B : LLM_TYPE_8B); break;
663 case 36: type = LLM_TYPE_8B; break; // granite
664 case 40: type = LLM_TYPE_13B; break;
665 case 48: type = LLM_TYPE_34B; break;
666 case 60: type = LLM_TYPE_30B; break;
667 case 80: type = hparams.n_head() == hparams.n_head_kv() ? LLM_TYPE_65B : LLM_TYPE_70B; break;
668 default: type = LLM_TYPE_UNKNOWN;
669 }
670 }
671 } break;
672 case LLM_ARCH_LLAMA4:
673 {
674 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
675 ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
676 ml.get_key(LLM_KV_INTERLEAVE_MOE_LAYER_STEP, hparams.n_moe_layer_step);
677
678 const bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
679 if (found_swa && hparams.n_swa == 0) {
680 hparams.swa_type = LLAMA_SWA_TYPE_NONE;
681 hparams.n_no_rope_layer_step = hparams.n_layer; // always use rope
682 } else {
683 hparams.swa_type = LLAMA_SWA_TYPE_CHUNKED;
684 hparams.n_swa = 8192;
685 hparams.n_attn_temp_floor_scale = 8192;
686 hparams.f_attn_temp_scale = 0.1f;
687 hparams.f_attn_temp_offset = 1.0f;
688 hparams.set_swa_pattern(4); // pattern: 3 chunked - 1 full
689
690 hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
691 hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
692 ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
693 }
694
695 switch (hparams.n_expert) {
696 case 0: {
697 // MobileLLM (no MoE)
698 switch (hparams.n_embd) {
699 case 2048: type = LLM_TYPE_140M; break;
700 case 4096: type = LLM_TYPE_360M; break;
701 case 6144: type = LLM_TYPE_950M; break;
702 default: type = LLM_TYPE_UNKNOWN;
703 }
704 } break;
705 case 16: type = LLM_TYPE_17B_16E; break;
706 case 128: type = LLM_TYPE_17B_128E; break;
707 default: type = LLM_TYPE_UNKNOWN;
708 }
709
710 hparams.use_kq_norm = type != LLM_TYPE_17B_128E;
711 } break;
712 case LLM_ARCH_ARCEE:
713 {
714 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
715
716 // Arcee uses the same structure as Llama
717 switch (hparams.n_layer) {
718 case 36: type = LLM_TYPE_4B; break;
719 default: type = LLM_TYPE_UNKNOWN;
720 }
721 } break;
722 case LLM_ARCH_AFMOE:
723 {
724 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
725 ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
726 ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
727 ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
728 ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func, false);
729 ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale, false);
730 ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false);
731 ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
732
733 // Set up interleaved sliding window attention (ISWA)
734 // Pattern: 3 sliding - 1 full (global_attn_every_n_layers = 4)
735 if (hparams.n_swa > 0) {
736 hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
737 hparams.set_swa_pattern(4);
738
739 hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
740 hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
741 ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
742 } else {
743 hparams.swa_type = LLAMA_SWA_TYPE_NONE;
744 }
745
746 // Default to sigmoid if not set
747 if (hparams.expert_gating_func == LLAMA_EXPERT_GATING_FUNC_TYPE_NONE) {
748 hparams.expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID;
749 }
750
751 switch (hparams.n_layer) {
752 case 56: type = LLM_TYPE_6B; break;
753 case 32: type = LLM_TYPE_26B; break;
754 default: type = LLM_TYPE_UNKNOWN;
755 }
756 } break;
757 case LLM_ARCH_DECI:
758 {
759 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
760 switch (hparams.n_layer) {
761 case 32: type = LLM_TYPE_7B; break;
762 case 80: type = LLM_TYPE_70B; break;
763 case 162: type = LLM_TYPE_405B; break;
764 default: type = LLM_TYPE_UNKNOWN;
765 }
766 } break;
767 case LLM_ARCH_MINICPM:
768 {
769 // Backward-compatible defaults for older MiniCPM GGUFs
770 hparams.f_embedding_scale = 12.0f;
771 hparams.f_residual_scale = 1.4f / sqrtf(float(hparams.n_layer));
772 hparams.f_logit_scale = hparams.n_embd ? (256.0f / float(hparams.n_embd)) : 1.0f;
773
774 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
775
776 // Optional KV reads, override defaults if present in newer GGUF exports
777 ml.get_key(LLM_KV_EMBEDDING_SCALE, hparams.f_embedding_scale, /*required=*/false);
778 ml.get_key(LLM_KV_RESIDUAL_SCALE, hparams.f_residual_scale, /*required=*/false);
779 ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale, /*required=*/false);
780
781 // MiniCPM uses rope by default, unlike Granite which uses it as a switch
782 hparams.rope_finetuned = true;
783
784 switch (hparams.n_layer) {
785 case 52: type = LLM_TYPE_1B; break;
786 case 40: type = LLM_TYPE_2B; break;
787 default: type = LLM_TYPE_UNKNOWN;
788 }
789 } break;
790 case LLM_ARCH_MINICPM3:
791 {
792 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
793 ml.get_key(LLM_KV_ATTENTION_Q_LORA_RANK, hparams.n_lora_q);
794 ml.get_key(LLM_KV_ATTENTION_KV_LORA_RANK, hparams.n_lora_kv);
795
796 switch (hparams.n_layer) {
797 case 62: type = LLM_TYPE_4B; break;
798 default: type = LLM_TYPE_UNKNOWN;
799 }
800 } break;
801 case LLM_ARCH_GROK:
802 {
803 // defaults for old GGUFs
804 hparams.yarn_beta_fast = 8.0f;
805 hparams.f_logit_scale = 0.5773502691896257f;
806 hparams.f_embedding_scale = 78.38367176906169f;
807 hparams.f_attn_out_scale = 0.08838834764831845f;
808 hparams.f_attn_logit_softcapping = 30.0f;
809 hparams.f_router_logit_softcapping = 30.0f;
810 // no final_logit_softcapping in grok-1
811 hparams.f_final_logit_softcapping = 0.0f;
812
813 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
814 ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false);
815 ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale, false);
816 ml.get_key(LLM_KV_EMBEDDING_SCALE, hparams.f_embedding_scale, false);
817 ml.get_key(LLM_KV_ATTENTION_OUTPUT_SCALE, hparams.f_attn_out_scale, false);
818 ml.get_key(LLM_KV_ATTN_LOGIT_SOFTCAPPING, hparams.f_attn_logit_softcapping, false);
819 ml.get_key(LLM_KV_ROUTER_LOGIT_SOFTCAPPING, hparams.f_router_logit_softcapping, false);
820 ml.get_key(LLM_KV_FINAL_LOGIT_SOFTCAPPING, hparams.f_final_logit_softcapping, false);
821
822 ml.get_key(LLM_KV_ATTENTION_TEMPERATURE_LENGTH, hparams.attn_temp_length, false);
823 ml.get_key(LLM_KV_ROPE_SCALING_YARN_EXT_FACTOR, hparams.yarn_ext_factor, false);
824 ml.get_key(LLM_KV_ROPE_SCALING_YARN_ATTN_FACTOR, hparams.yarn_attn_factor, false);
825 ml.get_key(LLM_KV_ROPE_SCALING_YARN_BETA_FAST, hparams.yarn_beta_fast, false);
826 ml.get_key(LLM_KV_ROPE_SCALING_YARN_BETA_SLOW, hparams.yarn_beta_slow, false);
827
828 switch (hparams.n_layer) {
829 case 64: type = LLM_TYPE_314B; break;
830 default: type = LLM_TYPE_UNKNOWN;
831 }
832 } break;
833 case LLM_ARCH_FALCON:
834 {
835 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
836
837 switch (hparams.n_layer) {
838 case 32: type = LLM_TYPE_7B; break;
839 case 60: type = LLM_TYPE_40B; break;
840 default: type = LLM_TYPE_UNKNOWN;
841 }
842 } break;
843 case LLM_ARCH_BAICHUAN:
844 {
845 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
846 switch (hparams.n_layer) {
847 case 32: type = LLM_TYPE_7B; break;
848 case 40: type = LLM_TYPE_13B; break;
849 default: type = LLM_TYPE_UNKNOWN;
850 }
851
852 if (type == LLM_TYPE_13B) {
853 // TODO: become GGUF KV parameter
854 hparams.f_max_alibi_bias = 8.0f;
855 }
856 } break;
857 case LLM_ARCH_STARCODER:
858 {
859 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
860 switch (hparams.n_layer) {
861 case 24: type = LLM_TYPE_1B; break;
862 case 36: type = LLM_TYPE_3B; break;
863 case 42: type = LLM_TYPE_7B; break;
864 case 40: type = LLM_TYPE_15B; break;
865 default: type = LLM_TYPE_UNKNOWN;
866 }
867 } break;
868 case LLM_ARCH_REFACT:
869 {
870 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
871 switch (hparams.n_layer) {
872 case 32: type = LLM_TYPE_1B; break;
873 default: type = LLM_TYPE_UNKNOWN;
874 }
875
876 // TODO: become GGUF KV parameter
877 hparams.f_max_alibi_bias = 8.0f;
878 } break;
879 case LLM_ARCH_BERT:
880 {
881 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
882 ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
883 ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
884
885 switch (hparams.n_layer) {
886 case 3:
887 type = LLM_TYPE_17M; break; // bge-micro
888 case 6:
889 type = LLM_TYPE_22M; break; // MiniLM-L6
890 case 12:
891 switch (hparams.n_embd) {
892 case 384: type = LLM_TYPE_33M; break; // MiniLM-L12, bge-small
893 case 768: type = LLM_TYPE_109M; break; // bge-base
894 default: type = LLM_TYPE_UNKNOWN;
895 } break;
896 case 24:
897 type = LLM_TYPE_335M; break; // bge-large
898 default: type = LLM_TYPE_UNKNOWN;
899 }
900 } break;
901 case LLM_ARCH_MODERN_BERT:
902 {
903 const bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
904 if (found_swa && hparams.n_swa > 0) {
905 uint32_t swa_period = 3;
906 hparams.swa_type = LLAMA_SWA_TYPE_SYMMETRIC;
907
908 ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa);
909 ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, swa_period, false);
910 hparams.set_swa_pattern(swa_period);
911 } else {
912 hparams.swa_type = LLAMA_SWA_TYPE_NONE;
913 }
914
915 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
916 ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
917 ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
918
919 switch (hparams.n_layer) {
920 case 12:
921 type = LLM_TYPE_47M; break; // granite-embedding-small
922 case 22:
923 type = LLM_TYPE_149M; break; // modern-bert-base
924 case 28:
925 type = LLM_TYPE_395M; break; // modern-bert-large
926 default: type = LLM_TYPE_UNKNOWN;
927 }
928 } break;
929 case LLM_ARCH_JINA_BERT_V2:
930 {
931 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
932 ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
933 ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
934 hparams.f_max_alibi_bias = 8.0f;
935
936 switch (hparams.n_layer) {
937 case 4: type = LLM_TYPE_33M; break; // jina-embeddings-small
938 case 12: type = LLM_TYPE_137M; break; // jina-embeddings-base
939 default: type = LLM_TYPE_UNKNOWN;
940 }
941 } break;
942 case LLM_ARCH_JINA_BERT_V3:
943 {
944 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
945 ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
946 ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
947
948 switch (hparams.n_layer) {
949 case 24:
950 type = LLM_TYPE_558M; break;
951 default: type = LLM_TYPE_UNKNOWN;
952 }
953 } break;
954 case LLM_ARCH_NOMIC_BERT:
955 case LLM_ARCH_NOMIC_BERT_MOE:
956 {
957 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
958 ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
959 ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);
960 ml.get_key(LLM_KV_MOE_EVERY_N_LAYERS, hparams.moe_every_n_layers, 0);
961
962 if (hparams.n_layer == 12 && hparams.n_embd == 768) {
963 if (arch == LLM_ARCH_NOMIC_BERT) {
964 type = LLM_TYPE_137M;
965 } else if (arch == LLM_ARCH_NOMIC_BERT_MOE && hparams.moe_every_n_layers == 2) {
966 type = LLM_TYPE_475M;
967 }
968 }
969 } break;
970 case LLM_ARCH_NEO_BERT:
971 {
972 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
973 ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
974 ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);
975
976 if (hparams.n_layer == 28) {
977 type = LLM_TYPE_250M;
978 }
979 } break;
980 case LLM_ARCH_BLOOM:
981 {
982 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
983
984 switch (hparams.n_layer) {
985 case 24: type = LLM_TYPE_1B; break;
986 case 30:
987 switch (hparams.n_embd) {
988 case 2560: type = LLM_TYPE_3B; break;
989 case 4096: type = LLM_TYPE_7B; break;
990 default: type = LLM_TYPE_UNKNOWN;
991 } break;
992 default: type = LLM_TYPE_UNKNOWN;
993 }
994
995 // TODO: become GGUF KV parameter
996 hparams.f_max_alibi_bias = 8.0f;
997 } break;
998 case LLM_ARCH_MPT:
999 {
1000 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
1001 ml.get_key(LLM_KV_ATTENTION_CLAMP_KQV, hparams.f_clamp_kqv, false);
1002 ml.get_key(LLM_KV_ATTENTION_MAX_ALIBI_BIAS, hparams.f_max_alibi_bias);
1003
1004 switch (hparams.n_layer) {
1005 case 32: type = LLM_TYPE_7B; break;
1006 case 48: type = LLM_TYPE_30B; break;
1007 default: type = LLM_TYPE_UNKNOWN;
1008 }
1009 } break;
1010 case LLM_ARCH_STABLELM:
1011 {
1012 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
1013
1014 switch (hparams.n_layer) {
1015 case 24: type = LLM_TYPE_1B; break;
1016 case 32: type = LLM_TYPE_3B; break;
1017 case 40: type = LLM_TYPE_12B; break;
1018 default: type = LLM_TYPE_UNKNOWN;
1019 }
1020 } break;
1021 case LLM_ARCH_QWEN:
1022 {
1023 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1024
1025 switch (hparams.n_layer) {
1026 case 32: type = LLM_TYPE_7B; break;
1027 case 40: type = LLM_TYPE_13B; break;
1028 default: type = LLM_TYPE_UNKNOWN;
1029 }
1030 } break;
1031 case LLM_ARCH_QWEN2VL:
1032 {
1033 ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS, hparams.rope_sections, 4, true);
1034 }
1035 // fall through
1036 case LLM_ARCH_QWEN2:
1037 {
1038 ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
1039 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1040 switch (hparams.n_layer) {
1041 case 24: type = hparams.n_embd == 1024 ? LLM_TYPE_0_5B : LLM_TYPE_1B; break;
1042 case 28: type = hparams.n_embd == 1536 ? LLM_TYPE_1_5B : LLM_TYPE_7B; break;
1043 case 32: type = LLM_TYPE_7B; break;
1044 case 36: type = LLM_TYPE_3B; break;
1045 case 40: type = hparams.n_head() == 20 ? LLM_TYPE_4B : LLM_TYPE_13B; break;
1046 case 48: type = LLM_TYPE_14B; break;
1047 case 64: type = LLM_TYPE_32B; break;
1048 case 80: type = LLM_TYPE_70B; break;
1049 default: type = LLM_TYPE_UNKNOWN;
1050 }
1051 } break;
// NOTE(review): this span is the interior of a large per-architecture switch in the
// hparams loader; `ml`, `hparams`, `type` and `n_vocab` come from the enclosing
// function, which is outside this view.
// DREAM: diffusion LM — load RMS-norm epsilon and derive the size label from n_layer.
1052 case LLM_ARCH_DREAM:
1053 {
1054 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1055 // Dream models are primarily 7B with 28 layers
1056 switch (hparams.n_layer) {
1057 case 28:
1058 type = LLM_TYPE_7B;
1059 break;
1060 default:
1061 type = LLM_TYPE_UNKNOWN;
1062 }
1063 // Set non-causal attention for diffusion models
1064 hparams.causal_attn = false;
1065 }
1066 break;
// LLADA: diffusion LM — same pattern as DREAM, 32 layers -> 8B, non-causal attention.
1067 case LLM_ARCH_LLADA:
1068 {
1069 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1070 // LLaDA-8B has 32 layers, similar to LLaMA but for diffusion
1071 switch (hparams.n_layer) {
1072 case 32:
1073 type = LLM_TYPE_8B;
1074 break;
1075 default:
1076 type = LLM_TYPE_UNKNOWN;
1077 }
1078 // Set non-causal attention for diffusion models
1079 hparams.causal_attn = false;
1080 }
1081 break;
// LLADA_MOE: MoE diffusion LM — expert FFN size is optional (last arg false = not required).
1082 case LLM_ARCH_LLADA_MOE:
1083 {
1084 ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false);
1085
1086 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1087 // diffusion language model uses non-causal attention
1088 hparams.causal_attn = false;
1089 switch (hparams.n_layer) {
1090 case 16: type = LLM_TYPE_A1_7B; break;
1091 default: type = LLM_TYPE_UNKNOWN;
1092 }
1093 } break;
// RND1: MoE diffusion LM — 48 layers maps to the 30B-A3B size label.
1094 case LLM_ARCH_RND1:
1095 {
1096 ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false);
1097
1098 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1099 switch (hparams.n_layer) {
1100 case 48: type = LLM_TYPE_30B_A3B; break;
1101 default: type = LLM_TYPE_UNKNOWN;
1102 }
1103 // Set non-causal attention for diffusion models
1104 hparams.causal_attn = false;
1105 } break;
// QWEN2MOE: optional per-expert and shared-expert FFN sizes, then size from n_layer.
1106 case LLM_ARCH_QWEN2MOE:
1107 {
1108 ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false);
1109 ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, false);
1110
1111 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1112 switch (hparams.n_layer) {
1113 case 24: type = LLM_TYPE_A2_7B; break;
1114 case 28: type = LLM_TYPE_57B_A14B; break;
1115 default: type = LLM_TYPE_UNKNOWN;
1116 }
1117 } break;
// QWEN3: pooling type is optional (set for embedding/reranker conversions);
// ambiguous layer counts are disambiguated by n_embd.
1118 case LLM_ARCH_QWEN3:
1119 {
1120 ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
1121 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1122 switch (hparams.n_layer) {
1123 case 28: type = hparams.n_embd == 1024 ? LLM_TYPE_0_6B : LLM_TYPE_1_7B; break;
1124 case 36: type = hparams.n_embd == 2560 ? LLM_TYPE_4B : LLM_TYPE_8B; break;
1125 case 40: type = LLM_TYPE_14B; break;
1126 case 64: type = LLM_TYPE_32B; break;
1127 default: type = LLM_TYPE_UNKNOWN;
1128 }
1129 } break;
// MAINCODER: single known variant (32 layers -> 1B).
1130 case LLM_ARCH_MAINCODER:
1131 {
1132 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1133 switch (hparams.n_layer) {
1134 case 32: type = LLM_TYPE_1B; break;
1135 default: type = LLM_TYPE_UNKNOWN;
1136 }
1137 } break;
// QWEN3VL: vision-language variant — also reads deepstack layer count (optional)
// and the 4 M-RoPE dimension sections (required, last arg true).
1138 case LLM_ARCH_QWEN3VL:
1139 {
1140 ml.get_key(LLM_KV_NUM_DEEPSTACK_LAYERS, hparams.n_deepstack_layers, false);
1141 ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS, hparams.rope_sections, 4, true);
1142 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1143 switch (hparams.n_layer) {
1144 case 28: type = LLM_TYPE_1_7B; break;
1145 case 36: type = hparams.n_embd == 2560 ? LLM_TYPE_4B : LLM_TYPE_8B; break;
1146 case 64: type = LLM_TYPE_32B; break;
1147 default: type = LLM_TYPE_UNKNOWN;
1148 }
1149 } break;
// QWEN3MOE: expert FFN size optional; 48 / 94 layers -> 30B-A3B / 235B-A22B.
1150 case LLM_ARCH_QWEN3MOE:
1151 {
1152 ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false);
1153
1154 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1155 switch (hparams.n_layer) {
1156 case 48: type = LLM_TYPE_30B_A3B; break;
1157 case 94: type = LLM_TYPE_235B_A22B; break;
1158 default: type = LLM_TYPE_UNKNOWN;
1159 }
1160 } break;
// QWEN3VLMOE: QWEN3MOE plus the vision-language keys (deepstack + M-RoPE sections).
1161 case LLM_ARCH_QWEN3VLMOE:
1162 {
1163 ml.get_key(LLM_KV_NUM_DEEPSTACK_LAYERS, hparams.n_deepstack_layers, false);
1164 ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS, hparams.rope_sections, 4, true);
1165 ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false);
1166 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1167 switch (hparams.n_layer) {
1168 case 48: type = LLM_TYPE_30B_A3B; break;
1169 case 94: type = LLM_TYPE_235B_A22B; break;
1170 default: type = LLM_TYPE_UNKNOWN;
1171 }
1172 } break;
// PHI2: uses plain LayerNorm epsilon (not RMS), unlike most archs here.
1173 case LLM_ARCH_PHI2:
1174 {
1175 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
1176
1177 switch (hparams.n_layer) {
1178 case 24: type = LLM_TYPE_1B; break;
1179 case 32: type = LLM_TYPE_3B; break;
1180 default: type = LLM_TYPE_UNKNOWN;
1181 }
1182 } break;
// PHI3: sliding-window attention is deliberately disabled even when the GGUF
// declares a window — see the linked PR for the rationale.
1183 case LLM_ARCH_PHI3:
1184 {
1185 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1186
1187 switch (hparams.n_layer) {
1188 case 24: type = LLM_TYPE_1B; break;
1189 case 32: type = LLM_TYPE_3B; break;
1190 case 40: type = LLM_TYPE_14B; break;
1191 default: type = LLM_TYPE_UNKNOWN;
1192 }
1193
1194 const bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
1195
1196 if (found_swa && hparams.n_swa > 0) {
1197 LLAMA_LOG_WARN("%s: Phi SWA is currently disabled - results might be suboptimal for some models (see %s)\n",
1198 __func__, "https://github.com/ggml-org/llama.cpp/pull/13676");
1199
1200 // TODO: fix conversion scripts to correctly populate `n_swa` and `n_swa_pattern`
1201 hparams.swa_type = LLAMA_SWA_TYPE_NONE;
1202
1203 hparams.n_swa = 0;
1204 hparams.set_swa_pattern(1);
1205 }
1206 } break;
// PHIMOE: single known variant (32 layers -> 16x3.8B).
1207 case LLM_ARCH_PHIMOE:
1208 {
1209 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1210
1211 switch (hparams.n_layer) {
1212 case 32: type = LLM_TYPE_16x3_8B; break;
1213 default: type = LLM_TYPE_UNKNOWN;
1214 }
1215 } break;
// PLAMO: single known variant (40 layers -> 13B).
1216 case LLM_ARCH_PLAMO:
1217 {
1218 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1219
1220 switch (hparams.n_layer) {
1221 case 40: type = LLM_TYPE_13B; break;
1222 default: type = LLM_TYPE_UNKNOWN;
1223 }
1224 } break;
// PLAMO2: hybrid Mamba/attention model — SSM keys are required, and a layer is
// marked recurrent exactly when its KV-head count is 0.
1225 case LLM_ARCH_PLAMO2:
1226 {
1227 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1228
1229 // Load Mamba SSM parameters
1230 ml.get_key(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv);
1231 ml.get_key(LLM_KV_SSM_INNER_SIZE, hparams.ssm_d_inner);
1232 ml.get_key(LLM_KV_SSM_STATE_SIZE, hparams.ssm_d_state);
1233 ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank);
1234 ml.get_key(LLM_KV_SSM_GROUP_COUNT, hparams.ssm_n_group);
1235
1236 for (uint32_t i = 0; i < hparams.n_layer; ++i) {
1237 hparams.recurrent_layer_arr[i] = hparams.n_head_kv(i) == 0;
1238 }
1239
1240 switch (hparams.n_layer) {
1241 case 16: type = LLM_TYPE_1B; break;
1242 case 32:
// 32 layers is ambiguous: n_embd decides between the 2B and 8B variants
// (any other n_embd leaves `type` at whatever it was before this case).
1243 if (hparams.n_embd == 2048) {
1244 type = LLM_TYPE_2B;
1245 } else if (hparams.n_embd == 4096) {
1246 type = LLM_TYPE_8B;
1247 }
1248 break;
1249 default: type = LLM_TYPE_UNKNOWN;
1250 }
1251
1252 // Load attention parameters
1253 ml.get_key(LLM_KV_ATTENTION_KEY_LENGTH, hparams.n_embd_head_k, false);
1254 ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH, hparams.n_embd_head_v, false);
1255 } break;
// PLAMO3: optional SWA; the pattern period defaults to 8 when the GGUF does not
// override it, and a dedicated SWA RoPE base frequency is required in that path.
1256 case LLM_ARCH_PLAMO3:
1257 {
1258 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1259 const bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
1260 if (found_swa && hparams.n_swa > 0) {
1261 uint32_t swa_period = 8;
1262 hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
1263 ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa);
1264 ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, swa_period, false);
1265 hparams.set_swa_pattern(swa_period);
1266 } else {
1267 hparams.swa_type = LLAMA_SWA_TYPE_NONE;
1268 }
1269
1270 switch (hparams.n_layer) {
1271 case 24: type = LLM_TYPE_2B; break;
1272 default: type = LLM_TYPE_UNKNOWN;
1273 }
1274 } break;
// GPT2: classic size ladder by layer count (small/medium/large/XL).
1275 case LLM_ARCH_GPT2:
1276 {
1277 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
1278 switch (hparams.n_layer) {
1279 case 12: type = LLM_TYPE_SMALL; break;
1280 case 24: type = LLM_TYPE_MEDIUM; break;
1281 case 36: type = LLM_TYPE_LARGE; break;
1282 case 48: type = LLM_TYPE_XL; break;
1283 default: type = LLM_TYPE_UNKNOWN;
1284 }
1285 } break;
// CODESHELL: single known variant (42 layers -> 7B).
1286 case LLM_ARCH_CODESHELL:
1287 {
1288 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
1289 switch (hparams.n_layer) {
1290 case 42: type = LLM_TYPE_7B; break;
1291 default: type = LLM_TYPE_UNKNOWN;
1292 }
1293 } break;
// ORION: single known variant (40 layers -> 14B).
1294 case LLM_ARCH_ORION:
1295 {
1296 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
1297
1298 switch (hparams.n_layer) {
1299 case 40: type = LLM_TYPE_14B; break;
1300 default: type = LLM_TYPE_UNKNOWN;
1301 }
1302 } break;
// INTERNLM2: 32 / 48 layers -> 7B / 20B.
1303 case LLM_ARCH_INTERNLM2:
1304 {
1305 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1306 switch (hparams.n_layer) {
1307 case 32: type = LLM_TYPE_7B; break;
1308 case 48: type = LLM_TYPE_20B; break;
1309 default: type = LLM_TYPE_UNKNOWN;
1310 }
1311 } break;
// GEMMA (v1): 18 / 28 layers -> 2B / 7B.
1312 case LLM_ARCH_GEMMA:
1313 {
1314 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1315
1316 switch (hparams.n_layer) {
1317 case 18: type = LLM_TYPE_2B; break;
1318 case 28: type = LLM_TYPE_7B; break;
1319 default: type = LLM_TYPE_UNKNOWN;
1320 }
1321 } break;
// GEMMA2: SWA is always on (alternating pattern of 2) with attention soft-capping;
// the SWA RoPE parameters default to the non-SWA training values unless overridden.
1322 case LLM_ARCH_GEMMA2:
1323 {
1324 hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
1325 hparams.n_swa = 4096; // default value of gemma 2
1326 hparams.set_swa_pattern(2);
1327 hparams.attn_soft_cap = true;
1328 hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
1329 hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
1330
1331 ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
1332 ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
1333 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1334 ml.get_key(LLM_KV_ATTN_LOGIT_SOFTCAPPING, hparams.f_attn_logit_softcapping, false);
1335 ml.get_key(LLM_KV_FINAL_LOGIT_SOFTCAPPING, hparams.f_final_logit_softcapping, false);
1336
1337 switch (hparams.n_layer) {
1338 case 26: type = LLM_TYPE_2B; break;
1339 case 42: type = LLM_TYPE_9B; break;
1340 case 46: type = LLM_TYPE_27B; break;
1341 default: type = LLM_TYPE_UNKNOWN;
1342 }
1343
1344 // ref: https://github.com/google/gemma_pytorch/blob/014acb7ac4563a5f77c76d7ff98f31b568c16508/gemma/config.py#L173
// 27B scales attention by head dim derived from n_embd/n_head; all other
// sizes use the per-head key dimension (matches upstream gemma_pytorch).
1345 hparams.f_attention_scale = type == LLM_TYPE_27B
1346 ? 1.0f / std::sqrt(float(hparams.n_embd / hparams.n_head(0)))
1347 : 1.0f / std::sqrt(float(hparams.n_embd_head_k));
1348 } break;
// GEMMA3: SWA only if the GGUF declares a window (pattern of 6); final-logit
// soft-capping is reset to 0 and only applied if the key is present.
1349 case LLM_ARCH_GEMMA3:
1350 {
1351 const bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
1352 if (found_swa && hparams.n_swa > 0) {
1353 hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
1354 hparams.set_swa_pattern(6);
1355
1356 ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
1357 } else {
1358 hparams.swa_type = LLAMA_SWA_TYPE_NONE;
1359 }
1360
1361 hparams.f_final_logit_softcapping = 0.0f;
1362 ml.get_key(LLM_KV_FINAL_LOGIT_SOFTCAPPING, hparams.f_final_logit_softcapping, false);
1363 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1364
1365 switch (hparams.n_layer) {
1366 case 18: type = LLM_TYPE_270M; break;
1367 case 26: type = LLM_TYPE_1B; break;
1368 case 32: type = LLM_TYPE_8B; break; // Rnj-1
1369 case 34: type = LLM_TYPE_4B; break;
1370 case 48: type = LLM_TYPE_12B; break;
1371 case 62: type = LLM_TYPE_27B; break;
1372 default: type = LLM_TYPE_UNKNOWN;
1373 }
1374
1375 // ref: https://github.com/google/gemma_pytorch/blob/014acb7ac4563a5f77c76d7ff98f31b568c16508/gemma/config.py#L289
1376 hparams.f_attention_scale = type == LLM_TYPE_27B
1377 ? 1.0f / std::sqrt(float(hparams.n_embd / hparams.n_head(0)))
1378 : 1.0f / std::sqrt(float(hparams.n_embd_head_k));
1379 } break;
// GEMMA3N: fixed SWA pattern of 5; only the first 20 layers hold their own KV
// (n_layer_kv_from_start), and attention scale is fixed at 1.0.
1380 case LLM_ARCH_GEMMA3N:
1381 {
1382 hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
1383 hparams.set_swa_pattern(5);
1384
1385 hparams.n_layer_kv_from_start = 20;
1386 hparams.f_attention_scale = 1.0f;
1387
1388 ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
1389 ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
1390 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1391
1392 switch (hparams.n_layer) {
1393 case 30: type = LLM_TYPE_E2B; break;
1394 case 35: type = LLM_TYPE_E4B; break;
1395 default: type = LLM_TYPE_UNKNOWN;
1396 }
1397 } break;
// GEMMA_EMBEDDING: embedding variant — symmetric SWA, non-causal attention,
// required pooling type, plus optional sentence-transformers dense projections.
1398 case LLM_ARCH_GEMMA_EMBEDDING:
1399 {
1400 hparams.swa_type = LLAMA_SWA_TYPE_SYMMETRIC;
1401 hparams.set_swa_pattern(6);
1402
1403 hparams.causal_attn = false; // embeddings do not use causal attention
1404
1405 ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
1406 ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
1407 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1408 ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);
1409
1410 // applied only if model converted with --sentence-transformers-dense-modules
1411 ml.get_key(LLM_KV_DENSE_2_FEAT_IN, hparams.dense_2_feat_in, false);
1412 ml.get_key(LLM_KV_DENSE_2_FEAT_OUT, hparams.dense_2_feat_out, false);
1413 ml.get_key(LLM_KV_DENSE_3_FEAT_IN, hparams.dense_3_feat_in, false);
1414 ml.get_key(LLM_KV_DENSE_3_FEAT_OUT, hparams.dense_3_feat_out, false);
1415
// 0 means the dense module is absent; otherwise it must match the model width.
1416 GGML_ASSERT((hparams.dense_2_feat_in == 0 || hparams.dense_2_feat_in == hparams.n_embd) && "dense_2_feat_in must be equal to n_embd");
1417 GGML_ASSERT((hparams.dense_3_feat_out == 0 || hparams.dense_3_feat_out == hparams.n_embd) && "dense_3_feat_out must be equal to n_embd");
1418
1419 switch (hparams.n_layer) {
1420 case 24: type = LLM_TYPE_0_3B; break;
1421 default: type = LLM_TYPE_UNKNOWN;
1422 }
1423 hparams.f_attention_scale = 1.0f / std::sqrt(float(hparams.n_embd_head_k));
1424
1425 } break;
// STARCODER2: size ladder by layer count; 52/88 are Granite conversions.
1426 case LLM_ARCH_STARCODER2:
1427 {
1428 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
1429 switch (hparams.n_layer) {
1430 case 30: type = LLM_TYPE_3B; break;
1431 case 32: type = LLM_TYPE_7B; break;
1432 case 40: type = LLM_TYPE_15B; break;
1433 case 52: type = LLM_TYPE_20B; break; // granite
1434 case 88: type = LLM_TYPE_34B; break; // granite
1435 default: type = LLM_TYPE_UNKNOWN;
1436 }
1437 } break;
// MAMBA (v1): pure SSM — size is keyed on (n_layer, n_embd) pairs.
1438 case LLM_ARCH_MAMBA:
1439 {
1440 ml.get_key(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv);
1441 ml.get_key(LLM_KV_SSM_INNER_SIZE, hparams.ssm_d_inner);
1442 ml.get_key(LLM_KV_SSM_STATE_SIZE, hparams.ssm_d_state);
1443 ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank);
1444 ml.get_key(LLM_KV_SSM_DT_B_C_RMS, hparams.ssm_dt_b_c_rms, false);
1445
1446 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1447
1448 switch (hparams.n_layer) {
1449 case 24:
1450 switch (hparams.n_embd) {
1451 case 768: type = LLM_TYPE_SMALL; break;
1452 default: type = LLM_TYPE_UNKNOWN;
1453 } break;
1454 case 48:
1455 switch (hparams.n_embd) {
1456 case 1024: type = LLM_TYPE_MEDIUM; break;
1457 case 1536: type = LLM_TYPE_LARGE; break;
1458 case 2048: type = LLM_TYPE_XL; break;
1459 default: type = LLM_TYPE_UNKNOWN;
1460 } break;
1461 case 64:
1462 switch (hparams.n_embd) {
1463 case 2560: type = LLM_TYPE_3B; break;
1464 default: type = LLM_TYPE_UNKNOWN;
1465 } break;
1466 default: type = LLM_TYPE_UNKNOWN;
1467 }
1468 } break;
// MAMBA2: like MAMBA but with a group count instead of dt_b_c_rms, and an
// extra (64, 4096) -> 7B size entry.
1469 case LLM_ARCH_MAMBA2:
1470 {
1471 ml.get_key(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv);
1472 ml.get_key(LLM_KV_SSM_INNER_SIZE, hparams.ssm_d_inner);
1473 ml.get_key(LLM_KV_SSM_STATE_SIZE, hparams.ssm_d_state);
1474 ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank);
1475 ml.get_key(LLM_KV_SSM_GROUP_COUNT, hparams.ssm_n_group);
1476
1477 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1478
1479 switch (hparams.n_layer) {
1480 case 24:
1481 switch (hparams.n_embd) {
1482 case 768: type = LLM_TYPE_SMALL; break;
1483 default: type = LLM_TYPE_UNKNOWN;
1484 } break;
1485 case 48:
1486 switch (hparams.n_embd) {
1487 case 1024: type = LLM_TYPE_MEDIUM; break;
1488 case 1536: type = LLM_TYPE_LARGE; break;
1489 case 2048: type = LLM_TYPE_XL; break;
1490 default: type = LLM_TYPE_UNKNOWN;
1491 } break;
1492 case 64:
1493 switch (hparams.n_embd) {
1494 case 2560: type = LLM_TYPE_3B; break;
1495 case 4096: type = LLM_TYPE_7B; break;
1496 default: type = LLM_TYPE_UNKNOWN;
1497 } break;
1498 default: type = LLM_TYPE_UNKNOWN;
1499 }
1500 } break;
// JAMBA: hybrid SSM/attention — recurrent layers are those with no KV heads;
// no size labels are assigned yet (all layer counts fall through to UNKNOWN).
1501 case LLM_ARCH_JAMBA:
1502 {
1503 ml.get_key(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv);
1504 ml.get_key(LLM_KV_SSM_INNER_SIZE, hparams.ssm_d_inner);
1505 ml.get_key(LLM_KV_SSM_STATE_SIZE, hparams.ssm_d_state);
1506 ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank);
1507
1508 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1509
1510 for (uint32_t i = 0; i < hparams.n_layer; ++i) {
1511 hparams.recurrent_layer_arr[i] = hparams.n_head_kv(i) == 0;
1512 }
1513
1514 switch (hparams.n_layer) {
1515 // TODO: Jamba layers are a bit heterogeneous, so naming this is hard.
1516 case 12: // 900M 8x???M
1517 case 32: // 51B 16x?B
1518 default: type = LLM_TYPE_UNKNOWN;
1519 }
1520 } break;
// XVERSE: 32 / 40 / 80 layers -> 7B / 13B / 65B.
1521 case LLM_ARCH_XVERSE:
1522 {
1523 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1524 switch (hparams.n_layer) {
1525 case 32: type = LLM_TYPE_7B; break;
1526 case 40: type = LLM_TYPE_13B; break;
1527 case 80: type = LLM_TYPE_65B; break;
1528 default: type = LLM_TYPE_UNKNOWN;
1529 }
1530 } break;
// COMMAND_R: requires a logit scale in addition to LayerNorm epsilon.
1531 case LLM_ARCH_COMMAND_R:
1532 {
1533 ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale);
1534 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
1535 switch (hparams.n_layer) {
1536 case 40: type = LLM_TYPE_35B; break;
1537 default: type = LLM_TYPE_UNKNOWN;
1538 }
1539 } break;
// COHERE2: always-on SWA (pattern of 4); SWA RoPE params default to the
// non-SWA training values, and the sliding window itself is required.
1540 case LLM_ARCH_COHERE2:
1541 {
1542 hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
1543 hparams.set_swa_pattern(4);
1544 hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
1545 hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
1546
1547 ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
1548 ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
1549 ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale);
1550 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
1551 switch (hparams.n_layer) {
1552 case 32: type = LLM_TYPE_8B; break;
1553 default: type = LLM_TYPE_UNKNOWN;
1554 }
1555 } break;
// DBRX: KQV clamp value is required for this arch.
1556 case LLM_ARCH_DBRX:
1557 {
1558 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
1559 ml.get_key(LLM_KV_ATTENTION_CLAMP_KQV, hparams.f_clamp_kqv);
1560
1561 switch (hparams.n_layer) {
1562 case 40: type = LLM_TYPE_16x12B; break;
1563 default: type = LLM_TYPE_UNKNOWN;
1564 }
1565 } break;
// OLMO (v1): KQV clamp is optional here, unlike DBRX.
1566 case LLM_ARCH_OLMO:
1567 {
1568 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
1569 ml.get_key(LLM_KV_ATTENTION_CLAMP_KQV, hparams.f_clamp_kqv, false);
1570
1571 switch (hparams.n_layer) {
1572 case 22: type = LLM_TYPE_1B; break;
1573 case 32: type = LLM_TYPE_7B; break;
1574 case 80: type = LLM_TYPE_70B; break;
1575 default: type = LLM_TYPE_UNKNOWN;
1576 }
1577 } break;
// OLMO2: SWA (pattern of 4) only when the GGUF declares a positive window;
// the SWA RoPE scale is pinned to 1.0 (see olmo2.cpp).
1578 case LLM_ARCH_OLMO2:
1579 {
1580 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1581
1582 const bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
1583 if (found_swa && hparams.n_swa > 0) {
1584 hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
1585 hparams.set_swa_pattern(4);
1586
1587 hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
1588 hparams.rope_freq_scale_train_swa = 1.0; // See olmo2.cpp
1589 ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
1590 } else {
1591 hparams.swa_type = LLAMA_SWA_TYPE_NONE;
1592 }
1593
1594 switch (hparams.n_layer) {
1595 case 16: type = LLM_TYPE_1B; break;
1596 case 32: type = LLM_TYPE_7B; break;
1597 case 40: type = LLM_TYPE_13B; break;
1598 case 64: type = LLM_TYPE_32B; break;
1599 default: type = LLM_TYPE_UNKNOWN;
1600 }
1601 } break;
// SEED_OSS: single known variant (64 layers -> 36B).
1602 case LLM_ARCH_SEED_OSS:
1603 {
1604 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1605 switch (hparams.n_layer) {
1606 case 64: type = LLM_TYPE_36B; break;
1607 default: type = LLM_TYPE_UNKNOWN;
1608 }
1609 } break;
// OLMOE: single known variant (16 layers -> A1.7B).
1610 case LLM_ARCH_OLMOE:
1611 {
1612 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1613 switch (hparams.n_layer) {
1614 case 16: type = LLM_TYPE_A1_7B; break;
1615 default: type = LLM_TYPE_UNKNOWN;
1616 }
1617 } break;
// OPENELM: 16/20/28/36 layers -> 270M/450M/1B/3B.
1618 case LLM_ARCH_OPENELM:
1619 {
1620 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1621
1622 switch (hparams.n_layer) {
1623 case 16: type = LLM_TYPE_270M; break;
1624 case 20: type = LLM_TYPE_450M; break;
1625 case 28: type = LLM_TYPE_1B; break;
1626 case 36: type = LLM_TYPE_3B; break;
1627 default: type = LLM_TYPE_UNKNOWN;
1628 }
1629 } break;
// GPTNEOX: Pythia-family sizing — n_layer alone is ambiguous, so the FFN width
// (n_ff) disambiguates within each layer count.
1630 case LLM_ARCH_GPTNEOX:
1631 {
1632 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
1633 ml.get_key(LLM_KV_USE_PARALLEL_RESIDUAL, hparams.use_par_res);
1634 switch (hparams.n_layer) {
1635 case 6:
1636 switch (hparams.n_ff()) {
1637 case 512: type = LLM_TYPE_14M; break;
1638 case 2048: type = LLM_TYPE_70M; break;
1639 default: type = LLM_TYPE_UNKNOWN;
1640 } break;
1641 case 12:
1642 switch (hparams.n_ff()) {
1643 case 3072: type = LLM_TYPE_160M; break;
1644 default: type = LLM_TYPE_UNKNOWN;
1645 } break;
1646 case 16:
1647 switch (hparams.n_ff()) {
1648 case 8192: type = LLM_TYPE_1B; break;
1649 default: type = LLM_TYPE_UNKNOWN;
1650 } break;
1651 case 24:
1652 switch (hparams.n_ff()) {
1653 case 4096: type = LLM_TYPE_410M; break;
1654 case 8192: type = LLM_TYPE_1_4B; break;
1655 default: type = LLM_TYPE_UNKNOWN;
1656 } break;
1657 case 32:
1658 switch (hparams.n_ff()) {
1659 case 10240: type = LLM_TYPE_2_8B; break;
1660 case 16384: type = LLM_TYPE_6_9B; break;
1661 default: type = LLM_TYPE_UNKNOWN;
1662 } break;
1663 case 36:
1664 switch (hparams.n_ff()) {
1665 case 20480: type = LLM_TYPE_12B; break;
1666 default: type = LLM_TYPE_UNKNOWN;
1667 } break;
1668 case 44:
1669 switch (hparams.n_ff()) {
1670 case 24576: type = LLM_TYPE_20B; break;
1671 default: type = LLM_TYPE_UNKNOWN;
1672 } break;
1673 default: type = LLM_TYPE_UNKNOWN;
1674 }
1675 } break;
// ARCTIC: only the 128-expert configuration has a known size label.
1676 case LLM_ARCH_ARCTIC:
1677 {
1678 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1679
1680 if (hparams.n_expert == 128) {
1681 switch (hparams.n_layer) {
1682 case 35: type = LLM_TYPE_10B_128x3_66B; break;
1683 default: type = LLM_TYPE_UNKNOWN;
1684 }
1685 } else {
1686 type = LLM_TYPE_UNKNOWN;
1687 }
1688 } break;
// DEEPSEEK (v1 MoE): all MoE keys required; note the size label is derived from
// the per-expert FFN width, not from n_layer like most archs.
1689 case LLM_ARCH_DEEPSEEK:
1690 {
1691 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1692 ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
1693 ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
1694 ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
1695 ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
1696
1697 switch (hparams.n_ff_exp) {
1698 case 1408: type = LLM_TYPE_16B; break;
1699 case 1792: type = LLM_TYPE_20B; break;
1700 default: type = LLM_TYPE_UNKNOWN;
1701 }
1702 } break;
// DEEPSEEK2: MLA-based family — "lite" variants have no Q LoRA rank, the expert
// gating function is defaulted for older GGUFs, and the YaRN log multiplier is
// un-scaled to compensate for a factor baked in by the convert script.
1703 case LLM_ARCH_DEEPSEEK2:
1704 {
1705 // lite variants include DeepSeek-V2-Lite, GigaChat3-10B-A1.8B
1706 const bool is_lite = (hparams.n_layer == 27 || hparams.n_layer == 26);
1707
1708 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1709 ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
1710 if (!is_lite) {
1711 ml.get_key(LLM_KV_ATTENTION_Q_LORA_RANK, hparams.n_lora_q);
1712 }
1713 ml.get_key(LLM_KV_ATTENTION_KV_LORA_RANK, hparams.n_lora_kv);
1714 ml.get_key(LLM_KV_ATTENTION_KEY_LENGTH_MLA, hparams.n_embd_head_k_mla_impl, false);
1715 ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH_MLA, hparams.n_embd_head_v_mla_impl, false);
1716 ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
1717 ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
1718 ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale, false);
1719 ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false);
1720 ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func, false);
1721 if (hparams.expert_gating_func == LLAMA_EXPERT_GATING_FUNC_TYPE_NONE) {
1722 // for compatibility with existing DeepSeek V2 and V2.5 GGUFs
1723 // that have no expert_gating_func model parameter set
// GLM 4.7 Lite reuses this arch and is recognized by layer count + vocab size.
1724 if ((hparams.n_layer == 47 || hparams.n_layer == 48) && n_vocab == 154880) {
1725 // GLM 4.7 Lite
1726 hparams.expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID;
1727 } else {
1728 hparams.expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX;
1729 }
1730 }
1731
1732 if (ml.get_key(LLM_KV_ROPE_SCALING_YARN_LOG_MUL, hparams.rope_yarn_log_mul, 0.0f)) {
1733 // [TAG_DEEPSEEK2_YARN_LOG_MUL_FIX]
1734 // cancel the factor from the convert script
1735 hparams.rope_yarn_log_mul /= 0.1f;
1736 }
1737
1738 // (optional) temperature tuning - used by mistral-large
1739 ml.get_key(LLM_KV_ATTENTION_TEMPERATURE_SCALE, hparams.f_attn_temp_scale, false);
1740 ml.get_key(LLM_KV_ATTENTION_TEMPERATURE_LENGTH, hparams.n_attn_temp_floor_scale, false);
1741
1742 hparams.f_attn_temp_offset = 0.0f;
1743
1744 switch (hparams.n_layer) {
1745 case 27: type = LLM_TYPE_16B; break;
1746 case 47: type = LLM_TYPE_30B_A3B; break;
1747 case 60: type = LLM_TYPE_236B; break;
1748 case 61: type = LLM_TYPE_671B; break;
1749 default: type = LLM_TYPE_UNKNOWN;
1750 }
1751 } break;
// PLM: MLA-lite arch — only a KV LoRA rank, single known variant (32 -> 1.8B).
1752 case LLM_ARCH_PLM:
1753 {
1754 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1755 ml.get_key(LLM_KV_ATTENTION_KV_LORA_RANK, hparams.n_lora_kv);
1756 switch (hparams.n_layer) {
1757 case 32: type = LLM_TYPE_1_8B; break;
1758 default: type = LLM_TYPE_UNKNOWN;
1759 }
1760 } break;
// CHATGLM: layer count is ambiguous between sizes, so the head count of the
// first layer disambiguates (GLM-Edge 1.5B/4B vs ChatGLM 6B/9B).
1762 case LLM_ARCH_CHATGLM:
1763 {
1764 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1765 switch (hparams.n_layer) {
1766 case 28: {
1767 if (hparams.n_head(0) == 16) {
1768 type = LLM_TYPE_1_5B;
1769 } else {
1770 type = LLM_TYPE_6B;
1771 }
1772 } break;
1773 case 40: {
1774 if (hparams.n_head(0) == 24) {
1775 type = LLM_TYPE_4B;
1776 } else {
1777 type = LLM_TYPE_9B;
1778 }
1779 } break;
1780 default: type = LLM_TYPE_UNKNOWN;
1781 }
1782 } break;
// GLM4: optional M-RoPE dimension sections; 40 / 61 layers -> 9B / 32B.
1783 case LLM_ARCH_GLM4:
1784 {
1785 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1786 ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS, hparams.rope_sections, 4, false);
1787 switch (hparams.n_layer) {
1788 case 40: type = LLM_TYPE_9B; break;
1789 case 61: type = LLM_TYPE_32B; break;
1790 default: type = LLM_TYPE_UNKNOWN;
1791 }
1792 } break;
// GLM4_MOE: full MoE key set; sigmoid gating by default; trailing NextN/MTP
// layers are excluded from the KV cache via n_layer_kv_from_start.
1793 case LLM_ARCH_GLM4_MOE:
1794 {
1795 ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
1796 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1797 ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS, hparams.rope_sections, 4, false);
1798
1799 // MoE parameters
1800 ml.get_key(LLM_KV_EXPERT_COUNT, hparams.n_expert);
1801 ml.get_key(LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used);
1802 ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
1803 ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead, false);
1804 ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
1805 ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false);
1806
1807 // Expert gating function (GLM-4.5 uses sigmoid)
1808 ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func, false);
1809 if (hparams.expert_gating_func == LLAMA_EXPERT_GATING_FUNC_TYPE_NONE) {
1810 hparams.expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID;
1811 }
1812
1813 // NextN/MTP parameters
1814 ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.nextn_predict_layers, false);
1815
1816 // TODO: when MTP is implemented, this should probably be updated if needed
1817 hparams.n_layer_kv_from_start = hparams.n_layer - hparams.nextn_predict_layers;
1818
1819 switch (hparams.n_layer) {
1820 case 47: type = LLM_TYPE_106B_A12B; break; // GLM-4.5-Air (46 layers + 1 NextN layer)
1821 case 48: type = LLM_TYPE_102B_A12B; break; // Solar Open
1822 case 93: type = LLM_TYPE_355B_A32B; break; // GLM-4.5 (92 layers + 1 NextN layer)
1823 default: type = LLM_TYPE_UNKNOWN;
1824 }
1825 } break;
// BITNET: single known variant (26 layers -> 3B).
1826 case LLM_ARCH_BITNET:
1827 {
1828 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1829
1830 switch (hparams.n_layer) {
1831 case 26: type = LLM_TYPE_3B; break;
1832 default: type = LLM_TYPE_UNKNOWN;
1833 }
1834 } break;
// T5: encoder-decoder — decoder layer count defaults to n_layer unless the GGUF
// overrides it; size labels disambiguated by FFN width for t5 vs flan-t5.
1835 case LLM_ARCH_T5:
1836 {
1837 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1838 ml.get_key(LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, hparams.n_rel_attn_bkts);
1839
1840 uint32_t dec_start_token_id;
1841 if (ml.get_key(LLM_KV_DECODER_START_TOKEN_ID, dec_start_token_id, false)) {
1842 hparams.dec_start_token_id = dec_start_token_id;
1843 }
1844
1845 hparams.dec_n_layer = hparams.n_layer;
1846 ml.get_key(LLM_KV_DECODER_BLOCK_COUNT, hparams.dec_n_layer, false);
1847
1848 switch (hparams.n_layer) {
1849 case 6: type = LLM_TYPE_60M; break; // t5-small
1850 case 8: type = LLM_TYPE_80M; break; // flan-t5-small
1851 case 12:
1852 switch (hparams.n_ff()) {
1853 case 3072: type = LLM_TYPE_220M; break; // t5-base
1854 case 2048: type = LLM_TYPE_250M; break; // flan-t5-base
1855 default: type = LLM_TYPE_UNKNOWN;
1856 } break;
1857 case 24:
1858 switch (hparams.n_ff()) {
1859 case 4096: type = LLM_TYPE_770M; break; // t5-large
1860 case 2816: type = LLM_TYPE_780M; break; // flan-t5-large
1861 case 16384: type = LLM_TYPE_3B; break; // t5-3b
1862 case 5120: type = LLM_TYPE_3B; break; // flan-t5-xl
1863 case 65536: type = LLM_TYPE_11B; break; // t5-11b
1864 case 10240: type = LLM_TYPE_11B; break; // flan-t5-xxl
1865 default: type = LLM_TYPE_UNKNOWN;
1866 }  break;
1867 default: type = LLM_TYPE_UNKNOWN;
1868 }
1869 } break;
// T5ENCODER: encoder-only T5 — no size table; type stays UNKNOWN.
1870 case LLM_ARCH_T5ENCODER:
1871 {
1872 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1873 ml.get_key(LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, hparams.n_rel_attn_bkts);
1874 type = LLM_TYPE_UNKNOWN;
1875 } break;
// JAIS: ALiBi positional bias — max bias is required.
1876 case LLM_ARCH_JAIS:
1877 {
1878 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
1879 ml.get_key(LLM_KV_ATTENTION_MAX_ALIBI_BIAS, hparams.f_max_alibi_bias);
1880
1881 switch (hparams.n_layer) {
1882 case 24: type = LLM_TYPE_1_3B; break;
1883 case 40: type = LLM_TYPE_13B; break;
1884 /* TODO: add variants */
1885 default: type = LLM_TYPE_UNKNOWN;
1886 }
1887 } break;
// NEMOTRON: single known variant (32 layers -> 4B).
1888 case LLM_ARCH_NEMOTRON:
1889 {
1890 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
1891 switch (hparams.n_layer) {
1892 case 32: type = LLM_TYPE_4B; break;
1893 default: type = LLM_TYPE_UNKNOWN;
1894 }
1895 } break;
// NEMOTRON_H / NEMOTRON_H_MOE: hybrid SSM/attention (optionally MoE) — both
// labels share this loader; a layer is recurrent iff it has neither KV heads
// nor an FFN of its own.
1895 case LLM_ARCH_NEMOTRON_H:
1896 case LLM_ARCH_NEMOTRON_H_MOE:
1897 {
1898 ml.get_key(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv);
1899 ml.get_key(LLM_KV_SSM_INNER_SIZE, hparams.ssm_d_inner);
1900 ml.get_key(LLM_KV_SSM_STATE_SIZE, hparams.ssm_d_state);
1901 ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank);
1902 ml.get_key(LLM_KV_SSM_GROUP_COUNT, hparams.ssm_n_group);
1903
1904 // A layer is recurrent IFF the n_head_kv value is set to 0 and
1905 // the n_ff value is set to 0
1906 for (uint32_t i = 0; i < hparams.n_layer; ++i) {
1907 hparams.recurrent_layer_arr[i] = (hparams.n_head_kv(i) == 0 && hparams.n_ff(i) == 0);
1908 }
1909
1910 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1911
1912 ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false);
1913 ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, false);
1914 ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared, false);
1915 ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false);
1916 ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale, false);
1917
1918 switch (hparams.n_layer) {
1919 case 52: type = LLM_TYPE_31B_A3_5B; break; // Nemotron-H_MOE 31B
1920 case 56: type = LLM_TYPE_9B; break;
1921 default: type = LLM_TYPE_UNKNOWN;
1922 }
1923 } break;
// EXAONE: single known variant (32 layers -> 8B).
1924 case LLM_ARCH_EXAONE:
1925 {
1926 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1927
1928 switch (hparams.n_layer) {
1929 case 32: type = LLM_TYPE_8B; break;
1930 default: type = LLM_TYPE_UNKNOWN;
1931 }
1932 } break;
// EXAONE4: only the 64-layer (32B) variant uses SWA; the 30-layer 1.2B does not.
1933 case LLM_ARCH_EXAONE4:
1934 {
1935 if (hparams.n_layer == 64) { // 32B
1936 hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
1937 hparams.n_swa = 4096;
1938 hparams.set_swa_pattern(4);
1939
1940 hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
1941 hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
1942 ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
1943 }
1944
1945 ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
1946 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1947
1948 switch (hparams.n_layer) {
1949 case 30: type = LLM_TYPE_1_2B; break;
1950 case 64: type = LLM_TYPE_32B; break;
1951 default: type = LLM_TYPE_UNKNOWN;
1952 }
1953 } break;
// EXAONE_MOE: always-on SWA (window 128 default, pattern of 4) plus the full
// MoE key set; NextN/MTP layer count is optional.
1954 case LLM_ARCH_EXAONE_MOE:
1955 {
1956 hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
1957 hparams.n_swa = 128;
1958 hparams.set_swa_pattern(4);
1959 hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
1960 hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
1961
1962 ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
1963 ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
1964 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1965 ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared, false);
1966 ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
1967 ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, false);
1968 ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func);
1969 ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale, false);
1970 ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false);
1971 ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
1972
1973 ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.nextn_predict_layers, false);
1974
1975 switch (hparams.n_layer) {
// intentional fallthrough: 48 and 49 layers are both the 235B-A22B variant
1976 case 32: type = LLM_TYPE_30B_A3B; break;
1977 case 48:
1978 case 49: type = LLM_TYPE_235B_A22B; break;
1979 default: type = LLM_TYPE_UNKNOWN;
1980 }
1981 } break;
// RWKV6 / RWKV6QWEN2: linear-attention (WKV) family — both labels share this
// loader; either LayerNorm or RMS-norm epsilon may be present (both optional).
1982 case LLM_ARCH_RWKV6:
1983 case LLM_ARCH_RWKV6QWEN2:
1984 {
1985 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps, false);
1986 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps, false);
1987 ml.get_key(LLM_KV_WKV_HEAD_SIZE, hparams.wkv_head_size);
1988 ml.get_key(LLM_KV_TIME_MIX_EXTRA_DIM, hparams.time_mix_extra_dim);
1989 ml.get_key(LLM_KV_TIME_DECAY_EXTRA_DIM, hparams.time_decay_extra_dim);
1990 ml.get_key(LLM_KV_RESCALE_EVERY_N_LAYERS, hparams.rescale_every_n_layers, false);
1991 ml.get_key(LLM_KV_TOKEN_SHIFT_COUNT, hparams.token_shift_count, false);
1992
1993 switch (hparams.n_layer) {
1994 case 24: type = LLM_TYPE_1_6B; break;
1995 case 32:
1996 switch (hparams.n_embd) {
1997 case 2560: type = LLM_TYPE_3B; break;
1998 case 4096: type = LLM_TYPE_7B; break;
1999 default: type = LLM_TYPE_UNKNOWN;
2000 } break;
2001 case 61: type = LLM_TYPE_14B; break;
2002 case 64: type = LLM_TYPE_32B; break;
2003 default: type = LLM_TYPE_UNKNOWN;
2004 }
2005 } break;
2006 case LLM_ARCH_RWKV7:
2007 case LLM_ARCH_ARWKV7:
2008 {
2009 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps, false);
2010 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps, false);
2011 ml.get_key(LLM_KV_WKV_HEAD_SIZE, hparams.wkv_head_size);
2012 ml.get_key(LLM_KV_ATTENTION_DECAY_LORA_RANK, hparams.n_lora_decay);
2013 ml.get_key(LLM_KV_ATTENTION_ICLR_LORA_RANK, hparams.n_lora_iclr);
2014 ml.get_key(LLM_KV_ATTENTION_VALUE_RESIDUAL_MIX_LORA_RANK, hparams.n_lora_value_res_mix);
2015 ml.get_key(LLM_KV_ATTENTION_GATE_LORA_RANK, hparams.n_lora_gate, false);
2016 ml.get_key(LLM_KV_TOKEN_SHIFT_COUNT, hparams.token_shift_count, false);
2017
2018 switch (hparams.n_layer) {
2019 case 12:
2020 switch (hparams.n_embd) {
2021 case 768: type = LLM_TYPE_190M; break;
2022 default: type = LLM_TYPE_UNKNOWN;
2023 } break;
2024 case 24:
2025 switch (hparams.n_embd) {
2026 case 1024: type = LLM_TYPE_450M; break;
2027 case 2048: type = LLM_TYPE_1_5B; break;
2028 default: type = LLM_TYPE_UNKNOWN;
2029 } break;
2030 case 28:
2031 switch (hparams.n_embd) {
2032 case 1536: type = LLM_TYPE_1_5B; break;
2033 case 3584: type = LLM_TYPE_7B; break;
2034 default: type = LLM_TYPE_UNKNOWN;
2035 } break;
2036 case 32:
2037 switch (hparams.n_embd) {
2038 case 2560: type = LLM_TYPE_2_9B; break;
2039 case 4096: type = LLM_TYPE_7B; break;
2040 default: type = LLM_TYPE_UNKNOWN;
2041 } break;
2042 case 61:
2043 switch (hparams.n_embd) {
2044 case 4096: type = LLM_TYPE_14B; break;
2045 default: type = LLM_TYPE_UNKNOWN;
2046 } break;
2047 default: type = LLM_TYPE_UNKNOWN;
2048 }
2049 } break;
2050 case LLM_ARCH_GRANITE:
2051 case LLM_ARCH_GRANITE_MOE:
2052 {
2053 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
2054 ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale);
2055 ml.get_key(LLM_KV_RESIDUAL_SCALE, hparams.f_residual_scale);
2056 ml.get_key(LLM_KV_EMBEDDING_SCALE, hparams.f_embedding_scale);
2057 ml.get_key(LLM_KV_ATTENTION_SCALE, hparams.f_attention_scale);
2058
2059 // Granite uses rope_finetuned as a switch for rope, so default to true
2060 bool rope_finetuned = true;
2061 ml.get_key(LLM_KV_ROPE_SCALING_FINETUNED, rope_finetuned, false);
2062 hparams.rope_finetuned = rope_finetuned;
2063
2064 switch (hparams.n_layer) {
2065 case 32: type = LLM_TYPE_3B; break;
2066 case 40: type = LLM_TYPE_3B; break;
2067 // Add additional layer/vocab/etc checks here for other model sizes
2068 default: type = LLM_TYPE_UNKNOWN;
2069 }
2070
2071 // For Granite MoE Shared
2072 ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, /* required */ false);
2073 } break;
2074 case LLM_ARCH_GRANITE_HYBRID:
2075 {
2076 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
2077 ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale, /* required */ false);
2078 ml.get_key(LLM_KV_RESIDUAL_SCALE, hparams.f_residual_scale, /* required */ false);
2079 ml.get_key(LLM_KV_EMBEDDING_SCALE, hparams.f_embedding_scale, /* required */ false);
2080 ml.get_key(LLM_KV_ATTENTION_SCALE, hparams.f_attention_scale, /* required */ false);
2081
2082 ml.get_key(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv);
2083 ml.get_key(LLM_KV_SSM_INNER_SIZE, hparams.ssm_d_inner);
2084 ml.get_key(LLM_KV_SSM_STATE_SIZE, hparams.ssm_d_state);
2085 ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank);
2086 ml.get_key(LLM_KV_SSM_GROUP_COUNT, hparams.ssm_n_group);
2087
2088 // Granite uses rope_finetuned as a switch for rope, so default to true
2089 bool rope_finetuned = true;
2090 ml.get_key(LLM_KV_ROPE_SCALING_FINETUNED, rope_finetuned, false);
2091 hparams.rope_finetuned = rope_finetuned;
2092
2093 // A layer is recurrent IFF the n_head_kv value is set to 0
2094 for (uint32_t i = 0; i < hparams.n_layer; ++i) {
2095 hparams.recurrent_layer_arr[i] = hparams.n_head_kv(i) == 0;
2096 }
2097
2098 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
2099
2100 switch (hparams.n_embd) {
2101 case 768: type = LLM_TYPE_350M; break;
2102 case 1536: type = (hparams.n_embd == 2048 ? LLM_TYPE_7B_A1B : LLM_TYPE_1B); break;
2103 case 2048: case 2560: type = LLM_TYPE_3B; break;
2104 case 4096: type = LLM_TYPE_32B; break;
2105 default: type = LLM_TYPE_UNKNOWN;
2106 }
2107
2108 // For Granite MoE Shared
2109 ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, /* required */ false);
2110 } break;
2111 case LLM_ARCH_CHAMELEON:
2112 {
2113 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
2114 hparams.f_norm_eps = 1e-5; // eps for qk-norm, torch default
2115 ml.get_key(LLM_KV_SWIN_NORM, hparams.swin_norm);
2116
2117 switch (hparams.n_layer) {
2118 case 32: type = LLM_TYPE_7B; break;
2119 case 48: type = LLM_TYPE_34B; break;
2120 default: type = LLM_TYPE_UNKNOWN;
2121 }
2122 } break;
2123 case LLM_ARCH_WAVTOKENIZER_DEC:
2124 {
2125 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
2126 ml.get_key(LLM_KV_ATTENTION_GROUPNORM_EPS, hparams.f_norm_group_eps);
2127 ml.get_key(LLM_KV_ATTENTION_GROUPNORM_GROUPS, hparams.n_norm_groups);
2128 ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
2129 } break;
2130 case LLM_ARCH_BAILINGMOE:
2131 {
2132 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
2133 ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
2134 ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
2135 ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
2136 ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
2137 ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false);
2138
2139 switch (hparams.n_layer) {
2140 case 28: type = LLM_TYPE_16B; break;
2141 case 88: type = LLM_TYPE_290B; break;
2142 default: type = LLM_TYPE_UNKNOWN;
2143 }
2144 } break;
2145 case LLM_ARCH_BAILINGMOE2:
2146 {
2147 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
2148 ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
2149 ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
2150 ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp);
2151 ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
2152 ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
2153 ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false);
2154 ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func);
2155 ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.nextn_predict_layers, false);
2156
2157 // TODO: when MTP is implemented, this should probably be updated if needed
2158 hparams.n_layer_kv_from_start = hparams.n_layer - hparams.nextn_predict_layers;
2159
2160 switch (hparams.n_layer) {
2161 case 20: type = LLM_TYPE_16B_A1B; break;
2162 case 21: type = LLM_TYPE_16B_A1B; break;
2163 case 32: type = LLM_TYPE_100B_A6B; break;
2164 case 33: type = LLM_TYPE_100B_A6B; break;
2165 default: type = LLM_TYPE_UNKNOWN;
2166 }
2167 } break;
2168 case LLM_ARCH_DOTS1:
2169 {
2170 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
2171 ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
2172 ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
2173 ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
2174 ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
2175 ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false);
2176 ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func, false);
2177 switch (hparams.n_layer) {
2178 case 62: type = LLM_TYPE_142B; break;
2179 default: type = LLM_TYPE_UNKNOWN;
2180 }
2181 } break;
2182 case LLM_ARCH_ERNIE4_5:
2183 case LLM_ARCH_ERNIE4_5_MOE:
2184 {
2185 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
2186 if (arch == LLM_ARCH_ERNIE4_5_MOE) {
2187 ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
2188 ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, false);
2189 ml.get_key(LLM_KV_INTERLEAVE_MOE_LAYER_STEP, hparams.n_moe_layer_step);
2190 ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
2191 }
2192
2193 switch (hparams.n_layer) {
2194 case 18: type = LLM_TYPE_0_3B; break;
2195 case 28: type = LLM_TYPE_21B_A3B; break;
2196 case 54: type = LLM_TYPE_300B_A47B; break;
2197 default: type = LLM_TYPE_UNKNOWN;
2198 }
2199 } break;
2200 case LLM_ARCH_FALCON_H1:
2201 {
2202 // Common parameters
2203 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
2204
2205 // SSM parameters
2206 ml.get_key(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv);
2207 ml.get_key(LLM_KV_SSM_INNER_SIZE, hparams.ssm_d_inner);
2208 ml.get_key(LLM_KV_SSM_STATE_SIZE, hparams.ssm_d_state);
2209 ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank);
2210 ml.get_key(LLM_KV_SSM_GROUP_COUNT, hparams.ssm_n_group);
2211
2212 std::fill(hparams.recurrent_layer_arr.begin(), hparams.recurrent_layer_arr.end(), true);
2213
2214 switch (hparams.n_layer) {
2215 case 36:
2216 type = LLM_TYPE_0_5B; break;
2217 case 24:
2218 type = LLM_TYPE_1_5B; break;
2219 case 66:
2220 type = LLM_TYPE_1B; break;
2221 case 32:
2222 type = LLM_TYPE_3B; break;
2223 case 44:
2224 type = LLM_TYPE_7B; break;
2225 case 72:
2226 type = LLM_TYPE_34B; break;
2227 default:
2228 type = LLM_TYPE_UNKNOWN;
2229 }
2230 } break;
2231 case LLM_ARCH_HUNYUAN_MOE:
2232 {
2233 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
2234 ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
2235 ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp);
2236
2237 switch (hparams.n_layer) {
2238 case 32: type = LLM_TYPE_A13B; break;
2239 default: type = LLM_TYPE_UNKNOWN;
2240 }
2241 } break;
2242 case LLM_ARCH_HUNYUAN_DENSE:
2243 {
2244 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
2245
2246 switch (hparams.n_embd) {
2247 case 1024: type = LLM_TYPE_0_5B; break;
2248 case 2048: type = LLM_TYPE_1_8B; break;
2249 case 3072: type = LLM_TYPE_4B; break;
2250 case 4096: type = LLM_TYPE_7B; break;
2251 default: type = LLM_TYPE_UNKNOWN;
2252 }
2253 } break;
2254 case LLM_ARCH_SMOLLM3:
2255 {
2256 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
2257 hparams.n_no_rope_layer_step = 4;
2258
2259 switch (hparams.n_layer) {
2260 case 36: type = LLM_TYPE_3B; break;
2261 default: type = LLM_TYPE_UNKNOWN;
2262 }
2263 } break;
2264 case LLM_ARCH_OPENAI_MOE:
2265 {
2266 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
2267 ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
2268 ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
2269
2270 hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
2271 hparams.set_swa_pattern(2);
2272
2273 hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
2274 hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
2275 ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
2276
2277 switch (hparams.n_layer) {
2278 case 24: type = LLM_TYPE_20B; break;
2279 case 36: type = LLM_TYPE_120B; break;
2280 default: type = LLM_TYPE_UNKNOWN;
2281 }
2282 } break;
2283 case LLM_ARCH_LFM2:
2284 {
2285 ml.get_key(LLM_KV_SHORTCONV_L_CACHE, hparams.n_shortconv_l_cache);
2286 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
2287 for (uint32_t il = 0; il < hparams.n_layer; ++il) {
2288 hparams.recurrent_layer_arr[il] = hparams.n_head_kv(il) == 0;
2289 }
2290 hparams.n_layer_dense_lead = hparams.n_layer;
2291 switch (hparams.n_ff()) {
2292 case 4608: type = LLM_TYPE_350M; break;
2293 case 6912: type = LLM_TYPE_700M; break;
2294 case 8192: type = LLM_TYPE_1_2B; break;
2295 case 10752: type = LLM_TYPE_2_6B; break;
2296 default: type = LLM_TYPE_UNKNOWN;
2297 }
2298 } break;
2299 case LLM_ARCH_LFM2MOE:
2300 {
2301 ml.get_key(LLM_KV_SHORTCONV_L_CACHE, hparams.n_shortconv_l_cache);
2302 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
2303 ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
2304 ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
2305 ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func);
2306
2307 for (uint32_t il = 0; il < hparams.n_layer; ++il) {
2308 hparams.recurrent_layer_arr[il] = hparams.n_head_kv(il) == 0;
2309 }
2310
2311 type = LLM_TYPE_8B_A1B;
2312 } break;
2313 case LLM_ARCH_SMALLTHINKER:
2314 {
2315 const bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
2316
2317 if (found_swa && hparams.n_swa > 0) {
2318 hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
2319 hparams.n_swa = 4096;
2320 hparams.set_swa_pattern(4, true);
2321
2322 hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
2323 hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
2324 ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
2325 } else {
2326 hparams.swa_type = LLAMA_SWA_TYPE_NONE;
2327 hparams.n_no_rope_layer_step = hparams.n_layer;
2328 }
2329
2330 ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false);
2331 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
2332 ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func, false);
2333
2334 switch (hparams.n_layer) {
2335 case 32: type = LLM_TYPE_4B; break;
2336 case 52: type = LLM_TYPE_20B; break;
2337 default: type = LLM_TYPE_UNKNOWN;
2338 }
2339 } break;
2340 case LLM_ARCH_GROVEMOE:
2341 {
2342 ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
2343 ml.get_key(LLM_KV_EXPERT_CHUNK_FEED_FORWARD_LENGTH, hparams.n_ff_chexp);
2344 ml.get_key(LLM_KV_EXPERT_GROUP_SCALE, hparams.expert_group_scale);
2345 ml.get_key(LLM_KV_EXPERTS_PER_GROUP, hparams.n_group_experts);
2346 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
2347
2348 switch (hparams.n_layer) {
2349 case 48: type = LLM_TYPE_30B_A3B; break;
2350 default: type = LLM_TYPE_UNKNOWN;
2351 }
2352 } break;
2353 case LLM_ARCH_APERTUS:
2354 {
2355 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
2356 ml.get_key_or_arr(LLM_KV_XIELU_ALPHA_N, hparams.xielu_alpha_n, hparams.n_layer);
2357 ml.get_key_or_arr(LLM_KV_XIELU_ALPHA_P, hparams.xielu_alpha_p, hparams.n_layer);
2358 ml.get_key_or_arr(LLM_KV_XIELU_BETA, hparams.xielu_beta, hparams.n_layer);
2359 ml.get_key_or_arr(LLM_KV_XIELU_EPS, hparams.xielu_eps, hparams.n_layer);
2360
2361 switch (hparams.n_layer) {
2362 case 32: type = LLM_TYPE_8B; break;
2363 default: type = LLM_TYPE_UNKNOWN;
2364 }
2365 } break;
2366 case LLM_ARCH_MINIMAX_M2:
2367 {
2368 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
2369 ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
2370 ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func, false);
2371
2372 switch (hparams.n_layer) {
2373 case 62: type = LLM_TYPE_230B_A10B; break;
2374 default: type = LLM_TYPE_UNKNOWN;
2375 }
2376 } break;
2377 case LLM_ARCH_COGVLM:
2378 {
2379 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
2380 switch (hparams.n_layer) {
2381 case 32: type = LLM_TYPE_13B; break;
2382 default: type = LLM_TYPE_UNKNOWN;
2383 }
2384 } break;
2385 case LLM_ARCH_PANGU_EMBED:
2386 {
2387 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
2388 switch (hparams.n_layer) {
2389 case 26: type = LLM_TYPE_1B; break; // openPangu-Embedded-1B-V1.1
2390 case 34: type = LLM_TYPE_7B; break; // openPangu-Embedded-7B-V1.1
2391 default: type = LLM_TYPE_UNKNOWN;
2392 }
2393 } break;
2394 case LLM_ARCH_QWEN3NEXT:
2395 {
2396 ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false);
2397 ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, false);
2398 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
2399
2400 // Load linear attention (gated delta net) parameters
2401 ml.get_key(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv);
2402 ml.get_key(LLM_KV_SSM_INNER_SIZE, hparams.ssm_d_inner);
2403 ml.get_key(LLM_KV_SSM_STATE_SIZE, hparams.ssm_d_state);
2404 ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank);
2405 ml.get_key(LLM_KV_SSM_GROUP_COUNT, hparams.ssm_n_group);
2406
2407 // Mark recurrent layers (linear attention layers)
2408 {
2409 uint32_t full_attn_interval = 4;
2410 ml.get_key(LLM_KV_FULL_ATTENTION_INTERVAL, full_attn_interval, false);
2411 for (uint32_t i = 0; i < hparams.n_layer; ++i) {
2412 hparams.recurrent_layer_arr[i] = ((i + 1) % full_attn_interval != 0);
2413 }
2414 }
2415
2416 switch (hparams.n_layer) {
2417 case 48: type = LLM_TYPE_80B_A3B; break;
2418 default: type = LLM_TYPE_UNKNOWN;
2419 }
2420 } break;
2421 case LLM_ARCH_QWEN35:
2422 {
2423 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
2424 ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS, hparams.rope_sections, 4, true);
2425
2426 // Load linear attention (gated delta net) parameters
2427 ml.get_key(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv);
2428 ml.get_key(LLM_KV_SSM_INNER_SIZE, hparams.ssm_d_inner);
2429 ml.get_key(LLM_KV_SSM_STATE_SIZE, hparams.ssm_d_state);
2430 ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank);
2431 ml.get_key(LLM_KV_SSM_GROUP_COUNT, hparams.ssm_n_group);
2432
2433 // Mark recurrent layers (linear attention layers)
2434 {
2435 uint32_t full_attn_interval = 4;
2436 ml.get_key(LLM_KV_FULL_ATTENTION_INTERVAL, full_attn_interval, false);
2437 for (uint32_t i = 0; i < hparams.n_layer; ++i) {
2438 hparams.recurrent_layer_arr[i] = ((i + 1) % full_attn_interval != 0);
2439 }
2440 }
2441
2442 switch (hparams.n_layer) {
2443 case 24: type = LLM_TYPE_2B; break;
2444 default: type = LLM_TYPE_UNKNOWN;
2445 }
2446 } break;
2447 case LLM_ARCH_QWEN35MOE:
2448 {
2449 ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false);
2450 ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, false);
2451 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
2452
2453 ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS, hparams.rope_sections, 4, true);
2454
2455 // Load linear attention (gated delta net) parameters
2456 ml.get_key(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv);
2457 ml.get_key(LLM_KV_SSM_INNER_SIZE, hparams.ssm_d_inner);
2458 ml.get_key(LLM_KV_SSM_STATE_SIZE, hparams.ssm_d_state);
2459 ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank);
2460 ml.get_key(LLM_KV_SSM_GROUP_COUNT, hparams.ssm_n_group);
2461
2462 // Mark recurrent layers (linear attention layers)
2463 {
2464 uint32_t full_attn_interval = 4;
2465 ml.get_key(LLM_KV_FULL_ATTENTION_INTERVAL, full_attn_interval, false);
2466 for (uint32_t i = 0; i < hparams.n_layer; ++i) {
2467 hparams.recurrent_layer_arr[i] = ((i + 1) % full_attn_interval != 0);
2468 }
2469 }
2470
2471 switch (hparams.n_layer) {
2472 case 28: type = LLM_TYPE_35B_A3B; break;
2473 case 48: type = LLM_TYPE_80B_A3B; break;
2474 default: type = LLM_TYPE_UNKNOWN;
2475 }
2476 } break;
2477 case LLM_ARCH_MISTRAL3:
2478 {
2479 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
2480 ml.get_key(LLM_KV_ATTENTION_TEMPERATURE_SCALE, hparams.f_attn_temp_scale, false);
2481
2482 ml.get_key(LLM_KV_ROPE_SCALING_YARN_BETA_FAST, hparams.yarn_beta_fast, false);
2483 ml.get_key(LLM_KV_ROPE_SCALING_YARN_BETA_SLOW, hparams.yarn_beta_slow, false);
2484 ml.get_key(LLM_KV_ROPE_SCALING_YARN_LOG_MUL, hparams.rope_yarn_log_mul, 0.0f);
2485
2486 hparams.f_attn_temp_offset = 0.0f;
2487
2488 // TODO: maybe add n_attn_temp_floor_scale as a separate KV?
2489 if (hparams.f_attn_temp_scale != 0.0f) {
2490 hparams.n_attn_temp_floor_scale = hparams.n_ctx_orig_yarn;
2491 if (hparams.n_attn_temp_floor_scale == 0) {
2492 throw std::runtime_error("invalid n_ctx_orig_yarn for attention temperature scaling");
2493 }
2494 }
2495
2496 switch (hparams.n_layer) {
2497 case 26: type = LLM_TYPE_3B; break;
2498 case 34: type = LLM_TYPE_8B; break;
2499 case 40: type = LLM_TYPE_14B; break;
2500 default: type = LLM_TYPE_UNKNOWN;
2501 }
2502 } break;
2503 case LLM_ARCH_MIMO2:
2504 {
2505 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
2506
2507 hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
2508
2509 ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
2510 ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
2511 ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa);
2512 ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, hparams.swa_layers, hparams.n_layer);
2513
2514 switch (hparams.n_layer) {
2515 case 48: type = LLM_TYPE_310B_A15B; break;
2516 default: type = LLM_TYPE_UNKNOWN;
2517 }
2518 } break;
2519 case LLM_ARCH_KIMI_LINEAR:
2520 {
2521 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
2522 ml.get_key(LLM_KV_ATTENTION_KEY_LENGTH_MLA, hparams.n_embd_head_k_mla_impl);
2523 ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH_MLA, hparams.n_embd_head_v_mla_impl);
2524 ml.get_key(LLM_KV_ATTENTION_KV_LORA_RANK, hparams.n_lora_kv);
2525 ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot);
2526 ml.get_key(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv);
2527 ml.get_key(LLM_KV_KDA_HEAD_DIM, hparams.n_embd_head_kda);
2528
2529 // MLA qk_rope_head_dim (for reference)
2530 // qk_rope_head_dim = 64, qk_nope_head_dim = 128, qk_head_dim = 192
2531
2532 // Mark KDA layers as recurrent using n_head_kv pattern (like Jamba)
2533 // Set n_head_kv = 0 for KDA layers (recurrent), n_head_kv = n_head for MLA layers (attention)
2534 for (uint32_t i = 0; i < hparams.n_layer; ++i) {
2535 hparams.recurrent_layer_arr[i] = hparams.n_head_kv(i) == 0; // KDA layers are recurrent
2536 }
2537
2538 // MoE parameters - Kimi uses moe_intermediate_size = 1024
2539 ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
2540 ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
2541 ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
2542 ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
2543 ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func);
2544
2545 switch (hparams.n_layer) {
2546 case 27: type = LLM_TYPE_48B_A3B; break; // Kimi-Linear-48B-A3B
2547 default: type = LLM_TYPE_UNKNOWN;
2548 }
2549 } break;
2550 case LLM_ARCH_STEP35:
2551 {
2552 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
2553
2554 hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
2555
2556 // MoE + SWA parameters
2557 ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
2558 ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, false);
2559 ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func, false);
2560 ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale, false);
2561 ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false);
2562
2563 // Step35 uses sigmoid gating by default (if not set in GGUF)
2564 if (hparams.expert_gating_func == LLAMA_EXPERT_GATING_FUNC_TYPE_NONE) {
2565 hparams.expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID;
2566 }
2567
2568 ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
2569 ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa);
2570 ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, hparams.swa_layers, hparams.n_layer);
2571 ml.get_key_or_arr(LLM_KV_SWIGLU_CLAMP_EXP, hparams.swiglu_clamp_exp, hparams.n_layer, false);
2572 ml.get_key_or_arr(LLM_KV_SWIGLU_CLAMP_SHEXP, hparams.swiglu_clamp_shexp, hparams.n_layer, false);
2573
2574 switch (hparams.n_layer) {
2575 case 45: type = LLM_TYPE_196B_A11B; break;
2576 default: type = LLM_TYPE_UNKNOWN;
2577 }
2578 } break;
2579 default: throw std::runtime_error("unsupported model architecture");
2580 }
2581
2582 pimpl->n_bytes = ml.n_bytes;
2583
2584 pimpl->desc_str = arch_name() + " " + type_name() + " " + ml.ftype_name();
2585
2586 if (hparams.f_max_alibi_bias > 0.0f) {
2587 hparams.use_alibi = true;
2588 }
2589
2590 hparams.rope_type = llama_model_rope_type(this);
2591}
2592
2593void llama_model::load_vocab(llama_model_loader & ml) {
2594 const auto kv = LLM_KV(arch);
2595
2596 vocab.load(ml, kv);
2597}
2598
2599bool llama_model::load_tensors(llama_model_loader & ml) {
2600 const auto & split_mode = params.split_mode;
2601 const auto & use_mlock = params.use_mlock;
2602 const auto & tensor_split = params.tensor_split;
2603
2604 const int n_layer = hparams.n_layer;
2605 const int n_gpu_layers = this->n_gpu_layers();
2606
2607 const bool use_mmap_buffer = true;
2608
2609 LLAMA_LOG_INFO("%s: loading model tensors, this can take a while... (mmap = %s, direct_io = %s)\n",
2610 __func__, ml.use_mmap ? "true" : "false", ml.use_direct_io ? "true" : "false");
2611
2612 // build a list of buffer types for the CPU and GPU devices
2613 pimpl->cpu_buft_list = make_cpu_buft_list(devices, params.use_extra_bufts, params.no_host);
2614 for (auto * dev : devices) {
2615 buft_list_t buft_list = make_gpu_buft_list(dev, split_mode, tensor_split);
2616 // add CPU buffer types as a fallback
2617 buft_list.insert(buft_list.end(), pimpl->cpu_buft_list.begin(), pimpl->cpu_buft_list.end());
2618 pimpl->gpu_buft_list.emplace(dev, std::move(buft_list));
2619 }
2620
2621 ggml_backend_dev_t cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
2622 if (cpu_dev == nullptr) {
2623 throw std::runtime_error(format("%s: no CPU backend found", __func__));
2624 }
2625
2626 // calculate the split points
2627 bool all_zero = tensor_split == nullptr || std::all_of(tensor_split, tensor_split + n_devices(), [](float x) { return x == 0.0f; });
2628 std::vector<float> splits(n_devices());
2629 if (all_zero) {
2630 // default split, by free memory
2631 for (size_t i = 0; i < n_devices(); ++i) {
2632 ggml_backend_dev_t dev = devices[i];
2633 size_t total;
2634 size_t free;
2635 ggml_backend_dev_memory(dev, &free, &total);
2636
2637 // devices can return 0 bytes for free and total memory if they do not
2638 // have any to report. in this case, we will use the host memory as a fallback
2639 // fixes: https://github.com/ggml-org/llama.cpp/issues/18577
2640 if (free == 0 && total == 0) {
2641 ggml_backend_dev_memory(cpu_dev, &free, &total);
2642 }
2643 splits[i] = free;
2644 }
2645 } else {
2646 std::copy(tensor_split, tensor_split + n_devices(), splits.begin());
2647 }
2648
2649 // sum and normalize the splits to get the split points
2650 float split_sum = 0.0f;
2651 for (size_t i = 0; i < n_devices(); ++i) {
2652 split_sum += splits[i];
2653 splits[i] = split_sum;
2654 }
2655 for (size_t i = 0; i < n_devices(); ++i) {
2656 splits[i] /= split_sum;
2657 }
2658
2659 const int i_gpu_start = std::max(int(hparams.n_layer) + 1 - n_gpu_layers, 0);
2660 const int act_gpu_layers = devices.empty() ? 0 : std::min(n_gpu_layers, int(n_layer) + 1);
2661 auto get_layer_buft_list = [&](int il) -> llama_model::impl::layer_dev {
2662 const bool is_swa = il < int(hparams.n_layer) && hparams.is_swa(il);
2663 if (il < i_gpu_start || (il - i_gpu_start) >= act_gpu_layers) {
2664 LLAMA_LOG_DEBUG("load_tensors: layer %3d assigned to device %s, is_swa = %d\n", il, ggml_backend_dev_name(cpu_dev), is_swa);
2665 return {cpu_dev, &pimpl->cpu_buft_list};
2666 }
2667 const int layer_gpu = std::upper_bound(splits.begin(), splits.begin() + n_devices(), float(il - i_gpu_start)/act_gpu_layers) - splits.begin();
2668 auto * dev = devices.at(layer_gpu);
2669 LLAMA_LOG_DEBUG("load_tensors: layer %3d assigned to device %s, is_swa = %d\n", il, ggml_backend_dev_name(dev), is_swa);
2670 return {dev, &pimpl->gpu_buft_list.at(dev)};
2671 };
2672
2673 // assign the input layer
2674 // there is very little benefit to offloading the input layer, so always keep it on the CPU
2675 pimpl->dev_input = { cpu_dev, &pimpl->cpu_buft_list };
2676
2677 // assign the repeating layers to the devices according to the splits
2678 pimpl->dev_layer.resize(n_layer);
2679 for (int il = 0; il < n_layer; ++il) {
2680 pimpl->dev_layer[il] = get_layer_buft_list(il);
2681 }
2682
2683 // assign the output layer
2684 pimpl->dev_output = get_layer_buft_list(n_layer);
2685
2686 // one ggml context per buffer type
2687 int max_n_tensors = ml.n_tensors;
2688 max_n_tensors += 1; // duplicated output tensor
2689 max_n_tensors += n_layer*2; // duplicated rope freq tensors
2690 const size_t ctx_size = ggml_tensor_overhead()*max_n_tensors;
2691
2692 // define a comparator for the buft -> ctx map to ensure that the order is well-defined:
2693 struct ggml_backend_buft_comparator {
2694 bool operator()(const ggml_backend_buffer_type_t & lhs, const ggml_backend_buffer_type_t & rhs) const {
2695 return strcmp(ggml_backend_buft_name(lhs), ggml_backend_buft_name(rhs)) < 0;
2696 }
2697 };
2698 std::map<ggml_backend_buffer_type_t, ggml_context_ptr, ggml_backend_buft_comparator> ctx_map;
2699
2700 auto ctx_for_buft = [&](ggml_backend_buffer_type_t buft) -> ggml_context * {
2701 auto it = ctx_map.find(buft);
2702 if (it == ctx_map.end()) {
2703 ggml_init_params params = {
2704 /*.mem_size =*/ ctx_size,
2705 /*.mem_buffer =*/ NULL,
2706 /*.no_alloc =*/ true,
2707 };
2708
2709 ggml_context * ctx = ggml_init(params);
2710 if (!ctx) {
2711 throw std::runtime_error(format("failed to create ggml context"));
2712 }
2713
2714 ctx_map.emplace(buft, ctx);
2715
2716 return ctx;
2717 }
2718 return it->second.get();
2719 };
2720
    // convenience aliases for the loader's tensor-creation flags
    const auto TENSOR_DUPLICATED   = llama_model_loader::TENSOR_DUPLICATED;
    const auto TENSOR_NOT_REQUIRED = llama_model_loader::TENSOR_NOT_REQUIRED;
    const auto TENSOR_SKIP         = llama_model_loader::TENSOR_SKIP;

    // create tensors for the weights
    {
        // note: cast to int64_t since we will use these for the tensor dimensions
        const int64_t n_head        = hparams.n_head();
        const int64_t n_head_kv     = hparams.n_head_kv();
        const int64_t n_embd        = hparams.n_embd;
        const int64_t n_embd_k_gqa  = hparams.n_embd_k_gqa();
        const int64_t n_embd_v_gqa  = hparams.n_embd_v_gqa();
        const int64_t n_embd_head_k = hparams.n_embd_head_k;
        const int64_t n_embd_head_v = hparams.n_embd_head_v;
        const int64_t n_ff          = hparams.n_ff();
        const int64_t n_embd_gqa    = n_embd_v_gqa;
        const int64_t n_vocab       = vocab.n_tokens();
        const int64_t n_token_types = vocab.n_token_types();
        const int64_t n_rot         = hparams.n_rot;
        const int64_t n_expert      = hparams.n_expert;
        const int64_t n_expert_used = hparams.n_expert_used;
        const int64_t n_ctx_train   = hparams.n_ctx_train;

        // a model with expert (MoE) layers must route each token to at least one expert
        if (n_expert > 0 && hparams.n_expert_used == 0) {
            throw std::runtime_error("model has expert layers but no expert layers are used");
        }

        // track tensors that do not end up in the first (preferred) buffer type of
        // their layer's buffer type list; the first such move is recorded so it can
        // be reported
        int n_moved_tensors = 0;
        ggml_tensor * first_moved_tensor = nullptr;
        ggml_backend_buffer_type_t first_moved_from_buft = nullptr;
        ggml_backend_buffer_type_t first_moved_to_buft = nullptr;
        // create (or reuse) the metadata for a single weight tensor:
        //  - resolve the tensor name via `tn` and look up its metadata in the loader
        //  - select a buffer type for it (honoring user overrides and mmap constraints)
        //  - register it in the ggml context associated with that buffer type
        // returns nullptr for optional tensors that are absent and for skipped/unused tensors;
        // throws std::runtime_error for missing required tensors or unsatisfiable buffer types
        auto create_tensor = [&](const LLM_TN_IMPL & tn, const std::initializer_list<int64_t> & ne, int flags) -> ggml_tensor * {
            ggml_tensor * t_meta = ml.get_tensor_meta(tn.str().c_str());

            if (!t_meta) {
                // absent tensors are only an error when the tensor is required
                if (flags & TENSOR_NOT_REQUIRED) {
                    return nullptr;
                }
                throw std::runtime_error(format("missing tensor '%s'", tn.str().c_str()));
            }

            // some models use the token embedding tensor as the output, but since these are used in different layers and with different ops
            // the tensor is duplicated
            // to handle this, we check if the tensor is duplicated, and if so, we assume that it is being loaded as the output tensor
            llm_tensor tn_tensor = tn.tensor;
            if (tn.tensor == LLM_TENSOR_TOKEN_EMBD && flags & TENSOR_DUPLICATED) {
                tn_tensor = LLM_TENSOR_OUTPUT;
            }

            llm_tensor_info info;
            try {
                info = llm_tensor_info_for(tn_tensor);
            } catch (const std::out_of_range & e) {
                throw std::runtime_error(format("missing tensor info mapping for %s", tn.str().c_str()));
            }

            // skip unused tensors
            if (info.op == GGML_OP_NONE || flags & TENSOR_SKIP) {
                const size_t nbytes = ggml_nbytes(t_meta);
                LLAMA_LOG_WARN("model has unused tensor %s (size = %zu bytes) -- ignoring\n", tn.str().c_str(), nbytes);

                // adjust the loader's bookkeeping so its totals stay consistent:
                // the data is not loaded, but the tensor counts as handled
                ml.size_data -= nbytes;
                ml.n_created++;

                return nullptr;
            }

            // tensors with "bias" suffix are always used with GGML_OP_ADD or GGML_OP_ADD_ID
            ggml_op op;
            bool bias = tn.suffix != nullptr && strcmp(tn.suffix, "bias") == 0;
            if (bias) {
                if (info.op == GGML_OP_MUL_MAT_ID) {
                    op = GGML_OP_ADD_ID;
                } else {
                    op = GGML_OP_ADD;
                }
            } else {
                op = info.op;
            }

            // sanity checks: input/output tensors must not carry a layer id, repeating tensors must
            if (info.layer == LLM_TENSOR_LAYER_INPUT || info.layer == LLM_TENSOR_LAYER_OUTPUT) {
                if (tn.bid != -1) {
                    GGML_ABORT("input/output layer tensor %s used with a layer number", tn.str().c_str());
                }
            } else {
                if (tn.bid == -1) {
                    GGML_ABORT("repeating layer tensor %s used without a layer number", tn.str().c_str());
                }
            }

            // select the buffer type for this tensor
            buft_list_t * buft_list;
            switch (info.layer) {
                case LLM_TENSOR_LAYER_INPUT:
                    buft_list = pimpl->dev_input.buft_list;
                    break;
                case LLM_TENSOR_LAYER_OUTPUT:
                    buft_list = pimpl->dev_output.buft_list;
                    break;
                case LLM_TENSOR_LAYER_REPEATING:
                    buft_list = pimpl->dev_layer.at(tn.bid).buft_list;
                    break;
                default:
                    GGML_ABORT("invalid layer %d for tensor %s", info.layer, tn.str().c_str());
            }

            ggml_backend_buffer_type_t buft = nullptr;

            // check overrides: the first override whose regex matches the tensor name wins
            if (ml.tensor_buft_overrides) {
                std::string tensor_name = tn.str();
                for (const auto * overrides = ml.tensor_buft_overrides; overrides->pattern != nullptr; ++overrides) {
                    std::regex pattern(overrides->pattern);
                    if (std::regex_search(tensor_name, pattern)) {
                        if (overrides->buft == ggml_backend_cpu_buffer_type()) {
                            // when overriding to a CPU buffer, consider the extra buffer types
                            buft = select_weight_buft(hparams, t_meta, op, pimpl->cpu_buft_list);
                        } else {
                            buft = overrides->buft;
                        }

                        LLAMA_LOG_DEBUG("tensor %s (%zu MiB %s) buffer type overridden to %s\n",
                                tensor_name.c_str(),
                                ggml_nbytes(t_meta) / 1024 / 1024, ggml_type_name(t_meta->type),
                                ggml_backend_buft_name(buft));
                        break;
                    }
                }
            }

            // no override matched - pick the first buffer type in the layer's list
            // that supports the op this tensor is used with
            if (!buft) {
                buft = select_weight_buft(hparams, t_meta, op, *buft_list);
                if (!buft) {
                    throw std::runtime_error(format("failed to find a compatible buffer type for tensor %s", tn.str().c_str()));
                }
            }

            // avoid using a host buffer when using mmap
            auto * buft_dev = ggml_backend_buft_get_device(buft);
            if (ml.use_mmap && buft_dev && buft == ggml_backend_dev_host_buffer_type(buft_dev)) {
                auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
                if (!cpu_dev) {
                    throw std::runtime_error("no CPU backend found");
                }
                buft = ggml_backend_dev_buffer_type(cpu_dev);
            }

            // record tensors that were not placed in the preferred (front) buffer type
            if (buft != buft_list->front().second) {
                n_moved_tensors++;
                if (!first_moved_tensor) {
                    first_moved_tensor = t_meta;
                    first_moved_from_buft = buft_list->front().second;
                    first_moved_to_buft = buft;
                }
            }

            ggml_context * ctx = ctx_for_buft(buft);

            // if duplicated, check if the original tensor was allocated in the same buffer type context and avoid creating a new one
            if (flags & TENSOR_DUPLICATED) {
                ggml_tensor * t = ggml_get_tensor(ctx, tn.str().c_str());
                if (t) {
                    return t;
                }
            }
            return ml.create_tensor(ctx, tn, ne, flags);
        };
2890
        // one layer entry per transformer block; filled in by the per-arch switch below
        layers.resize(n_layer);

        // TODO: move to a separate function
        // tn(...) builds the architecture-specific canonical tensor name
        const auto tn = LLM_TN(arch);
2895 switch (arch) {
2896 case LLM_ARCH_LLAMA:
2897 case LLM_ARCH_REFACT:
2898 case LLM_ARCH_MINICPM:
2899 case LLM_ARCH_GRANITE:
2900 case LLM_ARCH_GRANITE_MOE:
2901 case LLM_ARCH_MISTRAL3:
2902 case LLM_ARCH_LLAMA_EMBED:
2903 {
2904 tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
2905
2906 // output
2907 output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
2908 output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
2909
2910 // if output is NULL, init from the input tok embed
2911 if (output == NULL) {
2912 output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
2913 }
2914
2915 for (int i = 0; i < n_layer; ++i) {
2916 auto & layer = layers[i];
2917
2918 layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
2919
2920 layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
2921 layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
2922 layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
2923 layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
2924
2925 // optional bias tensors
2926 layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
2927 layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
2928 layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
2929 layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
2930
2931 layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
2932
2933 if (hparams.rope_scaling_type_train == LLAMA_ROPE_SCALING_TYPE_LONGROPE) {
2934 layer.rope_long = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
2935 layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
2936 }
2937 else {
2938 layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
2939 }
2940
2941 if (n_expert == 0) {
2942 layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
2943 layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
2944 layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
2945
2946 // optional MLP bias
2947 layer.ffn_gate_b = create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
2948 layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
2949 layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
2950 } else {
2951 layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
2952 layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, TENSOR_NOT_REQUIRED);
2953 layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert}, 0);
2954 layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
2955
2956 // For Granite MoE Shared
2957 if (hparams.n_ff_shexp > 0) {
2958 layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, hparams.n_ff_shexp}, 0);
2959 layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, hparams.n_ff_shexp}, 0);
2960 layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {hparams.n_ff_shexp, n_embd}, 0);
2961 }
2962 }
2963 }
2964 } break;
2965 case LLM_ARCH_LLADA:
2966 {
2967 tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);
2968
2969 // output
2970 output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
2971 output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED);
2972
2973 // if output is NULL, init from the input tok embed
2974 if (output == NULL) {
2975 output =
2976 create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, TENSOR_DUPLICATED);
2977 }
2978
2979 for (int i = 0; i < n_layer; ++i) {
2980 auto & layer = layers[i];
2981
2982 layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0);
2983
2984 // Use separate Q, K, V projections without bias, matching LLaDALlamaBlock
2985 layer.wq =
2986 create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), { n_embd, n_embd_head_k * n_head }, 0);
2987 layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), { n_embd, n_embd_k_gqa }, 0);
2988 layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), { n_embd, n_embd_v_gqa }, 0);
2989 // No bias for QKV projections as per config: include_bias=false, include_qkv_bias=false
2990 layer.wo =
2991 create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_k * n_head, n_embd }, 0);
2992 layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), { n_embd }, TENSOR_NOT_REQUIRED);
2993
2994 layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), { n_embd }, 0);
2995
2996 layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), { n_rot / 2 },
2997 TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
2998
2999 layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), { n_embd, n_ff }, 0);
3000 layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd }, 0);
3001 layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), { n_embd, n_ff }, 0);
3002
3003 // optional MLP bias
3004 layer.ffn_gate_b =
3005 create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), { n_ff }, TENSOR_NOT_REQUIRED);
3006 layer.ffn_down_b =
3007 create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), { n_embd }, TENSOR_NOT_REQUIRED);
3008 layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), { n_ff }, TENSOR_NOT_REQUIRED);
3009 }
3010 }
3011 break;
3012 case LLM_ARCH_LLADA_MOE:
3013 {
3014 tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
3015
3016 // output
3017 output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
3018 output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
3019
3020 GGML_ASSERT(n_expert > 0 && "n_expert must be > 0 for llada-moe");
3021 GGML_ASSERT(n_expert_used > 0 && "n_expert_used must be > 0 for llada-moe");
3022
3023 for (int i = 0; i < n_layer; ++i) {
3024 auto & layer = layers[i];
3025
3026 layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
3027
3028 layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
3029 layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
3030 layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
3031 layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
3032 layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
3033 layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
3034
3035 layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
3036
3037 layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
3038
3039 const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used;
3040
3041 layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
3042 layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
3043 layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
3044 }
3045 } break;
3046 case LLM_ARCH_LLAMA4:
3047 {
3048 tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
3049
3050 // output
3051 output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
3052 output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
3053
3054 // if output is NULL, init from the input tok embed
3055 if (output == NULL) {
3056 output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
3057 }
3058
3059 for (int i = 0; i < n_layer; ++i) {
3060 bool is_moe_layer = hparams.n_moe_layer_step > 0 && (i + 1) % hparams.n_moe_layer_step == 0;
3061
3062 auto & layer = layers[i];
3063
3064 layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
3065
3066 layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
3067 layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
3068 layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
3069 layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
3070
3071 layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
3072
3073 layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
3074
3075 if (is_moe_layer) {
3076 int n_ff_exp = hparams.n_ff_exp;
3077
3078 layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
3079 layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, 0);
3080 layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff_exp, n_embd, n_expert}, 0);
3081 layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, 0);
3082
3083 // Shared expert
3084 const int64_t n_ff_shexp = n_ff_exp;
3085 layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), { n_embd, n_ff_shexp}, 0);
3086 layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {n_ff_shexp, n_embd }, 0);
3087 layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), { n_embd, n_ff_shexp}, 0);
3088 } else {
3089 layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
3090 layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
3091 layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
3092 }
3093 }
3094 } break;
3095 case LLM_ARCH_DECI:
3096 {
3097 tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
3098
3099 // output
3100 output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
3101 output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
3102
3103 // if output is NULL, init from the input tok embed
3104 if (output == NULL) {
3105 output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
3106 }
3107
3108 for (int i = 0; i < n_layer; ++i) {
3109 auto & layer = layers[i];
3110 const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(i);
3111 const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(i);
3112 const int64_t n_embd_gqa = hparams.n_embd_v_gqa(i);
3113 const int64_t n_ff = hparams.n_ff(i);
3114 const int64_t n_head = hparams.n_head(i);
3115 const int64_t n_head_kv = hparams.n_head_kv(i);
3116
3117 if (n_head_kv == 0 && n_head > 0) {
3118 // linear attention for DeciLMCausalModel
3119 layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
3120 layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
3121 }
3122 else if (n_head_kv > 0) {
3123 layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
3124
3125 layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
3126 layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
3127 layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
3128 layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
3129 }
3130
3131 // optional bias tensors
3132 layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
3133 layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
3134 layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
3135 layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
3136
3137 if (n_ff > 0) {
3138 layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
3139 }
3140
3141 if (hparams.rope_scaling_type_train == LLAMA_ROPE_SCALING_TYPE_LONGROPE) {
3142 layer.rope_long = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
3143 layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
3144 }
3145 else {
3146 layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
3147 }
3148
3149 if (n_ff > 0) {
3150 layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
3151 layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
3152 layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
3153 }
3154
3155 // optional MLP bias
3156 layer.ffn_gate_b = create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
3157 layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
3158 layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
3159 }
3160 } break;
3161 case LLM_ARCH_MINICPM3:
3162 {
3163 const int64_t n_embd_head_qk_rope = hparams.n_rot;
3164 const int64_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot;
3165
3166 const int64_t q_lora_rank = hparams.n_lora_q;
3167 const int64_t kv_lora_rank = hparams.n_lora_kv;
3168 tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
3169
3170 // output
3171 output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
3172 output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
3173
3174 // if output is NULL, init from the input tok embed
3175 if (output == NULL) {
3176 output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
3177 }
3178
3179 for (int i = 0; i < n_layer; ++i) {
3180 auto & layer = layers[i];
3181
3182 layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
3183 layer.attn_q_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_A_NORM, "weight", i), {q_lora_rank}, 0);
3184
3185 layer.attn_kv_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_NORM, "weight", i), {kv_lora_rank}, 0);
3186
3187 layer.wq_a = create_tensor(tn(LLM_TENSOR_ATTN_Q_A, "weight", i), {n_embd, q_lora_rank}, 0);
3188 layer.wq_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_B, "weight", i), {q_lora_rank, n_head * n_embd_head_k}, 0);
3189
3190 layer.wkv_a_mqa = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_MQA, "weight", i), {n_embd, kv_lora_rank + (n_embd_head_qk_rope)}, 0);
3191 layer.wkv_b = create_tensor(tn(LLM_TENSOR_ATTN_KV_B, "weight", i), {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)}, 0);
3192 layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_head * ( n_embd_head_v), n_embd}, 0);
3193
3194 layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
3195
3196 layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
3197 layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
3198 layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
3199
3200 layer.rope_long = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight", i), { n_embd_head_qk_rope/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
3201 layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), { n_embd_head_qk_rope/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
3202 }
3203 } break;
3204 case LLM_ARCH_GROK:
3205 {
3206 if (n_expert == 0) {
3207 throw std::runtime_error("Grok model cannot have zero experts");
3208 }
3209
3210 tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
3211
3212 // output
3213 output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
3214 output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
3215
3216 // if output is NULL, init from the input tok embed
3217 if (output == NULL) {
3218 output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
3219 }
3220
3221 const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff/* / n_expert_used*/; // grok-1 n_ff_exp == n_ff
3222 for (int i = 0; i < n_layer; ++i) {
3223 auto & layer = layers[i];
3224
3225 layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
3226
3227 layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
3228 layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
3229 layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
3230 layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
3231
3232 layer.attn_out_norm = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}, 0);
3233
3234 layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
3235
3236 layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, TENSOR_NOT_REQUIRED);
3237 layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, TENSOR_NOT_REQUIRED);
3238 layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, TENSOR_NOT_REQUIRED);
3239
3240 layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
3241 layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, TENSOR_NOT_REQUIRED);
3242 layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
3243 layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, 0);
3244
3245 layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
3246 if (!layer.ffn_post_norm) {
3247 layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
3248 }
3249 }
3250 } break;
3251 case LLM_ARCH_DBRX:
3252 {
3253 if (n_expert == 0) {
3254 throw std::runtime_error("DBRX model cannot have zero experts");
3255 }
3256
3257 tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
3258
3259 // output
3260 output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
3261 output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
3262
3263 for (int i = 0; i < n_layer; ++i) {
3264 auto & layer = layers[i];
3265
3266 layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
3267
3268 layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
3269 layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
3270
3271 layer.attn_out_norm = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}, 0);
3272
3273 layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
3274 layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
3275 layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff, n_embd, n_expert}, 0);
3276 layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
3277 }
3278 } break;
3279 case LLM_ARCH_BAICHUAN:
3280 {
3281 tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
3282 {
3283 output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
3284 output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
3285 }
3286
3287 for (int i = 0; i < n_layer; ++i) {
3288 auto & layer = layers[i];
3289
3290 layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
3291
3292 layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
3293 layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
3294 layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
3295 layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
3296
3297 layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
3298
3299 layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
3300 layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
3301 layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
3302 }
3303 } break;
3304 case LLM_ARCH_FALCON:
3305 {
3306 tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
3307
3308 // output
3309 {
3310 output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
3311 output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
3312
3313 output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
3314 if (!output) {
3315 output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); // needs to be on GPU
3316 }
3317 }
3318
3319 for (int i = 0; i < n_layer; ++i) {
3320 auto & layer = layers[i];
3321
3322 layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
3323 layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
3324
3325 layer.attn_norm_2 = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
3326 layer.attn_norm_2_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
3327
3328 layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
3329 layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
3330
3331 layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
3332 layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
3333 }
3334 } break;
3335 case LLM_ARCH_STARCODER:
3336 {
3337 tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
3338 pos_embd = create_tensor(tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, n_ctx_train}, 0);
3339
3340 // output
3341 {
3342 output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
3343 output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
3344 output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
3345 if (!output) {
3346 // needs to be on GPU
3347 output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
3348 }
3349
3350 }
3351
3352 for (int i = 0; i < n_layer; ++i) {
3353 auto & layer = layers[i];
3354
3355 layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
3356 layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
3357
3358 layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
3359 layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);
3360
3361 layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
3362 layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
3363
3364 layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
3365 layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
3366
3367 layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
3368 layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
3369
3370 layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
3371 layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, 0);
3372 }
3373 } break;
case LLM_ARCH_BERT:
case LLM_ARCH_NOMIC_BERT:
case LLM_ARCH_NOMIC_BERT_MOE:
case LLM_ARCH_JINA_BERT_V3:
    {
        // BERT-family bidirectional encoders: word embeddings plus optional
        // token-type (segment) embeddings.
        tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
        type_embd = create_tensor(tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_token_types}, TENSOR_NOT_REQUIRED);

        if (arch == LLM_ARCH_BERT) {
            // original BERT uses learned absolute position embeddings
            pos_embd = create_tensor(tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, n_ctx_train}, 0);

            // optional pooler / classification heads
            cls = create_tensor(tn(LLM_TENSOR_CLS, "weight"), {n_embd, n_embd}, TENSOR_NOT_REQUIRED);
            cls_b = create_tensor(tn(LLM_TENSOR_CLS, "bias"), {n_embd}, TENSOR_NOT_REQUIRED);

            cls_out = create_tensor(tn(LLM_TENSOR_CLS_OUT, "weight"), {n_embd, hparams.n_cls_out}, TENSOR_NOT_REQUIRED);
            cls_out_b = create_tensor(tn(LLM_TENSOR_CLS_OUT, "bias"), {hparams.n_cls_out}, TENSOR_NOT_REQUIRED);
        }

        // LayerNorm applied to the embeddings
        tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, 0);
        tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd}, 0);

        for (int i = 0; i < n_layer; ++i) {
            auto & layer = layers[i];

            // attention weights come either as one fused QKV tensor or as
            // split Q/K/V tensors; if the fused form is absent, the split
            // tensors become mandatory
            layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
            layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);

            if (!layer.wqkv) {
                layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
                layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, 0);

                layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
                layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, 0);

                layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
                layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, 0);
            }

            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
            layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);

            layer.attn_out_norm = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}, 0);
            layer.attn_out_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "bias", i), {n_embd}, 0);

            if (hparams.moe_every_n_layers > 0 && i % hparams.moe_every_n_layers == 1) {
                // MoE FFN on every moe_every_n_layers-th layer (offset 1);
                // expert tensors are 3D with n_expert as the last dimension
                layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff, n_expert}, 0);
                layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert}, 0);
                layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
            } else {
                // dense FFN; biases are optional
                layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
                layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
                layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
                layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);

                if (arch == LLM_ARCH_NOMIC_BERT) {
                    // Nomic BERT adds a gate projection (gated FFN)
                    layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
                }
            }

            layer.layer_out_norm = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd}, 0);
            layer.layer_out_norm_b = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "bias", i), {n_embd}, 0);
        }
    } break;
case LLM_ARCH_MODERN_BERT:
    {
        // ModernBERT: norm-only embeddings (no position/type embeddings here)
        tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
        tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, 0);

        output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);

        for (int i = 0; i < n_layer; ++i) {
            auto & layer = layers[i];

            if ( i != 0 ) {
                layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
            } else {
                // layer 0 uses identity, so its attention norm is optional
                layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
            }

            // fused QKV projection (n_embd -> 3*n_embd, no GQA split here)
            layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, 3 * n_embd }, 0);
            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);

            // ffn_up is 2*n_ff wide — presumably a fused gate+up (GLU-style)
            // projection; confirm against the graph builder for this arch
            layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, 2 * n_ff}, 0);
            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
            layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
        }

        // optional pooler / classification heads
        cls = create_tensor(tn(LLM_TENSOR_CLS, "weight"), {n_embd, n_embd}, TENSOR_NOT_REQUIRED);
        cls_out = create_tensor(tn(LLM_TENSOR_CLS_OUT, "weight"), {n_embd, hparams.n_cls_out}, TENSOR_NOT_REQUIRED);
        cls_out_b = create_tensor(tn(LLM_TENSOR_CLS_OUT, "bias"), {hparams.n_cls_out}, TENSOR_NOT_REQUIRED);

    } break;
case LLM_ARCH_NEO_BERT:
    {
        tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);

        // optional pooler / classification heads
        cls = create_tensor(tn(LLM_TENSOR_CLS, "weight"), {n_embd, n_embd}, TENSOR_NOT_REQUIRED);
        cls_b = create_tensor(tn(LLM_TENSOR_CLS, "bias"), {n_embd}, TENSOR_NOT_REQUIRED);

        cls_out = create_tensor(tn(LLM_TENSOR_CLS_OUT, "weight"), {n_embd, hparams.n_cls_out}, TENSOR_NOT_REQUIRED);
        cls_out_b = create_tensor(tn(LLM_TENSOR_CLS_OUT, "bias"), {hparams.n_cls_out}, TENSOR_NOT_REQUIRED);

        // final encoder output norm
        output_norm_enc = create_tensor(tn(LLM_TENSOR_ENC_OUTPUT_NORM, "weight"), {n_embd}, 0);

        for (int i = 0; i < n_layer; ++i) {
            auto & layer = layers[i];

            layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);

            // fused QKV only — no split Q/K/V fallback for this arch
            layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);

            layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);

            // ffn_up is n_ff*2 wide — presumably fused gate+up; confirm in
            // the graph builder for this arch
            layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff*2}, 0);
            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
        }
    } break;
case LLM_ARCH_JINA_BERT_V2:
    {
        tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); // word_embeddings
        type_embd = create_tensor(tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_token_types}, 0); // token_type_embeddings

        tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, 0); // LayerNorm
        tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd}, 0); // LayerNorm bias

        // optional scalar classification head
        cls = create_tensor(tn(LLM_TENSOR_CLS, "weight"), {n_embd, 1}, TENSOR_NOT_REQUIRED);
        cls_b = create_tensor(tn(LLM_TENSOR_CLS, "bias"), {1}, TENSOR_NOT_REQUIRED);
        for (int i = 0; i < n_layer; ++i) {
            auto & layer = layers[i]; // JinaBertLayer

            // split Q/K/V with mandatory biases; Q/K norms are optional
            layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
            layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, 0);

            layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
            layer.attn_q_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);

            layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
            layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, 0);

            layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
            layer.attn_k_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);

            layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
            layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, 0);

            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0); // output_dense
            layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0); // output_dense bias

            layer.attn_out_norm = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}, 0); // output_norm
            layer.attn_out_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "bias", i), {n_embd}, 0);

            layer.attn_norm_2 = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
            layer.attn_norm_2_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);

            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, TENSOR_NOT_REQUIRED);

            // the ffn_up width is taken from the file's tensor metadata:
            // some checkpoints store a plain up projection (n_ff), others a
            // fused tensor of width 2*n_ff — anything else is rejected
            const auto tn_ffn_up_weight = tn(LLM_TENSOR_FFN_UP, "weight", i);
            ggml_tensor * t_ffn_up = ml.get_tensor_meta(tn_ffn_up_weight.str().c_str());
            const int64_t n_ffn_up = t_ffn_up ? t_ffn_up->ne[1] : n_ff;

            GGML_ASSERT(n_ffn_up == n_ff || n_ffn_up == n_ff * 2);
            layer.ffn_up = create_tensor(tn_ffn_up_weight, {n_embd, n_ffn_up}, 0);
            layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ffn_up}, TENSOR_NOT_REQUIRED);

            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
            layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);

            layer.layer_out_norm = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd}, 0);
            layer.layer_out_norm_b = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "bias", i), {n_embd}, 0);
        }
    } break;
case LLM_ARCH_BLOOM:
    {
        // BLOOM normalizes the embeddings before the first block
        tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
        tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, 0);
        tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd}, 0);

        // output
        output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
        output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
        output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);

        // if output is NULL, init from the input tok embed (tied embeddings)
        if (output == NULL) {
            output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
        }

        for (int i = 0; i < n_layer; ++i) {
            auto & layer = layers[i];

            layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
            layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);

            // fused QKV with mandatory bias
            layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
            layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);

            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
            layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);

            layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
            layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);

            // non-gated FFN, all projections carry biases
            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
            layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);

            layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
            layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, 0);
        }
    } break;
case LLM_ARCH_MPT:
    {
        tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
        // learned position embeddings are optional for MPT
        pos_embd = create_tensor(tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, n_ctx_train}, TENSOR_NOT_REQUIRED);

        // output
        output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
        output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, TENSOR_NOT_REQUIRED);

        output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
        if (!output) {
            output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); // needs to be on GPU
        }

        for (int i = 0; i < n_layer; ++i) {
            auto & layer = layers[i];

            // all biases are optional — absent in checkpoints without them
            layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
            layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);

            layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
            layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);

            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
            layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);

            layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
            layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);

            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
            layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);

            layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
            layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);

            // optional Q/K layernorms
            layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
            layer.attn_q_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);

            layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
            layer.attn_k_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);

            // AWQ ScaleActivation layer (per-channel activation scales)
            layer.ffn_act = create_tensor(tn(LLM_TENSOR_FFN_ACT, "scales", i), {n_ff}, TENSOR_NOT_REQUIRED);
        }
    } break;
case LLM_ARCH_STABLELM:
    {
        tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);

        // output norm (with bias) and untied output head, all required
        output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
        output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
        output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);

        for (int i = 0; i < n_layer; ++i) {
            auto & layer = layers[i];

            layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
            layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);

            layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
            layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
            layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);

            // optional bias tensors, present in Stable LM 2 1.6B
            layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
            layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
            layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);

            // optional q and k layernorms, present in StableLM 2 12B
            // (shaped per attention head: {head_dim, n_heads})
            layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k, n_head}, TENSOR_NOT_REQUIRED);
            layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k, n_head_kv}, TENSOR_NOT_REQUIRED);

            // optional FFN norm, not present in StableLM 2 12B which uses parallel residual
            layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
            layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);

            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
            layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
        }
    } break;
case LLM_ARCH_QWEN:
    {
        tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);

        // output
        output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
        output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);

        for (int i = 0; i < n_layer; ++i) {
            auto & layer = layers[i];

            layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);

            // fused QKV with mandatory bias (no GQA: Q, K, V all n_embd wide)
            layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd*3}, 0);
            layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd*3}, 0);
            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);

            layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);

            // NOTE(review): FFN tensors use n_ff/2 — the stored n_ff for this
            // arch is presumably twice the actual FFN width; confirm against
            // the Qwen conversion script before changing
            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff/2}, 0);
            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff/2, n_embd}, 0);
            layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff/2}, 0);
        }
    } break;
case LLM_ARCH_QWEN2:
case LLM_ARCH_QWEN2VL:
case LLM_ARCH_DREAM:
    {
        tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);

        // output head and its bias are both optional
        output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
        output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
        output_b = create_tensor(tn(LLM_TENSOR_OUTPUT, "bias"), {n_vocab}, TENSOR_NOT_REQUIRED);
        // if output is NULL, init from the input tok embed (tied embeddings)
        if (output == NULL) {
            output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
        }

        for (int i = 0; i < n_layer; ++i) {
            auto & layer = layers[i];

            layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);

            layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
            layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
            layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);

            // optional bias tensors
            layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
            layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
            layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);

            layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);

            // gated (SwiGLU-style) dense FFN
            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
            layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
        }
    } break;
case LLM_ARCH_QWEN2MOE:
    {
        tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);

        // output
        output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
        output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);

        for (int i = 0; i < n_layer; ++i) {
            auto & layer = layers[i];

            layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);

            layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
            layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
            layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);

            // optional bias tensors
            layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
            layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
            layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);

            layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);

            // expert router
            layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);

            // guard against division by zero in the n_ff_exp fallback below
            if (n_expert == 0) {
                throw std::runtime_error("n_expert must be > 0 for QWEN2MOE");
            }
            if (n_expert_used == 0) {
                throw std::runtime_error("n_expert_used must be > 0 for QWEN2MOE");
            }

            // MoE branch: per-expert FFN width, defaulting to n_ff split
            // evenly across the active experts
            const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used;

            layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
            layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
            layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);

            // Shared expert branch (always active), with its own scalar gate
            const int64_t n_ff_shexp = hparams.n_ff_shexp ? hparams.n_ff_shexp : n_ff;

            layer.ffn_gate_inp_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP_SHEXP, "weight", i), {n_embd}, 0);
            layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), { n_embd, n_ff_shexp}, 0);
            layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {n_ff_shexp, n_embd}, 0);
            layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), { n_embd, n_ff_shexp}, 0);
        }
    } break;
case LLM_ARCH_QWEN3:
case LLM_ARCH_QWEN3VL:
    {
        tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);

        // output
        output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
        output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
        // if output is NULL, init from the input tok embed (tied embeddings)
        if (output == NULL) {
            output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
        }

        // output rerank head (optional, used by reranker checkpoints)
        cls_out = create_tensor(tn(LLM_TENSOR_CLS_OUT, "weight"), {n_embd, hparams.n_cls_out}, TENSOR_NOT_REQUIRED);

        for (int i = 0; i < n_layer; ++i) {
            auto & layer = layers[i];

            layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);

            // Q/O may differ from n_embd: shaped by head_dim * n_head
            layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
            layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
            layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);

            // mandatory per-head Q/K norms (one vector of head_dim each)
            layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
            layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);

            layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
            layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
        }
    } break;
case LLM_ARCH_QWEN3MOE:
case LLM_ARCH_QWEN3VLMOE:
case LLM_ARCH_RND1:
    {
        tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);

        // output
        output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
        output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
        // if output is NULL, init from the input tok embed (tied embeddings)
        if (output == NULL) {
            output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
        }

        for (int i = 0; i < n_layer; ++i) {
            auto & layer = layers[i];

            layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);

            // Q/O shaped by head_dim * n_head (may differ from n_embd)
            layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
            layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
            layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);

            // mandatory per-head Q/K norms
            layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
            layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);

            layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);

            // expert router
            layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);

            // guard against division by zero in the n_ff_exp fallback below
            // (the messages name QWEN3MOE but this case also covers
            // QWEN3VLMOE and RND1)
            if (n_expert == 0) {
                throw std::runtime_error("n_expert must be > 0 for QWEN3MOE");
            }
            if (n_expert_used == 0) {
                throw std::runtime_error("n_expert_used must be > 0 for QWEN3MOE");
            }

            // MoE branch: per-expert FFN width, defaulting to n_ff split
            // evenly across the active experts
            const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used;

            layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
            layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
            layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
        }
    } break;
case LLM_ARCH_PHI2:
    {
        tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);

        // output: norm and head both carry mandatory biases
        output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
        output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
        output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
        output_b = create_tensor(tn(LLM_TENSOR_OUTPUT, "bias"), {n_vocab}, 0);

        for (int i = 0; i < n_layer; ++i) {
            auto & layer = layers[i];

            layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
            layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);

            // fused QKV is optional; when absent, split Q/K/V (with biases)
            // become mandatory
            layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
            layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);

            if (layer.wqkv == nullptr) {
                layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
                layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, 0);

                layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
                layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, 0);

                layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
                layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, 0);
            }

            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
            layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);

            // non-gated FFN with mandatory biases
            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
            layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);

            layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
            layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, 0);
        }
    } break;
case LLM_ARCH_PHI3:
    {
        tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);

        // output
        output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
        output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);

        // if output is NULL, init from the input tok embed (tied embeddings)
        if (output == NULL) {
            output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
        }

        for (int i = 0; i < n_layer; ++i) {
            auto & layer = layers[i];

            layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0);

            // fused QKV is optional here (no split fallback in this case —
            // presumably handled elsewhere; verify before relying on it)
            layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), { n_embd, n_embd + 2 * n_embd_gqa }, TENSOR_NOT_REQUIRED);
            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd, n_embd }, 0);

            layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), { n_embd }, 0);

            // ffn_up is 2*n_ff wide — presumably fused gate+up; confirm in
            // the graph builder for this arch
            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd }, 0);
            layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), { n_embd, 2 * n_ff }, 0);

            // optional LongRoPE scaling factors; layers after the first
            // reference the same tensor (TENSOR_DUPLICATED)
            layer.rope_long = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight", i), { n_rot/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
            layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), { n_rot/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
        }
    } break;
case LLM_ARCH_PHIMOE:
    {
        // per-head embedding size, used below for the rope factor tensors
        const int64_t n_embd_head = n_embd / n_head;

        tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);

        // output: norm and head both carry mandatory biases
        output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
        output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
        output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), { n_embd, n_vocab }, 0);
        output_b = create_tensor(tn(LLM_TENSOR_OUTPUT, "bias"), { n_vocab }, 0);

        for (int i = 0; i < n_layer; ++i) {
            auto & layer = layers[i];

            layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0);
            layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), { n_embd }, 0);

            // fused QKV is optional; fall back to split Q/K/V with biases
            layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), { n_embd, n_embd + 2 * n_embd_gqa }, TENSOR_NOT_REQUIRED);
            if (layer.wqkv == nullptr) {
                layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
                layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, 0);

                layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
                layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, 0);

                layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
                layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, 0);
            }
            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd, n_embd }, 0);
            layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), { n_embd }, 0);

            layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), { n_embd }, 0);
            layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), { n_embd }, 0);

            // MoE FFN: router plus 3D per-expert tensors
            layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
            layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
            layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff, n_embd, n_expert}, 0);
            layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);

            // optional LongRoPE scaling factors; layers after the first
            // reference the same tensor (TENSOR_DUPLICATED)
            layer.rope_long = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight", i), { n_embd_head/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
            layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), { n_embd_head/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
        }
    } break;
            case LLM_ARCH_PLAMO:
                {
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);

                    // output: final norm + vocab projection, both required (flag 0)
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
                    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);

                    for (int i = 0; i < n_layer; ++i) {
                        auto & layer = layers[i];

                        // pre-attention norm (weight only, no bias tensor)
                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);

                        // separate Q/K/V projections; K/V use the (possibly narrower) GQA width
                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);

                        // gated FFN; note: no per-layer ffn_norm tensor is loaded for this arch
                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
                        layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
                    }
                } break;
            case LLM_ARCH_PLAMO2:
                {
                    // mamba parameters
                    const uint32_t d_conv = hparams.ssm_d_conv;
                    const uint32_t d_state = hparams.ssm_d_state;
                    const uint32_t num_heads = hparams.ssm_dt_rank;
                    const uint32_t intermediate_size = hparams.ssm_d_inner;
                    // dt projection width: n_embd/16, clamped to a minimum of 64
                    const int64_t dt_dim = std::max(64, int(hparams.n_embd / 16));

                    // attention parameters
                    const uint32_t qk_dim = hparams.n_embd_head_k;
                    const uint32_t v_dim = hparams.n_embd_head_v;

                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);

                    // output
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
                    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
                    // if output is NULL, init from the input tok embed (tied embeddings);
                    // TENSOR_DUPLICATED allows the copy to be offloaded independently
                    if (output == NULL) {
                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
                    }

                    for (int i = 0; i < n_layer; ++i) {
                        auto & layer = layers[i];
                        // hybrid model: each layer is either a recurrent (mamba) layer or an attention layer
                        bool is_mamba_layer = hparams.is_recurrent(i);

                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);

                        if (is_mamba_layer) {
                            // input projection is 2x wide (x and z branches)
                            layer.ssm_in = create_tensor(tn(LLM_TENSOR_SSM_IN, "weight", i), {n_embd, 2 * intermediate_size}, 0);
                            layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), {d_conv, intermediate_size}, 0);

                            // x projection produces dt plus the B and C state inputs
                            layer.ssm_x = create_tensor(tn(LLM_TENSOR_SSM_X, "weight", i), {intermediate_size, dt_dim + 2*d_state}, 0);
                            layer.ssm_dt = create_tensor(tn(LLM_TENSOR_SSM_DT, "weight", i), {dt_dim, num_heads}, 0);
                            layer.ssm_dt_b = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), {num_heads}, 0);

                            // A and D have no "weight" suffix in the tensor name
                            layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), {num_heads}, 0);
                            layer.ssm_d = create_tensor(tn(LLM_TENSOR_SSM_D, i), {num_heads}, 0);

                            layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), {intermediate_size, n_embd}, 0);

                            // norms applied to the dt/B/C streams
                            layer.ssm_dt_norm = create_tensor(tn(LLM_TENSOR_SSM_DT_NORM, i), {dt_dim}, 0);
                            layer.ssm_b_norm = create_tensor(tn(LLM_TENSOR_SSM_B_NORM, i), {d_state}, 0);
                            layer.ssm_c_norm = create_tensor(tn(LLM_TENSOR_SSM_C_NORM, i), {d_state}, 0);
                        } else {
                            // attention layer: head counts may vary per layer
                            const int64_t num_attention_heads = hparams.n_head(i);
                            const int64_t q_num_heads = num_attention_heads;
                            const int64_t num_key_value_heads = hparams.n_head_kv(i);
                            const int64_t k_num_heads = num_key_value_heads;
                            const int64_t v_num_heads = num_key_value_heads;
                            const int64_t q_proj_dim = q_num_heads * qk_dim;
                            const int64_t k_proj_dim = k_num_heads * qk_dim;
                            const int64_t v_proj_dim = v_num_heads * v_dim;

                            // fused QKV projection; per-head Q/K RMS norms
                            layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, q_proj_dim + k_proj_dim + v_proj_dim}, 0);
                            layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {qk_dim, num_attention_heads}, 0);
                            layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {qk_dim, k_num_heads}, 0);
                            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {q_num_heads * v_dim, n_embd}, 0);
                        }

                        // All layers have post-attention norm, FFN norm, and FFN tensors
                        layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, i), {n_embd}, 0);
                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
                        // NOTE(review): ffn_up is 2*n_ff wide — looks like a fused up+gate projection; confirm against graph build
                        layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff * 2}, 0);
                        layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, i), {n_embd}, 0);
                    }
                } break;
            case LLM_ARCH_PLAMO3:
                {
                    const int64_t head_dim_q = hparams.n_embd_head_k;
                    const int64_t head_dim_v = hparams.n_embd_head_v;

                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);

                    // output head is optional; fall back to tied token embeddings
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
                    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
                    if (output == NULL) {
                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
                    }

                    for (int i = 0; i < n_layer; ++i) {
                        auto & layer = layers[i];

                        // head counts and FFN width may vary per layer
                        const int64_t num_attention_heads = hparams.n_head(i);
                        const int64_t num_key_value_heads = hparams.n_head_kv(i);
                        const int64_t q_proj_dim = num_attention_heads * head_dim_q;
                        const int64_t k_proj_dim = num_key_value_heads * head_dim_q;
                        const int64_t v_proj_dim = num_key_value_heads * head_dim_v;
                        const int64_t n_ff_cur = hparams.n_ff(i);

                        // fused QKV projection + per-head-dim Q/K norms, with pre and post attention norms
                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
                        layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i),
                                {n_embd,q_proj_dim + k_proj_dim + v_proj_dim}, 0);
                        layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {head_dim_q}, 0);
                        layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {head_dim_q}, 0);
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {num_attention_heads * head_dim_v, n_embd}, 0);
                        layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, i), {n_embd}, 0);

                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
                        layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, i), {n_embd}, 0);

                        // NOTE(review): ffn_up is 2*n_ff_cur wide — looks like a fused up+gate projection; confirm against graph build
                        layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff_cur * 2}, 0);
                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff_cur, n_embd}, 0);
                    }
                } break;
            case LLM_ARCH_GPT2:
                {
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
                    // learned absolute position embeddings, sized by the training context length
                    pos_embd = create_tensor(tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, n_ctx_train}, 0);

                    // output
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
                    output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
                    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);

                    // if output is NULL, init from the input tok embed (tied embeddings)
                    if (output == NULL) {
                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
                    }

                    for (int i = 0; i < n_layer; ++i) {
                        auto & layer = layers[i];

                        // LayerNorm with bias throughout (GPT-2 style)
                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
                        layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);

                        // fused QKV projection: Q is n_embd wide, K and V each n_embd_gqa
                        layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
                        layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);

                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
                        layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);

                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
                        layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);

                        // non-gated MLP (up/down only), with biases
                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
                        layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);

                        layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
                        layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, 0);
                    }
                } break;
            case LLM_ARCH_CODESHELL:
                {
                    // token embeddings are optional here; the fallback goes the opposite
                    // direction from most archs: tok_embd is initialized from output
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);

                    // if tok embd is NULL, init from output
                    if (tok_embd == NULL) {
                        tok_embd = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
                    }

                    // output (norm with bias; projection required)
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
                    output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
                    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);

                    for (int i = 0; i < n_layer; ++i) {
                        auto & layer = layers[i];

                        // GPT-2-style layer: LayerNorm with bias, fused QKV, non-gated MLP
                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
                        layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);

                        // fused QKV: Q is n_embd wide, K and V each n_embd_gqa
                        layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
                        layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);

                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
                        layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);

                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
                        layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);

                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
                        layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);

                        layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
                        layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, 0);
                    }
                } break;
            case LLM_ARCH_ORION:
                {
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);

                    // output: norm carries a bias; projection is required (no tied-embedding fallback)
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
                    output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
                    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);

                    for (int i = 0; i < n_layer; ++i) {
                        auto & layer = layers[i];

                        // norms have biases; attention/FFN projections do not
                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
                        layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);

                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);

                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
                        layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);

                        // gated FFN (SwiGLU-style gate/down/up tensor layout)
                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
                        layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
                    }
                } break;
            case LLM_ARCH_INTERNLM2:
                {
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);

                    // output: norm + required vocab projection
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
                    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);

                    for (int i = 0; i < n_layer; ++i) {
                        auto & layer = layers[i];

                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
                        // fused QKV is not used for this arch; Q/K/V are loaded separately
                        // layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);

                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
                        // gated FFN, no biases
                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
                        layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
                    }
                } break;
            case LLM_ARCH_GEMMA:
                {
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);

                    // output: embeddings are always tied for this arch
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
                    output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); // same as tok_embd, duplicated to allow offloading

                    for (int i = 0; i < n_layer; ++i) {
                        auto & layer = layers[i];

                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);

                        // Q/O use head_dim * n_head (may differ from n_embd); K/V use the GQA widths
                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);

                        // gated FFN, no biases
                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
                        layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
                    }
                } break;
            case LLM_ARCH_GEMMA2:
                {
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);

                    // output: embeddings are always tied for this arch
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
                    output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); // same as tok_embd, duplicated to allow offloading

                    for (int i = 0; i < n_layer; ++i) {
                        auto & layer = layers[i];

                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);

                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
                        // unlike GEMMA, this arch also has post-attention and post-FFN norms
                        layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);

                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
                        layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
                        layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
                    }
                } break;
            case LLM_ARCH_GEMMA3:
            case LLM_ARCH_GEMMA_EMBEDDING:
                {
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);

                    // output
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
                    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);

                    // if output is NULL, init from the input tok embed (tied embeddings)
                    if (output == NULL) {
                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
                    }

                    // Dense linear weights — optional; presumably only present for the
                    // embedding variant (TODO confirm), hence TENSOR_NOT_REQUIRED
                    dense_2_out_layers = create_tensor(tn(LLM_TENSOR_DENSE_2_OUT, "weight"), {n_embd, hparams.dense_2_feat_out}, TENSOR_NOT_REQUIRED);
                    dense_3_out_layers = create_tensor(tn(LLM_TENSOR_DENSE_3_OUT, "weight"), {hparams.dense_3_feat_in, n_embd}, TENSOR_NOT_REQUIRED);

                    for (int i = 0; i < n_layer; ++i) {
                        auto & layer = layers[i];

                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);

                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);

                        // post-attention norm plus per-head-dim Q/K norms
                        layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
                        layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
                        layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);

                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
                        layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
                        layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
                    }
                } break;
            case LLM_ARCH_GEMMA3N:
                {
                    const int64_t n_altup      = hparams.n_altup;
                    const int64_t laurel_rank  = hparams.laurel_rank;
                    const int64_t n_embd_altup = hparams.n_embd_altup;

                    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
                    // if output is NULL, init from the input tok embed (tied embeddings)
                    if (output == NULL) {
                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
                    }

                    tok_embd           = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
                    // extra per-layer token embeddings, flattened across layers into one tensor
                    tok_embd_per_layer = create_tensor(tn(LLM_TENSOR_PER_LAYER_TOKEN_EMBD, "weight"), {n_embd_altup * n_layer, n_vocab}, 0);

                    // altup projections: one n_embd x n_embd matrix per extra altup stream (n_altup - 1)
                    altup_proj           = create_tensor(tn(LLM_TENSOR_ALTUP_PROJ, "weight"), {n_embd, n_embd, n_altup - 1}, 0);
                    altup_unembd_proj    = create_tensor(tn(LLM_TENSOR_ALTUP_UNEMBD_PROJ, "weight"), {n_embd, n_embd, n_altup - 1}, 0);
                    per_layer_model_proj = create_tensor(tn(LLM_TENSOR_PER_LAYER_MODEL_PROJ, "weight"), {n_embd, n_embd_altup * n_layer}, 0);
                    per_layer_proj_norm  = create_tensor(tn(LLM_TENSOR_PER_LAYER_PROJ_NORM, "weight"), {n_embd_altup}, 0);

                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);

                    for (int i = 0; i < n_layer; ++i) {
                        auto & layer = layers[i];

                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);

                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);

                        // per-head-dim Q/K norms + post-attention norm
                        layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
                        layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
                        layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);

                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
                        layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
                        layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);

                        // altup & laurel
                        layer.per_layer_inp_gate   = create_tensor(tn(LLM_TENSOR_PER_LAYER_INP_GATE, "weight", i), {n_embd, n_embd_altup}, 0);
                        layer.per_layer_proj       = create_tensor(tn(LLM_TENSOR_PER_LAYER_PROJ, "weight", i), {n_embd_altup, n_embd}, 0);
                        layer.per_layer_post_norm  = create_tensor(tn(LLM_TENSOR_PER_LAYER_POST_NORM, "weight", i), {n_embd}, 0);
                        layer.altup_correct_coef   = create_tensor(tn(LLM_TENSOR_ALTUP_CORRECT_COEF, "weight", i), {n_altup, n_altup}, 0);
                        layer.altup_correct_scale  = create_tensor(tn(LLM_TENSOR_ALTUP_CORRECT_SCALE, "weight", i), {n_embd}, 0);
                        layer.altup_predict_coef   = create_tensor(tn(LLM_TENSOR_ALTUP_PREDICT_COEF, "weight", i), {n_altup, n_altup * n_altup}, 0);
                        layer.altup_router         = create_tensor(tn(LLM_TENSOR_ALTUP_ROUTER, "weight", i), {n_embd, n_altup}, 0);
                        layer.altup_router_norm    = create_tensor(tn(LLM_TENSOR_ALTUP_ROUTER_NORM, "weight", i), {n_embd}, 0);
                        // laurel: low-rank (laurel_rank) bottleneck pair L then R, followed by a norm
                        layer.laurel_l             = create_tensor(tn(LLM_TENSOR_LAUREL_L, "weight", i), {n_embd, laurel_rank}, 0);
                        layer.laurel_r             = create_tensor(tn(LLM_TENSOR_LAUREL_R, "weight", i), {laurel_rank, n_embd}, 0);
                        layer.laurel_post_norm     = create_tensor(tn(LLM_TENSOR_LAUREL_POST_NORM, "weight", i), {n_embd}, 0);
                    }
                } break;
            case LLM_ARCH_STARCODER2:
                {
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);

                    // output
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
                    output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);

                    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
                    // if output is NULL, init from the input tok embed (tied embeddings)
                    if (output == NULL) {
                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
                    }

                    for (int i = 0; i < n_layer; ++i) {
                        auto & layer = layers[i];

                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
                        layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);

                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);

                        // attention bias tensors — required here (flag 0), despite being
                        // optional in some other architectures
                        layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, 0);
                        layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, 0);
                        layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, 0);
                        layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);

                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
                        layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);

                        // non-gated MLP (up/down only)
                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
                        layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);

                        // FFN bias tensors — also required (flag 0)
                        layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
                        layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP , "bias", i), { n_ff}, 0);
                    }
                } break;
            case LLM_ARCH_MAMBA:
                {
                    const int64_t d_conv  = hparams.ssm_d_conv;
                    const int64_t d_inner = hparams.ssm_d_inner;
                    const int64_t d_state = hparams.ssm_d_state;
                    const int64_t dt_rank = hparams.ssm_dt_rank;

                    // only an expansion factor of 2 is supported for now
                    if (2 * n_embd != d_inner) {
                        throw std::runtime_error("only an expansion factor of 2 is supported for now");
                    }

                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);

                    // output
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);

                    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
                    // if output is NULL, init from the input tok embed, duplicated to allow offloading
                    if (output == NULL) {
                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
                    }

                    for (int i = 0; i < n_layer; ++i) {
                        auto & layer = layers[i];

                        // norm
                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);

                        // input projection is 2x wide: x and z (gate) branches
                        layer.ssm_in = create_tensor(tn(LLM_TENSOR_SSM_IN, "weight", i), {n_embd, 2*d_inner}, 0);

                        // depthwise short convolution over the sequence
                        layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), {d_conv, d_inner}, 0);
                        layer.ssm_conv1d_b = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "bias", i), {d_inner}, 0);

                        // x projection produces dt (rank dt_rank) plus B and C (each d_state)
                        layer.ssm_x = create_tensor(tn(LLM_TENSOR_SSM_X, "weight", i), {d_inner, dt_rank + 2*d_state}, 0);

                        layer.ssm_dt = create_tensor(tn(LLM_TENSOR_SSM_DT, "weight", i), {dt_rank, d_inner}, 0);
                        layer.ssm_dt_b = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), {d_inner}, 0);

                        // no "weight" suffix for these
                        layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), {d_state, d_inner}, 0);
                        layer.ssm_d = create_tensor(tn(LLM_TENSOR_SSM_D, i), {d_inner}, 0);

                        // out_proj
                        layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), {d_inner, n_embd}, 0);
                    }
                } break;
            case LLM_ARCH_MAMBA2:
                {
                    const int64_t d_conv  = hparams.ssm_d_conv;
                    const int64_t d_inner = hparams.ssm_d_inner;
                    const int64_t d_state = hparams.ssm_d_state;
                    const int64_t n_head  = hparams.ssm_dt_rank;
                    const int64_t n_group = hparams.ssm_n_group;
                    // in_proj packs: x and z (2*d_inner), grouped B and C (2*n_group*d_state), and dt (n_head)
                    const int64_t d_in_proj = 2*d_inner + 2*n_group*d_state + n_head;

                    // only an expansion factor of 2 is supported for now
                    GGML_ASSERT(2 * n_embd == d_inner);

                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);

                    // output
                    {
                        output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);

                        output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
                        // if output is NULL, init from the input tok embed, duplicated to allow offloading
                        if (output == NULL) {
                            output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
                        }
                    }

                    for (int i = 0; i < n_layer; ++i) {
                        auto & layer = layers[i];

                        // norm
                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);

                        layer.ssm_in = create_tensor(tn(LLM_TENSOR_SSM_IN, "weight", i), {n_embd, d_in_proj}, 0);

                        // convolution also covers the grouped B/C channels, unlike MAMBA (v1)
                        layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), {d_conv, d_inner + 2*n_group*d_state}, 0);
                        layer.ssm_conv1d_b = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "bias", i), {d_inner + 2*n_group*d_state}, 0);

                        // only a dt bias here — no dt weight/x-projection tensors as in MAMBA (v1)
                        layer.ssm_dt_b = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), {n_head}, 0);

                        // no "weight" suffix for these; one scalar per SSM head
                        layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), {1, n_head}, 0);
                        layer.ssm_d = create_tensor(tn(LLM_TENSOR_SSM_D, i), {1, n_head}, 0);

                        // norm over the inner state, split into n_group groups
                        layer.ssm_norm = create_tensor(tn(LLM_TENSOR_SSM_NORM, "weight", i), {d_inner / n_group, n_group}, 0);

                        // out_proj
                        layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), {d_inner, n_embd}, 0);
                    }
                } break;
            case LLM_ARCH_JAMBA:
                {
                    const int64_t d_conv  = hparams.ssm_d_conv;
                    const int64_t d_inner = hparams.ssm_d_inner;
                    const int64_t d_state = hparams.ssm_d_state;
                    const int64_t dt_rank = hparams.ssm_dt_rank;

                    // only an expansion factor of 2 is supported for now
                    GGML_ASSERT(2 * n_embd == d_inner);

                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);

                    // output
                    {
                        output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);

                        output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
                        // if output is NULL, init from the input tok embed, duplicated to allow offloading
                        if (output == NULL) {
                            output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
                        }
                    }

                    for (int i = 0; i < n_layer; ++i) {
                        // per-layer values; these shadow the function-level n_embd_gqa on purpose
                        const int64_t n_head_kv = hparams.n_head_kv(i);
                        const int64_t n_embd_gqa = hparams.n_embd_v_gqa(i);

                        auto & layer = layers[i];

                        // norm
                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);

                        // hybrid arch: a layer with no KV heads is a Mamba layer, otherwise attention
                        if (n_head_kv == 0) {
                            // Mamba layer
                            layer.ssm_in = create_tensor(tn(LLM_TENSOR_SSM_IN, "weight", i), {n_embd, 2*d_inner}, 0);

                            layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), {d_conv, d_inner}, 0);
                            layer.ssm_conv1d_b = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "bias", i), {d_inner}, 0);

                            // x projection produces dt plus B and C
                            layer.ssm_x = create_tensor(tn(LLM_TENSOR_SSM_X, "weight", i), {d_inner, dt_rank + 2*d_state}, 0);

                            layer.ssm_dt_norm = create_tensor(tn(LLM_TENSOR_SSM_DT_NORM, "weight", i), {dt_rank}, 0);

                            layer.ssm_dt = create_tensor(tn(LLM_TENSOR_SSM_DT, "weight", i), {dt_rank, d_inner}, 0);
                            layer.ssm_dt_b = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), {d_inner}, 0);

                            layer.ssm_b_norm = create_tensor(tn(LLM_TENSOR_SSM_B_NORM, "weight", i), {d_state}, 0);
                            layer.ssm_c_norm = create_tensor(tn(LLM_TENSOR_SSM_C_NORM, "weight", i), {d_state}, 0);

                            // no "weight" suffix for these
                            layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), {d_state, d_inner}, 0);
                            layer.ssm_d = create_tensor(tn(LLM_TENSOR_SSM_D, i), {d_inner}, 0);

                            // out_proj
                            layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), {d_inner, n_embd}, 0);
                        } else {
                            // Attention layers

                            layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
                            layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
                            layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
                            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
                        }

                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);

                        // the router is optional: its presence decides MoE vs dense FFN for this layer
                        layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, TENSOR_NOT_REQUIRED);

                        if (layer.ffn_gate_inp) {
                            // MoE
                            layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
                            layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff, n_embd, n_expert}, 0);
                            layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
                        } else {
                            // FFN (no MoE)
                            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
                            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
                            layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
                        }
                    }
                } break;
4593 case LLM_ARCH_GRANITE_HYBRID:
4594 {
// Granite hybrid: per-layer mix of recurrent (mamba2-style SSM) layers and
// attention layers, selected via hparams.is_recurrent(i).
4595 // mamba2 Mixer SSM params
4596 // NOTE: int64_t for tensor dimensions
4597 const int64_t d_conv = hparams.ssm_d_conv;
4598 const int64_t d_inner = hparams.ssm_d_inner;
4599 const int64_t d_state = hparams.ssm_d_state;
4600 const int64_t n_ssm_head = hparams.ssm_dt_rank;
4601 const int64_t n_group = hparams.ssm_n_group;
// combined width of the SSM input projection:
// 2*d_inner (x and z streams) + 2*n_group*d_state (B and C) + n_ssm_head (dt)
4602 const int64_t d_in_proj = 2*d_inner + 2*n_group*d_state + n_ssm_head;
4603
4604 // only an expansion factor of 2 is supported for now
4605 GGML_ASSERT(2 * n_embd == d_inner);
4606
4607 // embeddings
4608 tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
4609
4610 // output
4611 {
4612 output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
4613 output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
4614 // if output is NULL, init from the input tok embed, duplicated to allow offloading
4615 if (output == NULL) {
4616 output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
4617 }
4618 }
4619
4620 for (int i = 0; i < n_layer; ++i) {
4621 auto & layer = layers[i];
4622
4623 // norm
4624 layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
4625
4626 if (hparams.is_recurrent(i)) {
4627 // ssm layers
4628 layer.ssm_in = create_tensor(tn(LLM_TENSOR_SSM_IN, "weight", i), {n_embd, d_in_proj}, 0);
4629
// the conv operates over the x stream plus the B/C streams
4630 layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), {d_conv, d_inner + 2*n_group*d_state}, 0);
// conv bias may be absent from the GGUF
4631 layer.ssm_conv1d_b = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "bias", i), {d_inner + 2*n_group*d_state}, TENSOR_NOT_REQUIRED);
4632
4633 layer.ssm_dt_b = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), {n_ssm_head}, 0);
4634
4635 // no "weight" suffix for these
4636 layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), {1, n_ssm_head}, 0);
4637 layer.ssm_d = create_tensor(tn(LLM_TENSOR_SSM_D, i), {1, n_ssm_head}, 0);
4638
// group norm over the inner dim, split across n_group groups
4639 layer.ssm_norm = create_tensor(tn(LLM_TENSOR_SSM_NORM, "weight", i), {d_inner / n_group, n_group}, 0);
4640
4641 // out_proj
4642 layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), {d_inner, n_embd}, 0);
4643 } else {
4644 // attention layers (with optional bias)
// head counts / KV widths are queried per layer for hybrid models
4645 const int64_t n_head_i = hparams.n_head(i);
4646 const int64_t n_embd_k_gqa_i = hparams.n_embd_k_gqa(i);
4647 const int64_t n_embd_v_gqa_i = hparams.n_embd_v_gqa(i);
4648 layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head_i}, 0);
4649 layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa_i}, 0);
4650 layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa_i}, 0);
4651 layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head_i, n_embd}, 0);
4652 layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
4653 layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_k_gqa_i}, TENSOR_NOT_REQUIRED);
4654 layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_v_gqa_i}, TENSOR_NOT_REQUIRED);
4655 layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
4656 }
4657
4658 // feed forward (w/ optional biases)
4659 if (n_expert > 0) {
4660 // MoE FFN
4661 layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
// rope_freqs only exists once; layers past the first mark it DUPLICATED
4662 layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
4663 layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
4664 layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, TENSOR_NOT_REQUIRED);
4665 layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert}, 0);
4666 layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
4667
4668 // For Granite MoE Shared
4669 if (hparams.n_ff_shexp > 0) {
4670 layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, hparams.n_ff_shexp}, 0);
4671 layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, hparams.n_ff_shexp}, 0);
4672 layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {hparams.n_ff_shexp, n_embd}, 0);
4673 }
4674 } else {
// dense FFN path
4675 layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
4676 layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
4677 layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
4678 layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
4679 layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
4680 layer.ffn_gate_b = create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
4681 layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
4682 layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
4683 }
4684 }
4685 } break;
4686 case LLM_ARCH_XVERSE:
4687 {
// XVERSE: plain LLaMA-style stack; output head is required (no tied-embedding fallback here)
4688 tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
4689
4690 output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
4691 output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
4692
4693 for (int i = 0; i < n_layer; ++i) {
4694 auto & layer = layers[i];
4695
4696 layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
4697
4698 layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
4699 layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
4700 layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
4701 layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
4702
4703 layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
4704 layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
// note: ffn_down is {n_ff, n_embd} — transposed relative to gate/up
4705 layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
4706 layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
4707 }
4708 } break;
4709 case LLM_ARCH_COMMAND_R:
4710 {
4711 tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
4712
4713 // output
4714 output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
4715 // init output from the input tok embed
4716 output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
4717
4718 for (int i = 0; i < n_layer; ++i) {
4719 auto & layer = layers[i];
4720
4721 layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
4722
// per-head q/k norms are gated on layer count; presumably only the larger
// Command-R variants ship them — confirm against the conversion script
4723 if (n_layer >= 64){
4724 layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k, n_head}, 0);
4725 layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k, n_head_kv}, 0);
4726 }
4727
4728 layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
4729 layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
4730 layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
4731 layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
4732
// no per-layer ffn_norm tensor is loaded for this arch
4733 layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
4734 layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
4735 layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
4736 }
4737 } break;
4738 case LLM_ARCH_COHERE2:
4739 {
4740 tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);
4741
4742 // output
4743 output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
4744 // init output from the input tok embed
// output head is always tied to the token embedding for this arch
4745 output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab },
4746 TENSOR_DUPLICATED);
4747
4748 for (int i = 0; i < n_layer; ++i) {
4749 auto & layer = layers[i];
4750
4751 layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0);
4752
4753 layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), { n_embd, n_embd }, 0);
4754 layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), { n_embd, n_embd_gqa }, 0);
4755 layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), { n_embd, n_embd_gqa }, 0);
4756 layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd, n_embd }, 0);
4757
// like COMMAND_R, no per-layer ffn_norm tensor is loaded
4758 layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), { n_embd, n_ff }, 0);
4759 layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd }, 0);
4760 layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), { n_embd, n_ff }, 0);
4761 }
4762 }
4763 break;
4764 case LLM_ARCH_OLMO: // adapted from LLM_ARCH_LLAMA with norm params removed
4765 {
4766 tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
4767
4768 // output
// note: no output_norm tensor — OLMO carries no learned norm weights
4769 output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
4770 // if output is NULL, init from the input tok embed
4771 if (output == NULL) {
4772 output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
4773 }
4774
4775 for (int i = 0; i < n_layer; ++i) {
4776 auto & layer = layers[i];
4777
// no attn_norm / ffn_norm tensors for this arch (see case comment)
4778 layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
4779 layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
4780 layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
4781 layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
4782
4783 layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
4784 layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
4785 layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
4786 }
4787 } break;
4788 case LLM_ARCH_OLMO2:
4789 {
// OLMo 2 uses post-norms (after attention and FFN) instead of pre-norms
4790 const int64_t n_embd_head = n_embd / n_head;
4791
4792 tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
4793
4794 // output
4795 output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
4796 output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
4797
4798 for (int i = 0; i < n_layer; ++i) {
4799 auto & layer = layers[i];
4800
4801 layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
4802 layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
4803 layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
4804 layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
4805 layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, 0);
// k-norm spans the (possibly smaller) GQA key width: n_head_kv * head dim
4806 layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_head_kv * n_embd_head}, 0);
4807 layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
4808
4809 layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
4810 layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
4811 layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
4812 layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
4813 }
4814 } break;
4815 case LLM_ARCH_SEED_OSS:
4816 {
// q/o and k/v widths are derived from the explicit head dim, which can
// differ from n_embd / n_head
4817 const uint32_t head_dim = hparams.n_embd_head_k;
4818 const int64_t n_qo_dim = n_head * head_dim;
4819 const int64_t n_kv_dim = n_head_kv * head_dim;
4820
4821 tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
4822
4823 // output
4824 output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
4825 output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
4826 // if output is NULL, init from the input tok embed
4827 if (output == NULL) {
4828 output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
4829 }
4830
4831 for (int i = 0; i < n_layer; ++i) {
4832 auto & layer = layers[i];
4833
4834 layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_qo_dim}, 0);
4835 layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_kv_dim}, 0);
4836 layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_kv_dim}, 0);
4837 layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_qo_dim, n_embd}, 0);
4838
// attention biases are optional; no output-projection bias is loaded
4839 layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_qo_dim}, TENSOR_NOT_REQUIRED);
4840 layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_kv_dim}, TENSOR_NOT_REQUIRED);
4841 layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_kv_dim}, TENSOR_NOT_REQUIRED);
4842
4843 layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
4844 layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
4845
4846 layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
4847 layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
4848 layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
4849 }
4850 } break;
4851
4852 case LLM_ARCH_OLMOE:
4853 {
// OLMoE: every FFN is a MoE block; q/k norms span the full embedding dim
4854 tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
4855
4856 // output
4857 output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
4858 output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
4859
4860 for (int i = 0; i < n_layer; ++i) {
4861 auto & layer = layers[i];
4862
4863 layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
4864
4865 layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
4866 layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
4867 layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
4868 layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
4869 layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, 0);
4870 layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd}, 0);
4871
4872 layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
4873
4874 layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
4875
// sanity checks on expert hparams (zero values would produce degenerate
// tensor shapes above/below); throwing aborts model load
4876 if (n_expert == 0) {
4877 throw std::runtime_error("n_expert must be > 0");
4878 }
4879 if (n_expert_used == 0) {
4880 throw std::runtime_error("n_expert_used must be > 0");
4881 }
4882
4883 // MoE branch
4884 layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
4885 layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff, n_embd, n_expert}, 0);
4886 layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
4887 }
4888 } break;
4889 case LLM_ARCH_OPENELM:
4890 {
// OpenELM varies head count and FFN width per layer
4891 tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
4892
4893 // output
4894 output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
4895 // init output from the input tok embed
4896 output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
4897
4898 for (int i = 0; i < n_layer; ++i) {
// NB: these locals intentionally shadow the function-level n_head/n_ff
// with the per-layer values
4899 const int64_t n_head = hparams.n_head(i);
// fused QKV: q heads plus k and v head groups
4900 const int64_t n_head_qkv = 2*hparams.n_head_kv(i) + n_head;
4901 const int64_t n_ff = hparams.n_ff(i);
4902
4903 auto & layer = layers[i];
4904
4905 layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
4906
4907 layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_head_qkv*n_embd_head_k}, 0);
// q/k norms are per-head (size n_embd_head_k), shared across heads
4908 layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
4909 layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
4910 layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_head*n_embd_head_k, n_embd}, 0);
4911
4912 layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
4913 layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
4914 layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
4915 layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
4916 }
4917 } break;
4918 case LLM_ARCH_GPTNEOX:
4919 {
// GPT-NeoX: LayerNorm (weight + bias) everywhere, fused QKV, biased projections
4920 tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
4921
4922 // output
4923 output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
4924 output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
4925 output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
4926
4927 for (int i = 0; i < n_layer; ++i) {
4928 auto & layer = layers[i];
4929
4930 layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
4931 layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
4932
// fused QKV: q width (n_embd) plus k and v GQA widths
4933 layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
4934 layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);
4935
4936 layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
4937 layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
4938
4939 layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
4940 layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
4941
4942 layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
4943 layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
4944
4945 layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
4946 layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, 0);
4947 }
4948 } break;
4949 case LLM_ARCH_ARCTIC:
4950 {
// Arctic: every layer pairs a small dense residual MLP (n_embd x n_embd)
// with a MoE branch that has its own norm (ffn_norm_exps).
4951 tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
4952
4953 // output
4954 output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
4955 output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
4956
4957 // if output is NULL, init from the input tok embed
4958 if (output == NULL) {
4959 output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
4960 }
4961
4962 for (int i = 0; i < n_layer; ++i) {
4963 auto & layer = layers[i];
4964
4965 layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
4966
4967 layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
4968 layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
4969 layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
4970 layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
4971
4972 layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
4973
// dense residual MLP is deliberately square (n_embd x n_embd), not n_ff-wide
4974 layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_embd}, 0);
4975 layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_embd, n_embd}, 0);
4976 layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_embd}, 0);
4977
4978 layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
4979 layer.ffn_norm_exps = create_tensor(tn(LLM_TENSOR_FFN_NORM_EXPS, "weight", i), {n_embd}, 0);
// was `false` (leftover from the old `bool required` API); pass the flags
// value 0 explicitly — identical behavior (required), consistent with the
// sibling create_tensor calls
4980 layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
4981 layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert}, 0);
4982 layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
4983 }
4984 } break;
4985 case LLM_ARCH_DEEPSEEK:
4986 {
// DeepSeek (v1): leading dense layers, then MoE layers with shared experts
4987
4988 const int64_t n_ff_exp = hparams.n_ff_exp;
4989 const int64_t n_expert_shared = hparams.n_expert_shared;
4990
4991 tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
4992
4993 // output
4994 output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
4995 // try to load output.weight, if not found, use token_embd (tied embeddings)
4996 output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
4997 if (!output) {
4998 output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
4999 }
5000
5001 for (int i = 0; i < n_layer; ++i) {
5002 auto & layer = layers[i];
5003
5004 layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
5005
5006 layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
5007 layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
5008 layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
5009 layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
5010 layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
5011
// first n_layer_dense_lead layers use a plain dense FFN
5012 if (i < (int) hparams.n_layer_dense_lead) {
5013 layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
5014 layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
5015 layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
5016 } else {
5017 layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
5018
// guard against degenerate expert hparams before creating expert tensors
5019 if (n_expert == 0) {
5020 throw std::runtime_error("n_expert must be > 0");
5021 }
5022 if (n_expert_used == 0) {
5023 throw std::runtime_error("n_expert_used must be > 0");
5024 }
5025
5026 // MoE branch
5027 layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
5028 layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
5029 layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
5030
// shared experts are fused into one projection of width n_ff_exp * n_expert_shared
5031 // Shared expert branch
5032 layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
5033 layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { n_ff_exp * n_expert_shared, n_embd}, 0);
5034 layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
5035 }
5036 }
5037 } break;
5038 case LLM_ARCH_DEEPSEEK2:
5039 {
// DeepSeek-V2/V3: multi-head latent attention (MLA) with low-rank q/kv
// projections, plus the dense-lead + MoE FFN layout of DEEPSEEK above
5040 const bool is_mla = hparams.is_mla();
5041
5042 // note: these are the actual head sizes you get when treating as MHA or after "decompression" using wv_b for MLA
5043 const int64_t n_embd_head_k_mla = hparams.n_embd_head_k_mla();
5044 const int64_t n_embd_head_v_mla = hparams.n_embd_head_v_mla();
5045
// per-head k dim splits into a RoPE part (n_rot) and a non-RoPE part
5046 const int64_t n_embd_head_qk_rope = hparams.n_rot;
5047 const int64_t n_embd_head_qk_nope = n_embd_head_k_mla - n_embd_head_qk_rope;
5048
5049 const int64_t q_lora_rank = hparams.n_lora_q;
5050 const int64_t kv_lora_rank = hparams.n_lora_kv;
5051
5052 const int64_t n_ff_exp = hparams.n_ff_exp;
5053 const int64_t n_expert_shared = hparams.n_expert_shared;
5054
5055 tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
5056
5057 // output
5058 output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
5059 // try to load output.weight, if not found, use token_embd (tied embeddings)
5060 output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
5061 if (!output) {
5062 output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
5063 }
5064
5065 for (int i = 0; i < n_layer; ++i) {
5066 auto & layer = layers[i];
5067
5068 layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
// q_lora_rank == 0 means q is not low-rank factored (e.g. DeepSeek-V2-Lite)
5069 if (q_lora_rank > 0) {
5070 layer.attn_q_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_A_NORM, "weight", i), {q_lora_rank}, 0);
5071 }
5072
5073 layer.attn_kv_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_NORM, "weight", i), {kv_lora_rank}, 0);
5074
5075 if (q_lora_rank > 0) {
// low-rank q: down-project to q_lora_rank (wq_a) then up to all heads (wq_b)
5076 layer.wq_a = create_tensor(tn(LLM_TENSOR_ATTN_Q_A, "weight", i), {n_embd, q_lora_rank}, 0);
5077 layer.wq_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_B, "weight", i), {q_lora_rank, n_head * n_embd_head_k_mla}, 0);
5078 } else {
5079 layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_head * n_embd_head_k_mla}, 0);
5080 }
5081
// kv down-projection also carries the shared RoPE key part
5082 layer.wkv_a_mqa = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_MQA, "weight", i), {n_embd, kv_lora_rank + n_embd_head_qk_rope}, 0);
5083
5084 // note: only old legacy GGUF files will have the unsplit wkv_b tensor in
5085 if (is_mla) {
5086 layer.wk_b = create_tensor(tn(LLM_TENSOR_ATTN_K_B, "weight", i), {n_embd_head_qk_nope, kv_lora_rank, n_head}, 0);
5087 layer.wv_b = create_tensor(tn(LLM_TENSOR_ATTN_V_B, "weight", i), {kv_lora_rank, n_embd_head_v_mla, n_head}, 0);
5088 } else {
5089 layer.wkv_b = create_tensor(tn(LLM_TENSOR_ATTN_KV_B, "weight", i), {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v_mla)}, 0);
5090 }
5091
5092 layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_head * n_embd_head_v_mla, n_embd}, 0);
5093
5094 layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
5095
// same dense-lead / MoE split as LLM_ARCH_DEEPSEEK
5096 if (i < (int) hparams.n_layer_dense_lead) {
5097 layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
5098 layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
5099 layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
5100 } else {
5101 layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
// expert-selection probability bias; optional (V3-style routing)
5102 layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, TENSOR_NOT_REQUIRED);
5103
5104 if (n_expert == 0) {
5105 throw std::runtime_error("n_expert must be > 0");
5106 }
5107 if (n_expert_used == 0) {
5108 throw std::runtime_error("n_expert_used must be > 0");
5109 }
5110
5111 // MoE branch
5112 layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
5113 layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
5114 layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
5115
5116 // Shared expert branch
5117 layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
5118 layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { n_ff_exp * n_expert_shared, n_embd}, 0);
5119 layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
5120 }
5121 }
5122 } break;
5123 case LLM_ARCH_PLM:
5124 {
// PLM: MLA-style kv compression (full-rank q, low-rank kv), FFN without gate
5125 const int64_t n_embd_head_qk_rope = hparams.n_rot;
5126 const int64_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot;
5127 const int64_t kv_lora_rank = hparams.n_lora_kv;
5128
5129 tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
5130
5131 // output
5132 output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
// output head is always tied to the token embedding (standalone output.weight not loaded)
5133 // output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
5134 output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
5135
5136 for (int i = 0; i < n_layer; ++i) {
5137 auto & layer = layers[i];
5138
5139 layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
5140
5141 layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
// kv down-projection carries the compressed kv plus the RoPE key part
5142 layer.wkv_a_mqa = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_MQA, "weight", i), {n_embd, kv_lora_rank + (n_embd_head_qk_rope)}, 0);
5143 layer.attn_kv_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_NORM, "weight", i), {kv_lora_rank}, 0);
5144 layer.wkv_b = create_tensor(tn(LLM_TENSOR_ATTN_KV_B, "weight", i), {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)}, 0);
5145 layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_head * ( n_embd_head_v), n_embd}, 0);
5146
// no ffn_gate tensor for this arch — up/down only
5147 layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
5148 layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
5149 layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
5150 }
5151 } break;
5152 case LLM_ARCH_BITNET:
5153 {
// BitNet: each projection may carry an optional scalar "scale" tensor ({1})
// used to rescale the quantized weights; sub-norms sit inside the blocks
5154 tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
5155
5156 // output
// note: no standalone output head tensor is loaded for this arch
5157 output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
5158
5159 for (int i = 0; i < n_layer; ++i) {
5160 auto & layer = layers[i];
5161
5162 layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
5163 layer.attn_sub_norm = create_tensor(tn(LLM_TENSOR_ATTN_SUB_NORM, "weight", i), {n_embd}, 0);
5164
5165 layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
5166 layer.wq_scale = create_tensor(tn(LLM_TENSOR_ATTN_Q, "scale", i), {1}, TENSOR_NOT_REQUIRED);
5167 layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
5168 layer.wk_scale = create_tensor(tn(LLM_TENSOR_ATTN_K, "scale", i), {1}, TENSOR_NOT_REQUIRED);
5169 layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
5170 layer.wv_scale = create_tensor(tn(LLM_TENSOR_ATTN_V, "scale", i), {1}, TENSOR_NOT_REQUIRED);
5171 layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
5172 layer.wo_scale = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "scale", i), {1}, TENSOR_NOT_REQUIRED);
5173
5174 layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
// ffn_sub_norm operates in the hidden (n_ff) dimension
5175 layer.ffn_sub_norm = create_tensor(tn(LLM_TENSOR_FFN_SUB_NORM, "weight", i), {n_ff}, 0);
5176
5177 layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
5178 layer.ffn_gate_scale = create_tensor(tn(LLM_TENSOR_FFN_GATE, "scale", i), {1}, TENSOR_NOT_REQUIRED);
5179 layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
5180 layer.ffn_down_scale = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "scale", i), {1}, TENSOR_NOT_REQUIRED);
5181 layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
5182 layer.ffn_up_scale = create_tensor(tn(LLM_TENSOR_FFN_UP, "scale", i), {1}, TENSOR_NOT_REQUIRED);
5183 }
5184 } break;
5185 case LLM_ARCH_T5:
5186 {
// T5 encoder-decoder. Encoder ("_enc") and decoder tensors live in the
// same `layers` vector; relative-attention bias tables replace RoPE.
5187 const auto n_rel_attn_bkts = hparams.n_rel_attn_bkts;
5188
5189 tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
5190
5191 // output
5192 output_norm_enc = create_tensor(tn(LLM_TENSOR_ENC_OUTPUT_NORM, "weight"), {n_embd}, 0);
5193 output_norm = create_tensor(tn(LLM_TENSOR_DEC_OUTPUT_NORM, "weight"), {n_embd}, 0);
5194
5195 output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
5196 // if output is NULL, init from the input tok embed
5197 if (output == NULL) {
5198 output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
5199 }
5200
5201 // n_layer: number of encoder_layers
5202 // dec_n_layer: number of decoder_layers
5203 const int dec_n_layer = hparams.dec_n_layer;
// grow the layers vector when the decoder is deeper than the encoder,
// so the decoder loop below can index past n_layer safely
5204 if (dec_n_layer > n_layer) {
5205 layers.resize(dec_n_layer);
5206 }
5207
5208 // load encoder layers
5209 for (int i = 0; i < n_layer; ++i) {
5210 auto & layer = layers[i];
5211
5212 layer.attn_norm_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_NORM, "weight", i), {n_embd}, 0);
// relative-position bias table; typically present only on layer 0 in HF
// checkpoints, hence optional — TODO confirm against converter output
5213 layer.attn_rel_b_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_REL_B, "weight", i), {n_head, n_rel_attn_bkts}, TENSOR_NOT_REQUIRED);
5214
5215 layer.wq_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_Q, "weight", i), {n_embd, n_embd_k_gqa}, 0);
5216 layer.wk_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
5217 layer.wv_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
5218 layer.wo_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_OUT, "weight", i), {n_embd_v_gqa, n_embd}, 0);
5219
5220 layer.ffn_norm_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_NORM, "weight", i), {n_embd}, 0);
// gate is optional: plain-ReLU T5 has no gate; gated variants (e.g. GEGLU) do
5221 layer.ffn_gate_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_GATE, "weight", i), {n_embd, n_ff}, TENSOR_NOT_REQUIRED);
5222 layer.ffn_down_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
5223 layer.ffn_up_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
5224 }
5225
5226 // load decoder layers
5227 for (int i = 0; i < dec_n_layer; ++i) {
5228 auto & layer = layers[i];
5229
5230 layer.attn_norm = create_tensor(tn(LLM_TENSOR_DEC_ATTN_NORM, "weight", i), {n_embd}, 0);
5231 layer.attn_rel_b = create_tensor(tn(LLM_TENSOR_DEC_ATTN_REL_B, "weight", i), {n_head, n_rel_attn_bkts}, TENSOR_NOT_REQUIRED);
5232
5233 layer.wq = create_tensor(tn(LLM_TENSOR_DEC_ATTN_Q, "weight", i), {n_embd, n_embd_k_gqa}, 0);
5234 layer.wk = create_tensor(tn(LLM_TENSOR_DEC_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
5235 layer.wv = create_tensor(tn(LLM_TENSOR_DEC_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
5236 layer.wo = create_tensor(tn(LLM_TENSOR_DEC_ATTN_OUT, "weight", i), {n_embd_v_gqa, n_embd}, 0);
5237
// cross-attention over encoder output
5238 layer.attn_norm_cross = create_tensor(tn(LLM_TENSOR_DEC_CROSS_ATTN_NORM, "weight", i), {n_embd}, 0);
5239 // this tensor seems to be unused in HF transformers implementation
5240 layer.attn_rel_b_cross = create_tensor(tn(LLM_TENSOR_DEC_CROSS_ATTN_REL_B, "weight", i), {n_head, n_rel_attn_bkts}, TENSOR_NOT_REQUIRED);
5241
5242 layer.wq_cross = create_tensor(tn(LLM_TENSOR_DEC_CROSS_ATTN_Q, "weight", i), {n_embd, n_embd_k_gqa}, 0);
5243 layer.wk_cross = create_tensor(tn(LLM_TENSOR_DEC_CROSS_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
5244 layer.wv_cross = create_tensor(tn(LLM_TENSOR_DEC_CROSS_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
5245 layer.wo_cross = create_tensor(tn(LLM_TENSOR_DEC_CROSS_ATTN_OUT, "weight", i), {n_embd_v_gqa, n_embd}, 0);
5246
5247 layer.ffn_norm = create_tensor(tn(LLM_TENSOR_DEC_FFN_NORM, "weight", i), {n_embd}, 0);
5248 layer.ffn_gate = create_tensor(tn(LLM_TENSOR_DEC_FFN_GATE, "weight", i), {n_embd, n_ff}, TENSOR_NOT_REQUIRED);
5249 layer.ffn_down = create_tensor(tn(LLM_TENSOR_DEC_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
5250 layer.ffn_up = create_tensor(tn(LLM_TENSOR_DEC_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
5251 }
5252 } break;
5253 case LLM_ARCH_T5ENCODER:
5254 {
// Encoder-only T5: identical to the encoder half of LLM_ARCH_T5 above,
// plus an output head (falling back to tied token embeddings).
5255 const auto n_rel_attn_bkts = hparams.n_rel_attn_bkts;
5256
5257 tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
5258
5259 // output
5260 output_norm_enc = create_tensor(tn(LLM_TENSOR_ENC_OUTPUT_NORM, "weight"), {n_embd}, 0);
5261 output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
5262 // if output is NULL, init from the input tok embed
5263 if (output == NULL) {
5264 output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
5265 }
5266
5267 for (int i = 0; i < n_layer; ++i) {
5268 auto & layer = layers[i];
5269
5270 layer.attn_norm_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_NORM, "weight", i), {n_embd}, 0);
// relative-position bias table is optional per layer
5271 layer.attn_rel_b_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_REL_B, "weight", i), {n_head, n_rel_attn_bkts}, TENSOR_NOT_REQUIRED);
5272
5273 layer.wq_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_Q, "weight", i), {n_embd, n_embd_k_gqa}, 0);
5274 layer.wk_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
5275 layer.wv_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
5276 layer.wo_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_OUT, "weight", i), {n_embd_v_gqa, n_embd}, 0);
5277
5278 layer.ffn_norm_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_NORM, "weight", i), {n_embd}, 0);
// gate only present in gated-activation variants
5279 layer.ffn_gate_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_GATE, "weight", i), {n_embd, n_ff}, TENSOR_NOT_REQUIRED);
5280 layer.ffn_down_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
5281 layer.ffn_up_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
5282 }
5283 } break;
5284 case LLM_ARCH_JAIS:
5285 {
// JAIS: GPT-2-style architecture — fused QKV projection and bias terms
// on every linear layer and every norm; all tensors are required.
5286 tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
5287
5288 // output
5289 output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
5290 output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
5291 output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
5292
5293 for (int i = 0; i < n_layer; ++i) {
5294 auto & layer = layers[i];
5295
5296 layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
5297 layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
5298
// fused Q+K+V: output dim = n_embd (Q) + 2*n_embd_gqa (K and V)
5299 layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
5300 layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);
5301
5302 layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
5303 layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
5304
5305 layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
5306 layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
5307
5308 layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
5309 layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
5310
5311 layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
5312 layer.ffn_gate_b = create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), {n_ff}, 0);
5313
5314 layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
5315 layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, 0);
5316 }
5317 } break;
5318 case LLM_ARCH_CHATGLM:
5319 {
// ChatGLM: attention is stored either as a fused QKV tensor or as
// split Q/K/V tensors depending on the checkpoint — both are loaded as
// optional and the split path is used when the fused one is absent.
5320 tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
5321
5322 // output
5323 output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
5324 output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
5325 // if output is NULL, init from the input tok embed
5326 if (output == NULL) {
5327 output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
5328 }
5329
5330 for (int i = 0; i < n_layer; ++i) {
5331 auto & layer = layers[i];
5332
5333 layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
5334 layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
5335 layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
5336
// fallback: split Q/K/V tensors (biases optional)
5337 if (layer.wqkv == nullptr) {
5338 layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
5339 layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
5340 layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
5341 layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
5342 layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
5343 layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
5344 }
5345
5346 layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
5347
5348 layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
5349
// up projection holds gate and up halves fused, hence n_ff * 2
5350 layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff * 2}, 0);
5351
5352 layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
5353 }
5354 } break;
5355 case LLM_ARCH_GLM4:
5356 {
// GLM-4: same fused-vs-split QKV handling as CHATGLM above, with
// additional post-attention and post-FFN norms per layer.
5357 tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
5358
5359 // output
5360 output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
5361 output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
5362 // if output is NULL, init from the input tok embed
5363 if (output == NULL) {
5364 output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
5365 }
5366
5367 for (int i = 0; i < n_layer; ++i) {
5368 auto & layer = layers[i];
5369
5370 layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
// fused QKV is optional; split tensors are loaded below when absent
5371 layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
5372 layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
5373
5374 if (layer.wqkv == nullptr) {
5375 layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
5376 layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
5377 layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
5378 layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
5379 layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
5380 layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
5381 }
5382
5383 layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
5384
5385 layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
5386
5387 layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
5388 layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
// fused gate+up projection, hence n_ff * 2 (no separate ffn_gate tensor)
5389 layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff * 2}, 0);
5390
5391 layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
5392 }
5393 } break;
5394 case LLM_ARCH_GLM4_MOE:
5395 {
// GLM-4 MoE (GLM-4.5): hybrid dense/MoE layers plus trailing NextN/MTP
// layers whose tensors are loaded with TENSOR_SKIP so the total tensor
// count matches the file while the forward pass ignores them.
5396 const int64_t n_expert = hparams.n_expert;
5397 const int64_t n_expert_used = hparams.n_expert_used;
5398 const int64_t n_expert_shared = hparams.n_expert_shared;
5399
5400 GGML_ASSERT(hparams.n_expert > 0 && "n_expert must be > 0 for GLM4_MOE MoE layers");
5401 GGML_ASSERT(hparams.n_expert_used > 0 && "n_expert_used must be > 0 for GLM4_MOE MoE layers");
5402
5403 tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);
5404
5405 // output
5406 output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
5407 output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED);
5408 // if output is NULL, init from the input tok embed
5409 if (output == NULL) {
5410 output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, TENSOR_DUPLICATED);
5411 }
5412
5413 // Load ALL tensors including NextN layer to satisfy total tensor count
5414 // but only PROCESS up to last layer (skipping final NextN layer) in forward pass
5415 for (int i = 0; i < n_layer; ++i) {
5416 int flags = 0;
5417 if (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers) {
5418 // skip all tensors in the NextN layers
5419 flags |= TENSOR_SKIP;
5420 }
5421
5422 auto & layer = layers[i];
5423
5424 layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, flags);
5425
5426 // GLM-style attention with bias terms
5427 layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), { n_embd, n_embd_head_k * n_head }, flags);
5428 layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), { n_embd, n_embd_k_gqa }, flags);
5429 layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), { n_embd, n_embd_v_gqa }, flags);
5430 layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), { n_embd_head_k * n_head }, TENSOR_NOT_REQUIRED | flags);
5431 layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), { n_embd_k_gqa }, TENSOR_NOT_REQUIRED | flags);
5432 layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), { n_embd_v_gqa }, TENSOR_NOT_REQUIRED | flags);
5433
5434 layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_k * n_head, n_embd }, flags);
5435
5436 // K/Q norm tensors (optional for GLM-4.5 355B variant)
5437 layer.attn_q_norm = create_tensor(
5438 tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), { n_embd_head_k }, TENSOR_NOT_REQUIRED | flags);
5439 layer.attn_k_norm = create_tensor(
5440 tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), { n_embd_head_k }, TENSOR_NOT_REQUIRED | flags);
5441
5442 layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), { n_embd }, flags);
5443
5444 // Check if this layer uses MoE or dense FFN based on n_layer_dense_lead
5445 // GLM 4.5 uses hybrid architecture: layer 0 is dense, layers 1+ are MoE
5446 const bool use_moe = (static_cast<uint32_t>(i) >= hparams.n_layer_dense_lead);
5447
5448 if (use_moe) {
5449 // MoE layers
5450 layer.ffn_gate_inp =
5451 create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), { n_embd, n_expert }, flags);
// per-expert routing-probability bias
5452 layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), { n_expert }, flags);
5453
5454 // MoE branch
// fall back to evenly splitting n_ff across active experts when
// the per-expert FFN size is not stored in the hparams
5455 const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used;
5456
5457 layer.ffn_gate_exps = create_tensor(
5458 tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert }, flags);
5459 layer.ffn_down_exps = create_tensor(
5460 tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff_exp, n_embd, n_expert }, flags);
5461 layer.ffn_up_exps = create_tensor(
5462 tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert }, flags);
5463
5464 // Shared expert
5465 if (n_expert_shared > 0) {
5466 const int64_t n_ff_shexp = n_ff_exp * n_expert_shared;
5467 layer.ffn_gate_shexp = create_tensor(
5468 tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), { n_embd, n_ff_shexp }, flags);
5469 layer.ffn_down_shexp = create_tensor(
5470 tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { n_ff_shexp, n_embd }, flags);
5471 layer.ffn_up_shexp = create_tensor(
5472 tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), { n_embd, n_ff_shexp }, flags);
5473 }
5474 } else {
5475 // Dense layers (first k layers) - GLM uses separate gate/up projections
5476 layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), { n_embd, n_ff }, flags);
5477 layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd }, flags);
5478 layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), { n_embd, n_ff }, flags);
5479 }
5480
5481 // NextN/MTP tensors (preserved but unused) - conditionally load for last nextn_predict_layers
5482 if (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers) {
5483 layer.nextn.eh_proj = create_tensor(tn(LLM_TENSOR_NEXTN_EH_PROJ, "weight", i), { 2 * n_embd, n_embd }, flags);
5484 layer.nextn.enorm = create_tensor(tn(LLM_TENSOR_NEXTN_ENORM, "weight", i), { n_embd }, flags);
5485 layer.nextn.hnorm = create_tensor(tn(LLM_TENSOR_NEXTN_HNORM, "weight", i), { n_embd }, flags);
5486
5487 // Optional tensors
5488 layer.nextn.embed_tokens = create_tensor(tn(LLM_TENSOR_NEXTN_EMBED_TOKENS, "weight", i), { n_embd, n_vocab }, flags | TENSOR_NOT_REQUIRED);
5489 layer.nextn.shared_head_head = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, "weight", i), { n_embd, n_vocab }, flags | TENSOR_NOT_REQUIRED);
5490 layer.nextn.shared_head_norm = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, "weight", i), { n_embd }, flags | TENSOR_NOT_REQUIRED);
5491 }
5492 }
5493 }
5494 break;
5495 case LLM_ARCH_NEMOTRON:
5496 {
// Nemotron: LLaMA-like layout with layer-norm bias terms; attention and
// MLP biases are optional per checkpoint.
5497 tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
5498
5499 // output
5500 output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
5501 output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
5502 output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
5503
5504 for (int i = 0; i < n_layer; ++i) {
5505 auto & layer = layers[i];
5506
5507 layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
5508 layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
5509
5510 layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
5511 layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
5512 layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
5513 layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
5514
5515 // optional bias tensors
5516 layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
5517 layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
5518 layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
5519 layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
5520
5521 layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
5522 layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
5523
// no gate projection: Nemotron uses a two-matrix (up/down) MLP
5524 layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
5525 layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
5526
5527 // optional MLP bias
5528 layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
5529 layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
5530 }
5531 } break;
5532 case LLM_ARCH_NEMOTRON_H:
5533 case LLM_ARCH_NEMOTRON_H_MOE:
5534 {
// Nemotron-H: hybrid per-layer architecture. Each layer is one of:
// - Mamba-2 SSM mixer (hparams.is_recurrent(i))
// - attention (signalled by hparams.n_ff(i) == 0)
// - FFN, either MoE (n_expert != 0, the _MOE variant) or a dense MLP
5535 // mamba2 Mixer SSM params
5536 // NOTE: int64_t for tensor dimensions
5537 const int64_t d_conv = hparams.ssm_d_conv;
5538 const int64_t d_inner = hparams.ssm_d_inner;
5539 const int64_t d_state = hparams.ssm_d_state;
5540 const int64_t n_ssm_head = hparams.ssm_dt_rank;
5541 const int64_t n_group = hparams.ssm_n_group;
// in_proj packs: x + z (2*d_inner), B + C (2*n_group*d_state), dt (n_ssm_head)
5542 const int64_t d_in_proj = 2*d_inner + 2*n_group*d_state + n_ssm_head;
5543
5544 // embeddings
5545 tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
5546
5547 // output
5548 {
5549 output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
5550 output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
5551 // if output is NULL, init from the input tok embed, duplicated to allow offloading
5552 if (output == NULL) {
5553 output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
5554 }
5555 }
5556
5557 for (int i = 0; i < n_layer; ++i) {
5558 auto & layer = layers[i];
5559
5560 // all blocks use the attn norm
5561 layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
5562
5563 if (hparams.is_recurrent(i)) {
5564 // ssm layers
5565 layer.ssm_in = create_tensor(tn(LLM_TENSOR_SSM_IN, "weight", i), {n_embd, d_in_proj}, 0);
5566
5567 layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), {d_conv, d_inner + 2*n_group*d_state}, 0);
5568 layer.ssm_conv1d_b = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "bias", i), {d_inner + 2*n_group*d_state}, TENSOR_NOT_REQUIRED);
5569
5570 layer.ssm_dt_b = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), {n_ssm_head}, 0);
5571
5572 // no "weight" suffix for these
5573 layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), {1, n_ssm_head}, 0);
5574 layer.ssm_d = create_tensor(tn(LLM_TENSOR_SSM_D, i), {1, n_ssm_head}, 0);
5575
// group-wise RMS norm over the inner state
5576 layer.ssm_norm = create_tensor(tn(LLM_TENSOR_SSM_NORM, "weight", i), {d_inner / n_group, n_group}, 0);
5577
5578 // out_proj
5579 layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), {d_inner, n_embd}, 0);
5580 } else if (hparams.n_ff(i) == 0) {
5581 // attention layers (with optional bias)
// attention head counts / KV widths vary per layer in this arch
5582 const int64_t n_head_i = hparams.n_head(i);
5583 const int64_t n_embd_k_gqa_i = hparams.n_embd_k_gqa(i);
5584 const int64_t n_embd_v_gqa_i = hparams.n_embd_v_gqa(i);
5585 layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head_i}, 0);
5586 layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa_i}, 0);
5587 layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa_i}, 0);
5588 layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head_i, n_embd}, 0);
5589 layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
5590 layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_k_gqa_i}, TENSOR_NOT_REQUIRED);
5591 layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_v_gqa_i}, TENSOR_NOT_REQUIRED);
5592 layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
5593 } else {
5594 if (n_expert != 0) {
5595 const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used;
5596 const int64_t n_ff_shexp = hparams.n_ff_shexp;
5597
5598 layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), { n_embd, n_expert}, 0);
5599 layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert }, 0);
5600
5601 // MoE branch
// NOTE: no ffn_gate_exps here — experts use an up/down MLP only
5602 layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
5603 layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
5604
5605 // Shared expert branch
5606 layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {n_ff_shexp, n_embd}, 0);
5607 layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff_shexp}, 0);
5608
5609 } else {
5610 // mlp layers
5611 layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { hparams.n_ff(i), n_embd}, 0);
5612 layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, hparams.n_ff(i)}, 0);
5613 layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
5614 layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {hparams.n_ff(i)}, TENSOR_NOT_REQUIRED);
5615 }
5616 }
5617 }
5618 } break;
5619 case LLM_ARCH_EXAONE:
5620 {
// EXAONE: LLaMA-style dense model with gated FFN and optional
// per-layer rope frequency factors.
5621 tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
5622
5623 // output
5624 output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
5625 output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
5626
5627 // if output is NULL, init from the input tok embed
5628 if (output == NULL) {
5629 output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
5630 }
5631
5632 for (int i = 0; i < n_layer; ++i) {
5633 auto & layer = layers[i];
5634
5635 layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
5636
5637 layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
5638 layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
5639 layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
5640 layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
5641
5642 layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
// rope factors are stored once and marked duplicated on layers > 0
5643 layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
5644 layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
5645 layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
5646 layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
5647 }
5648 } break;
5649 case LLM_ARCH_EXAONE4:
5650 {
// EXAONE 4.0: post-norm layout (attn/ffn post-norms instead of a
// pre-attention norm) with required per-head Q/K norms.
5651 tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
5652
5653 // output
5654 output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
5655 output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
5656
5657 // if output is NULL, init from the input tok embed
5658 if (output == NULL) {
5659 output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
5660 }
5661
5662 for (int i = 0; i < n_layer; ++i) {
5663 auto & layer = layers[i];
5664
5665 layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
5666 layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
5667 layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
5668 layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
5669
// rope factors stored once, duplicated reference on layers > 0
5670 layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
5671
5672 layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
5673 layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
5674 layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
5675
5676 layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
5677 layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
5678 layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
5679 layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
5680 }
5681 } break;
5682 case LLM_ARCH_EXAONE_MOE:
5683 {
// EXAONE MoE: dense lead layers followed by MoE layers with a shared
// expert; trailing NextN/MTP layers are loaded with TENSOR_SKIP.
5684 const int64_t n_ff_exp = hparams.n_ff_exp;
5685 const int64_t n_expert = hparams.n_expert;
5686 const int64_t n_expert_used = hparams.n_expert_used;
5687 const int64_t n_ff_shexp = hparams.n_ff_shexp;
5688 const int64_t head_dim = hparams.n_embd_head_k;
5689 const int64_t n_qo_dim = n_head * head_dim;
5690 const int64_t n_kv_dim = n_head_kv * head_dim;
5691
5692 tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
5693
5694 // output
// NOTE(review): unlike sibling arches, `output` is created with flag 0
// (required), so create_tensor would fail before the NULL check below
// ever triggers — the tok_embd fallback appears unreachable. Confirm
// whether the flag should be TENSOR_NOT_REQUIRED.
5695 output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
5696 output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
5697
5698 if (output == NULL) {
5699 output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
5700 }
5701
5702 for (int i = 0; i < n_layer; ++i) {
5703 int flags = 0;
5704 if (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers) {
5705 // skip all tensors in the NextN layers
5706 flags |= TENSOR_SKIP;
5707 }
5708
5709 auto & layer = layers[i];
5710 layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_qo_dim}, flags);
5711 layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_kv_dim}, flags);
5712 layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_kv_dim}, flags);
5713 layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_qo_dim, n_embd}, flags);
5714
// rope factors stored once, duplicated reference on layers > 0
5715 layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0) | flags);
5716
5717 layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, flags);
5718 layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, flags);
5719 layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, flags);
5720
5721 layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, flags);
5722
5723 // dense layers for first n_layer_dense_lead layers or nextn_predict_layers layers at the end
5724 if (i < (int) hparams.n_layer_dense_lead || (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers)) {
5725 layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, flags);
5726 layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, flags);
5727 layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, flags);
5728 } else {
5729 layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, flags);
5730 layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, TENSOR_NOT_REQUIRED | flags);
5731
// validated here (not at the top) so dense-only configs can still load
5732 if (n_expert == 0) {
5733 throw std::runtime_error("n_expert must be > 0");
5734 }
5735 if (n_expert_used == 0) {
5736 throw std::runtime_error("n_expert_used must be > 0");
5737 }
5738
5739 layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, flags);
5740 layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, flags);
5741 layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, flags);
5742
5743 layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_shexp}, flags);
5744 layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {n_ff_shexp, n_embd}, flags);
5745 layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff_shexp}, flags);
5746 }
5747
5748 // NextN/MTP tensors (preserved but unused) - conditionally load for last nextn_predict_layers
5749 if (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers) {
5750 layer.nextn.eh_proj = create_tensor(tn(LLM_TENSOR_NEXTN_EH_PROJ, "weight", i), {2 * n_embd, n_embd}, flags);
5751 layer.nextn.enorm = create_tensor(tn(LLM_TENSOR_NEXTN_ENORM, "weight", i), {n_embd}, flags);
5752 layer.nextn.hnorm = create_tensor(tn(LLM_TENSOR_NEXTN_HNORM, "weight", i), {n_embd}, flags);
5753
5754 layer.nextn.shared_head_norm = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, "weight", i), {n_embd}, flags | TENSOR_NOT_REQUIRED);
5755 layer.nextn.embed_tokens = create_tensor(tn(LLM_TENSOR_NEXTN_EMBED_TOKENS, "weight", i), {n_embd, n_vocab}, flags | TENSOR_NOT_REQUIRED);
5756 layer.nextn.shared_head_head = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, "weight", i), {n_embd, n_vocab}, flags | TENSOR_NOT_REQUIRED);
5757 }
5758 }
5759 } break;
            case LLM_ARCH_RWKV6:
                {
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);

                    // Block 0, LN0 (extra LayerNorm applied to the embeddings before the first block)
                    tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, 0);
                    tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd}, 0);

                    // output
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
                    output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
                    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);

                    // low-rank projection sizes and WKV head size come from the model hparams
                    const int time_mix_extra_dim = hparams.time_mix_extra_dim;
                    const int time_decay_extra_dim = hparams.time_decay_extra_dim;
                    const int head_size = hparams.wkv_head_size;
                    const int attn_hidden_size = n_embd;
                    const int ffn_size = hparams.n_ff_arr[0];

                    for (int i = 0; i < n_layer; ++i) {
                        auto & layer = layers[i];

                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
                        layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);

                        layer.attn_norm_2 = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, 0);
                        layer.attn_norm_2_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}, 0);

                        // w1/w2: low-rank token-shift interpolation projections, 5 heads (w, k, v, r, g)
                        layer.time_mix_w1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W1, "weight", i), {n_embd, time_mix_extra_dim * 5}, 0);
                        layer.time_mix_w2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W2, "weight", i), {time_mix_extra_dim, n_embd, 5}, 0);

                        // token-shift mixing coefficients: a GGUF may store either the five
                        // individual w/k/v/r/g tensors or a single fused {n_embd,1,1,5} tensor,
                        // hence the TENSOR_NOT_REQUIRED flags; the assert below guarantees that
                        // at least one of the two layouts is present
                        layer.time_mix_lerp_x = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_X, "weight", i), {n_embd, 1, 1}, 0);
                        layer.time_mix_lerp_w = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_W, "weight", i), {n_embd, 1, 1}, TENSOR_NOT_REQUIRED);
                        layer.time_mix_lerp_k = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_K, "weight", i), {n_embd, 1, 1}, TENSOR_NOT_REQUIRED);
                        layer.time_mix_lerp_v = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_V, "weight", i), {n_embd, 1, 1}, TENSOR_NOT_REQUIRED);
                        layer.time_mix_lerp_r = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_R, "weight", i), {n_embd, 1, 1}, TENSOR_NOT_REQUIRED);
                        layer.time_mix_lerp_g = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_G, "weight", i), {n_embd, 1, 1}, TENSOR_NOT_REQUIRED);
                        layer.time_mix_lerp_fused = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_FUSED, "weight", i), {n_embd, 1, 1, 5}, TENSOR_NOT_REQUIRED);
                        GGML_ASSERT(!(layer.time_mix_lerp_fused == NULL && layer.time_mix_lerp_w == NULL));

                        layer.time_mix_first = create_tensor(tn(LLM_TENSOR_TIME_MIX_FIRST, "weight", i), {head_size, n_embd / head_size}, 0);
                        layer.time_mix_decay = create_tensor(tn(LLM_TENSOR_TIME_MIX_DECAY, "weight", i), {n_embd}, 0);
                        layer.time_mix_decay_w1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_DECAY_W1, "weight", i), {n_embd, time_decay_extra_dim}, 0);
                        layer.time_mix_decay_w2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_DECAY_W2, "weight", i), {time_decay_extra_dim, attn_hidden_size}, 0);
                        layer.time_mix_key = create_tensor(tn(LLM_TENSOR_TIME_MIX_KEY, "weight", i), {attn_hidden_size, n_embd}, 0);
                        layer.time_mix_value = create_tensor(tn(LLM_TENSOR_TIME_MIX_VALUE, "weight", i), {attn_hidden_size, n_embd}, 0);
                        layer.time_mix_receptance = create_tensor(tn(LLM_TENSOR_TIME_MIX_RECEPTANCE, "weight", i), {attn_hidden_size, n_embd}, 0);
                        layer.time_mix_gate = create_tensor(tn(LLM_TENSOR_TIME_MIX_GATE, "weight", i), {attn_hidden_size, n_embd}, 0);

                        // group norm + output projection of the time-mix (attention-like) branch
                        layer.time_mix_ln = create_tensor(tn(LLM_TENSOR_TIME_MIX_LN, "weight", i), {n_embd}, 0);
                        layer.time_mix_ln_b = create_tensor(tn(LLM_TENSOR_TIME_MIX_LN, "bias", i), {n_embd}, 0);
                        layer.time_mix_output = create_tensor(tn(LLM_TENSOR_TIME_MIX_OUTPUT, "weight", i), {n_embd, attn_hidden_size}, 0);

                        // channel-mix (FFN-like) branch
                        layer.channel_mix_lerp_k = create_tensor(tn(LLM_TENSOR_CHANNEL_MIX_LERP_K, "weight", i), {n_embd, 1, 1}, 0);
                        layer.channel_mix_lerp_r = create_tensor(tn(LLM_TENSOR_CHANNEL_MIX_LERP_R, "weight", i), {n_embd, 1, 1}, 0);

                        layer.channel_mix_key = create_tensor(tn(LLM_TENSOR_CHANNEL_MIX_KEY, "weight", i), {n_embd, ffn_size}, 0);
                        layer.channel_mix_value = create_tensor(tn(LLM_TENSOR_CHANNEL_MIX_VALUE, "weight", i), {ffn_size, n_embd}, 0);
                        layer.channel_mix_receptance = create_tensor(tn(LLM_TENSOR_CHANNEL_MIX_RECEPTANCE, "weight", i), {n_embd, n_embd}, 0);
                    }

                } break;
            case LLM_ARCH_RWKV6QWEN2:
                {
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);

                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
                    output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, TENSOR_NOT_REQUIRED);
                    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);

                    const int time_mix_extra_dim = hparams.time_mix_extra_dim;
                    const int time_decay_extra_dim = hparams.time_decay_extra_dim;
                    const int head_size = hparams.wkv_head_size;
                    const int attn_hidden_size = n_embd;
                    const int n_head_kv = hparams.n_head_kv();
                    // K/V projection width: full hidden size when n_head_kv is unset (0) or
                    // equals the full head count; otherwise GQA-sized (n_head_kv * head_size)
                    int attn_key_value_size;
                    if (n_head_kv == 0 || attn_hidden_size / head_size == n_head_kv) {
                        attn_key_value_size = attn_hidden_size;
                    } else {
                        attn_key_value_size = n_head_kv * head_size;
                    }

                    for (int i = 0; i < n_layer; ++i) {
                        auto & layer = layers[i];

                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);

                        // low-rank token-shift interpolation projections, 5 heads (w, k, v, r, g)
                        layer.time_mix_w1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W1, "weight", i), {n_embd, time_mix_extra_dim * 5}, 0);
                        layer.time_mix_w2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W2, "weight", i), {time_mix_extra_dim, n_embd, 5}, 0);

                        // this arch always uses the fused token-shift coefficient layout
                        layer.time_mix_lerp_x = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_X, "weight", i), {n_embd, 1, 1}, 0);
                        layer.time_mix_lerp_fused = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_FUSED, "weight", i), {n_embd, 1, 1, 5}, 0);

                        layer.time_mix_first = create_tensor(tn(LLM_TENSOR_TIME_MIX_FIRST, "weight", i), {head_size, n_embd / head_size}, TENSOR_NOT_REQUIRED);
                        layer.time_mix_decay = create_tensor(tn(LLM_TENSOR_TIME_MIX_DECAY, "weight", i), {n_embd}, 0);
                        layer.time_mix_decay_w1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_DECAY_W1, "weight", i), {n_embd, time_decay_extra_dim}, 0);
                        layer.time_mix_decay_w2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_DECAY_W2, "weight", i), {time_decay_extra_dim, attn_hidden_size}, 0);
                        layer.time_mix_key = create_tensor(tn(LLM_TENSOR_TIME_MIX_KEY, "weight", i), {n_embd, attn_key_value_size}, 0);
                        layer.time_mix_value = create_tensor(tn(LLM_TENSOR_TIME_MIX_VALUE, "weight", i), {n_embd, attn_key_value_size}, 0);
                        layer.time_mix_receptance = create_tensor(tn(LLM_TENSOR_TIME_MIX_RECEPTANCE, "weight", i), {attn_hidden_size, n_embd}, 0);
                        layer.time_mix_gate = create_tensor(tn(LLM_TENSOR_TIME_MIX_GATE, "weight", i), {attn_hidden_size, n_embd}, 0);
                        // optional bias tensors
                        layer.time_mix_key_b = create_tensor(tn(LLM_TENSOR_TIME_MIX_KEY, "bias", i), {attn_key_value_size}, TENSOR_NOT_REQUIRED);
                        layer.time_mix_value_b = create_tensor(tn(LLM_TENSOR_TIME_MIX_VALUE, "bias", i), {attn_key_value_size}, TENSOR_NOT_REQUIRED);
                        layer.time_mix_receptance_b = create_tensor(tn(LLM_TENSOR_TIME_MIX_RECEPTANCE, "bias", i), {attn_hidden_size}, TENSOR_NOT_REQUIRED);

                        layer.time_mix_output = create_tensor(tn(LLM_TENSOR_TIME_MIX_OUTPUT, "weight", i), {n_embd, attn_hidden_size}, 0);

                        // Qwen2-style gated FFN instead of RWKV channel-mix
                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);

                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
                        layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
                    }
                } break;
            case LLM_ARCH_RWKV7:
                {
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);

                    // Block 0, LN0 (extra LayerNorm applied to the embeddings before the first block)
                    tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, 0);
                    tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd}, 0);

                    // output
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
                    output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
                    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);

                    // RWKV7 low-rank (LoRA-style) projection ranks for decay (w), in-context
                    // learning rate (a), value residual mix (v) and gate (g)
                    const int n_lora_decay = hparams.n_lora_decay;
                    const int n_lora_iclr = hparams.n_lora_iclr;
                    const int n_lora_value_res_mix = hparams.n_lora_value_res_mix;
                    const int n_lora_gate = hparams.n_lora_gate;
                    const int attn_hidden_size = n_embd;
                    const int ffn_size = hparams.n_ff_arr[0];

                    for (int i = 0; i < n_layer; ++i) {
                        auto & layer = layers[i];

                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
                        layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);

                        layer.attn_norm_2 = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, 0);
                        layer.attn_norm_2_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}, 0);

                        layer.time_mix_w0 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W0, "weight", i), {n_embd}, 0);
                        layer.time_mix_w1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W1, "weight", i), {n_embd, n_lora_decay}, 0);
                        layer.time_mix_w2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W2, "weight", i), {n_lora_decay, n_embd}, 0);

                        layer.time_mix_a0 = create_tensor(tn(LLM_TENSOR_TIME_MIX_A0, "weight", i), {n_embd}, 0);
                        layer.time_mix_a1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_A1, "weight", i), {n_embd, n_lora_iclr}, 0);
                        layer.time_mix_a2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_A2, "weight", i), {n_lora_iclr, n_embd}, 0);

                        // the v* tensors on layer 0 use the iclr rank; on later layers they use
                        // the value-residual-mix rank
                        if (i == 0) {
                            // actually not used
                            layer.time_mix_v0 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V0, "weight", i), {n_embd}, 0);
                            layer.time_mix_v1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V1, "weight", i), {n_embd, n_lora_iclr}, 0);
                            layer.time_mix_v2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V2, "weight", i), {n_lora_iclr, n_embd}, 0);
                        } else {
                            layer.time_mix_v0 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V0, "weight", i), {n_embd}, 0);
                            layer.time_mix_v1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V1, "weight", i), {n_embd, n_lora_value_res_mix}, 0);
                            layer.time_mix_v2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V2, "weight", i), {n_lora_value_res_mix, n_embd}, 0);
                        }

                        layer.time_mix_g1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_G1, "weight", i), {n_embd, n_lora_gate}, 0);
                        layer.time_mix_g2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_G2, "weight", i), {n_lora_gate, n_embd}, 0);

                        // fused token-shift coefficients, 6 heads for RWKV7 (vs 5 for RWKV6)
                        layer.time_mix_lerp_fused = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_FUSED, "weight", i), {n_embd, 1, 1, 6}, 0);

                        layer.time_mix_k_k = create_tensor(tn(LLM_TENSOR_TIME_MIX_K_K, "weight", i), {attn_hidden_size}, 0);
                        layer.time_mix_k_a = create_tensor(tn(LLM_TENSOR_TIME_MIX_K_A, "weight", i), {attn_hidden_size}, 0);
                        layer.time_mix_r_k = create_tensor(tn(LLM_TENSOR_TIME_MIX_R_K, "weight", i), {attn_hidden_size}, 0);

                        layer.time_mix_key = create_tensor(tn(LLM_TENSOR_TIME_MIX_KEY, "weight", i), {attn_hidden_size, n_embd}, 0);
                        layer.time_mix_value = create_tensor(tn(LLM_TENSOR_TIME_MIX_VALUE, "weight", i), {attn_hidden_size, n_embd}, 0);
                        layer.time_mix_receptance = create_tensor(tn(LLM_TENSOR_TIME_MIX_RECEPTANCE, "weight", i), {attn_hidden_size, n_embd}, 0);

                        layer.time_mix_ln = create_tensor(tn(LLM_TENSOR_TIME_MIX_LN, "weight", i), {n_embd}, 0);
                        layer.time_mix_ln_b = create_tensor(tn(LLM_TENSOR_TIME_MIX_LN, "bias", i), {n_embd}, 0);
                        layer.time_mix_output = create_tensor(tn(LLM_TENSOR_TIME_MIX_OUTPUT, "weight", i), {n_embd, attn_hidden_size}, 0);

                        // channel-mix (FFN-like) branch; RWKV7 has no channel-mix receptance
                        layer.channel_mix_lerp_k = create_tensor(tn(LLM_TENSOR_CHANNEL_MIX_LERP_K, "weight", i), {n_embd, 1, 1}, 0);

                        layer.channel_mix_key = create_tensor(tn(LLM_TENSOR_CHANNEL_MIX_KEY, "weight", i), {n_embd, ffn_size}, 0);
                        layer.channel_mix_value = create_tensor(tn(LLM_TENSOR_CHANNEL_MIX_VALUE, "weight", i), {ffn_size, n_embd}, 0);
                    }

                } break;
5947 case LLM_ARCH_ARWKV7:
5948 {
5949 tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
5950
5951 // output
5952 output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
5953 output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
5954
5955 const int n_lora_decay = hparams.n_lora_decay;
5956 const int n_lora_iclr = hparams.n_lora_iclr;
5957 const int n_lora_value_res_mix = hparams.n_lora_value_res_mix;
5958 const int n_lora_gate = hparams.n_lora_gate;
5959 const int attn_hidden_size = n_embd;
5960
5961 for (int i = 0; i < n_layer; ++i) {
5962 auto & layer = layers[i];
5963
5964 layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
5965
5966 layer.time_mix_w0 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W0, "weight", i), {n_embd}, 0);
5967 layer.time_mix_w1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W1, "weight", i), {n_embd, n_lora_decay}, 0);
5968 layer.time_mix_w2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W2, "weight", i), {n_lora_decay, n_embd}, 0);
5969
5970 layer.time_mix_a0 = create_tensor(tn(LLM_TENSOR_TIME_MIX_A0, "weight", i), {n_embd}, 0);
5971 layer.time_mix_a1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_A1, "weight", i), {n_embd, n_lora_iclr}, 0);
5972 layer.time_mix_a2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_A2, "weight", i), {n_lora_iclr, n_embd}, 0);
5973
5974 if (i == 0) {
5975 // actually not used
5976 layer.time_mix_v0 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V0, "weight", i), {n_embd}, 0);
5977 layer.time_mix_v1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V1, "weight", i), {n_embd, n_lora_iclr}, 0);
5978 layer.time_mix_v2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V2, "weight", i), {n_lora_iclr, n_embd}, 0);
5979 } else {
5980 layer.time_mix_v0 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V0, "weight", i), {n_embd}, 0);
5981 layer.time_mix_v1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V1, "weight", i), {n_embd, n_lora_value_res_mix}, 0);
5982 layer.time_mix_v2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V2, "weight", i), {n_lora_value_res_mix, n_embd}, 0);
5983 }
5984
5985 layer.time_mix_g1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_G1, "weight", i), {n_embd, n_lora_gate}, TENSOR_NOT_REQUIRED);
5986 layer.time_mix_g2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_G2, "weight", i), {n_lora_gate, n_embd}, TENSOR_NOT_REQUIRED);
5987
5988 try {
5989 layer.time_mix_lerp_fused = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_FUSED, "weight", i), {n_embd, 1, 1, 6}, 0);
5990 } catch(std::runtime_error & e) {
5991 // ARWKV models may not have gate tensors
5992 layer.time_mix_lerp_fused = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_FUSED, "weight", i), {n_embd, 1, 1, 5}, 0);
5993 }
5994
5995 layer.time_mix_k_k = create_tensor(tn(LLM_TENSOR_TIME_MIX_K_K, "weight", i), {attn_hidden_size}, 0);
5996 layer.time_mix_k_a = create_tensor(tn(LLM_TENSOR_TIME_MIX_K_A, "weight", i), {attn_hidden_size}, 0);
5997 layer.time_mix_r_k = create_tensor(tn(LLM_TENSOR_TIME_MIX_R_K, "weight", i), {attn_hidden_size}, 0);
5998
5999 layer.time_mix_key = create_tensor(tn(LLM_TENSOR_TIME_MIX_KEY, "weight", i), {attn_hidden_size, n_embd}, 0);
6000 layer.time_mix_value = create_tensor(tn(LLM_TENSOR_TIME_MIX_VALUE, "weight", i), {attn_hidden_size, n_embd}, 0);
6001 layer.time_mix_receptance = create_tensor(tn(LLM_TENSOR_TIME_MIX_RECEPTANCE, "weight", i), {attn_hidden_size, n_embd}, 0);
6002
6003 layer.time_mix_ln = create_tensor(tn(LLM_TENSOR_TIME_MIX_LN, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
6004 layer.time_mix_ln_b = create_tensor(tn(LLM_TENSOR_TIME_MIX_LN, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
6005 layer.time_mix_output = create_tensor(tn(LLM_TENSOR_TIME_MIX_OUTPUT, "weight", i), {n_embd, attn_hidden_size}, 0);
6006
6007 layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
6008
6009 layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
6010 layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
6011 layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
6012 }
6013
6014 } break;
            case LLM_ARCH_CHAMELEON:
                {
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);

                    // output
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
                    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
                    // if output is NULL, init from the input tok embed (tied embeddings)
                    if (output == NULL) {
                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
                    }

                    for (int i = 0; i < n_layer; ++i) {
                        auto & layer = layers[i];

                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
                        // per-head Q/K norms ({head_dim, n_head(_kv)}); biases are optional
                        layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k, n_head}, 0);
                        layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k, n_head_kv}, 0);
                        layer.attn_q_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {n_embd_head_k, n_head}, TENSOR_NOT_REQUIRED);
                        layer.attn_k_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {n_embd_head_k, n_head_kv}, TENSOR_NOT_REQUIRED);

                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);

                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);

                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
                        layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
                    }
                } break;
6048 case LLM_ARCH_WAVTOKENIZER_DEC:
6049 {
6050 tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {hparams.n_embd, n_vocab}, 0);
6051
6052 conv1d = create_tensor(tn(LLM_TENSOR_CONV1D, "weight"), {7, hparams.n_embd, hparams.posnet.n_embd}, 0);
6053 conv1d_b = create_tensor(tn(LLM_TENSOR_CONV1D, "bias"), {1, hparams.posnet.n_embd}, 0);
6054
6055 // posnet
6056 {
6057 const int64_t n_embd = hparams.posnet.n_embd;
6058
6059 for (uint32_t i = 0; i < hparams.posnet.n_layer; ++i) {
6060 auto & layer = layers[i].posnet;
6061
6062 // posnet:
6063 //
6064 // - resnet
6065 // - resnet
6066 // - attn
6067 // - resnet
6068 // - resnet
6069 // - norm
6070 //
6071 switch (i) {
6072 case 0:
6073 case 1:
6074 case 3:
6075 case 4:
6076 {
6077 layer.norm1 = create_tensor(tn(LLM_TENSOR_POS_NET_NORM1, "weight", i), {1, n_embd}, 0);
6078 layer.norm1_b = create_tensor(tn(LLM_TENSOR_POS_NET_NORM1, "bias", i), {1, n_embd}, 0);
6079
6080 layer.conv1 = create_tensor(tn(LLM_TENSOR_POS_NET_CONV1, "weight", i), {3, n_embd, n_embd}, 0);
6081 layer.conv1_b = create_tensor(tn(LLM_TENSOR_POS_NET_CONV1, "bias", i), {1, n_embd}, 0);
6082
6083 layer.norm2 = create_tensor(tn(LLM_TENSOR_POS_NET_NORM2, "weight", i), {1, n_embd}, 0);
6084 layer.norm2_b = create_tensor(tn(LLM_TENSOR_POS_NET_NORM2, "bias", i), {1, n_embd}, 0);
6085
6086 layer.conv2 = create_tensor(tn(LLM_TENSOR_POS_NET_CONV2, "weight", i), {3, n_embd, n_embd}, 0);
6087 layer.conv2_b = create_tensor(tn(LLM_TENSOR_POS_NET_CONV2, "bias", i), {1, n_embd}, 0);
6088 } break;
6089 case 2:
6090 {
6091 layer.attn_norm = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "weight", i), {1, n_embd}, 0);
6092 layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "bias", i), {1, n_embd}, 0);
6093
6094 layer.attn_q = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_Q, "weight", i), {1, n_embd, n_embd}, 0);
6095 layer.attn_q_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_Q, "bias", i), {1, n_embd}, 0);
6096
6097 layer.attn_k = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_K, "weight", i), {1, n_embd, n_embd}, 0);
6098 layer.attn_k_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_K, "bias", i), {1, n_embd}, 0);
6099
6100 layer.attn_v = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_V, "weight", i), {1, n_embd, n_embd}, 0);
6101 layer.attn_v_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_V, "bias", i), {1, n_embd}, 0);
6102
6103 layer.attn_o = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_OUT, "weight", i), {1, n_embd, n_embd}, 0);
6104 layer.attn_o_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_OUT, "bias", i), {1, n_embd}, 0);
6105 } break;
6106 case 5:
6107 {
6108 layer.norm = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "weight", i), {1, n_embd}, 0);
6109 layer.norm_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "bias", i), {1, n_embd}, 0);
6110 } break;
6111 default: GGML_ABORT("unknown posnet layer");
6112 };
6113 }
6114 }
6115
6116 GGML_ASSERT(hparams.posnet.n_embd == hparams.convnext.n_embd);
6117
6118 tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {hparams.posnet.n_embd}, 0);
6119 tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {hparams.posnet.n_embd}, 0);
6120
6121 // convnext
6122 {
6123 const int64_t n_embd = hparams.convnext.n_embd;
6124
6125 for (uint32_t i = 0; i < hparams.convnext.n_layer; ++i) {
6126 auto & layer = layers[i].convnext;
6127
6128 layer.dw = create_tensor(tn(LLM_TENSOR_CONVNEXT_DW, "weight", i), {7, 1, n_embd}, 0);
6129 layer.dw_b = create_tensor(tn(LLM_TENSOR_CONVNEXT_DW, "bias", i), {1, n_embd}, 0);
6130
6131 layer.norm = create_tensor(tn(LLM_TENSOR_CONVNEXT_NORM, "weight", i), {n_embd}, 0);
6132 layer.norm_b = create_tensor(tn(LLM_TENSOR_CONVNEXT_NORM, "bias", i), {n_embd}, 0);
6133
6134 layer.pw1 = create_tensor(tn(LLM_TENSOR_CONVNEXT_PW1, "weight", i), {n_embd, n_ff}, 0);
6135 layer.pw1_b = create_tensor(tn(LLM_TENSOR_CONVNEXT_PW1, "bias", i), {n_ff}, 0);
6136
6137 layer.pw2 = create_tensor(tn(LLM_TENSOR_CONVNEXT_PW2, "weight", i), {n_ff, n_embd}, 0);
6138 layer.pw2_b = create_tensor(tn(LLM_TENSOR_CONVNEXT_PW2, "bias", i), {n_embd}, 0);
6139
6140 layer.gamma = create_tensor(tn(LLM_TENSOR_CONVNEXT_GAMMA, "weight", i), {n_embd}, 0);
6141 }
6142
6143 // output
6144 output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
6145 output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
6146 }
6147
6148 output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {hparams.convnext.n_embd, hparams.n_embd_out()}, 0);
6149 output_b = create_tensor(tn(LLM_TENSOR_OUTPUT, "bias"), {hparams.n_embd_out()}, 0);
6150 } break;
            case LLM_ARCH_BAILINGMOE:
                {
                    const int64_t n_ff_exp = hparams.n_ff_exp;
                    const int64_t n_expert_shared = hparams.n_expert_shared;

                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);

                    // output
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
                    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);

                    for (int i = 0; i < n_layer; ++i) {
                        auto & layer = layers[i];

                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);

                        // attention projections sized by rope dim (n_rot) rather than head dim
                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_head * n_rot}, 0);
                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_head_kv * n_rot}, 0);
                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_head_kv * n_rot}, 0);
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_head * n_rot, n_embd}, 0);
                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);

                        // MoE router
                        layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);

                        if (n_expert == 0) {
                            throw std::runtime_error("n_expert must be > 0");
                        }
                        if (n_expert_used == 0) {
                            throw std::runtime_error("n_expert_used must be > 0");
                        }

                        // routed experts
                        layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
                        layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
                        layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);

                        // shared experts (always active), width n_ff_exp * n_expert_shared
                        layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
                        layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { n_ff_exp * n_expert_shared, n_embd}, 0);
                        layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
                    }
                } break;
            case LLM_ARCH_BAILINGMOE2:
                {
                    const int64_t n_ff_exp = hparams.n_ff_exp;
                    const int64_t n_expert_shared = hparams.n_expert_shared;

                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);

                    // output
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
                    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);

                    GGML_ASSERT(n_expert > 0 && "n_expert must be > 0 for bailingmoe2");
                    GGML_ASSERT(n_expert_used > 0 && "n_expert_used must be > 0 for bailingmoe2");

                    for (int i = 0; i < n_layer; ++i) {
                        int flags = 0;
                        if (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers) {
                            // skip all tensors in the NextN layers
                            flags |= TENSOR_SKIP;
                        }

                        auto & layer = layers[i];

                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, flags);

                        // fused QKV projection + per-head Q/K norms
                        layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, flags);
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, flags);

                        layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, flags);
                        layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, flags);

                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, flags);

                        if (static_cast<uint32_t>(i) >= hparams.n_layer_dense_lead) { // MoE layers
                            // shared-expert width: explicit n_ff_shexp if set, else n_ff_exp, times the shared-expert count
                            const int64_t n_ff_shexp = (hparams.n_ff_shexp ? hparams.n_ff_shexp : n_ff_exp) * n_expert_shared;

                            // router + optional per-expert bias for the routing probabilities
                            layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, flags);
                            layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, TENSOR_NOT_REQUIRED | flags);

                            layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, flags);
                            layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, flags);
                            layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, flags);

                            layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_shexp}, flags);
                            layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {n_ff_shexp, n_embd}, flags);
                            layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff_shexp}, flags);
                        } else { // Dense layers
                            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, flags);
                            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, flags);
                            layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, flags);
                        }

                        // NextN/MTP tensors (preserved but unused) - conditionally load for last nextn_predict_layers
                        if (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers) {
                            layer.nextn.eh_proj = create_tensor(tn(LLM_TENSOR_NEXTN_EH_PROJ, "weight", i), { 2 * n_embd, n_embd }, flags);
                            layer.nextn.embed_tokens = create_tensor(tn(LLM_TENSOR_NEXTN_EMBED_TOKENS, "weight", i), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED | flags);
                            layer.nextn.enorm = create_tensor(tn(LLM_TENSOR_NEXTN_ENORM, "weight", i), { n_embd }, flags);
                            layer.nextn.hnorm = create_tensor(tn(LLM_TENSOR_NEXTN_HNORM, "weight", i), { n_embd }, flags);
                            layer.nextn.shared_head_head = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, "weight", i), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED | flags);
                            layer.nextn.shared_head_norm = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, "weight", i), { n_embd }, TENSOR_NOT_REQUIRED | flags);
                            layer.layer_out_norm = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd}, flags);
                        }
                    }
                } break;
            case LLM_ARCH_DOTS1:
                {
                    const int64_t n_ff_exp = hparams.n_ff_exp;
                    const int64_t n_expert_shared = hparams.n_expert_shared;

                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);

                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
                    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);

                    for (int i = 0; i < n_layer; ++i) {
                        auto & layer = layers[i];

                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);

                        // note: K and V projections use n_head (not n_head_kv) here
                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);

                        // per-head Q/K norms
                        layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
                        layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);

                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);

                        // first n_layer_dense_lead layers are dense, the rest are MoE
                        if (i < (int) hparams.n_layer_dense_lead) {
                            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
                            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
                            layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
                        } else {
                            // router + optional per-expert bias for the routing probabilities
                            layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
                            layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, TENSOR_NOT_REQUIRED);

                            if (n_expert == 0) {
                                throw std::runtime_error("n_expert must be > 0");
                            }
                            if (n_expert_used == 0) {
                                throw std::runtime_error("n_expert_used must be > 0");
                            }

                            // MoE branch
                            layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
                            layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
                            layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);

                            // Shared expert branch
                            layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
                            layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { n_ff_exp * n_expert_shared, n_embd}, 0);
                            layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
                        }
                    }
                } break;
6307 case LLM_ARCH_ARCEE:
6308 {
     // Arcee: LLaMA-like dense model; note there is no ffn_gate (up/down only).
6309 tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
6310
6311 // output
6312 output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
6313 output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
6314
6315 // if output is NULL, init from the input tok embed
6316 if (output == NULL) {
6317 output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
6318 }
6319
6320 for (int i = 0; i < n_layer; ++i) {
6321 auto & layer = layers[i];
6322
6323 layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
6324
6325 layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
6326 layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
6327 layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
6328 layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
6329
6330 layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
6331
     // optional rope frequencies; only layer 0 owns the tensor, later layers mark it duplicated
6332 layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
6333
6334 layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
6335 layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
6336 }
6337 } break;
6338 case LLM_ARCH_AFMOE:
6339 {
     // AFMoE: dual pre/post norms around both attention and FFN, gated attention,
     // dense lead layers followed by MoE layers with an optional shared expert.
6340 tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
6341
6342 // output
6343 output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
6344 output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
6345
6346 // if output is NULL, init from the input tok embed
6347 if (output == NULL) {
6348 output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
6349 }
6350
6351 const int64_t n_ff_exp = hparams.n_ff_exp;
6352 const int64_t n_expert_shared = hparams.n_expert_shared;
6353
6354 for (int i = 0; i < n_layer; ++i) {
6355 auto & layer = layers[i];
6356
6357 // dual attention normalization
6358 layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
6359 layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
6360
6361 // attention projections
6362 layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
6363 layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
6364 layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
6365 layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
6366
6367 // Q/K normalization
6368 layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
6369 layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
6370
6371 // attention gating
6372 layer.wqkv_gate = create_tensor(tn(LLM_TENSOR_ATTN_GATE, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
6373
6374 // dual ffn normalization
6375 layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
6376 layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
6377
6378 if (static_cast<uint32_t>(i) >= hparams.n_layer_dense_lead) {
6379 // MoE layers
     // note: routing bias is required here (flag 0), unlike arches that pass TENSOR_NOT_REQUIRED
6380 layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
6381 layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, 0);
6382
6383 // grouped expert weights
6384 layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, 0);
6385 layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
6386 layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, 0);
6387
6388 // shared expert
6389 if (n_expert_shared > 0) {
     // fused shared-expert FFN of width n_ff_exp * n_expert_shared
6390 const int64_t n_ff_shexp = n_ff_exp * n_expert_shared;
6391 layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_shexp}, 0);
6392 layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {n_ff_shexp, n_embd}, 0);
6393 layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff_shexp}, 0);
6394 }
6395 } else {
6396 // Dense layers
6397 layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
6398 layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
6399 layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
6400 }
6401 }
6402 } break;
6403 case LLM_ARCH_ERNIE4_5:
6404 case LLM_ARCH_ERNIE4_5_MOE:
6405 {
     // ERNIE 4.5: dense variant and MoE variant share this arm; the MoE variant
     // switches to expert FFNs after the first n_layer_dense_lead layers.
6406 tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
6407
6408 // output
6409 output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
6410 output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
6411 // if output is NULL, init from the input tok embed
6412 if (output == NULL) {
6413 output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
6414 }
6415
6416 for (int i = 0; i < n_layer; ++i) {
6417 auto & layer = layers[i];
6418
6419 layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
6420
6421 layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
6422 layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
6423 layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
6424 layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
6425
6426 // optional bias tensors
6427 layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
6428 layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
6429 layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
6430 layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
6431
6432 layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
6433
6434 if (arch == LLM_ARCH_ERNIE4_5_MOE && static_cast<uint32_t>(i) >= hparams.n_layer_dense_lead) { // MoE layers
     // const int64_t for consistency with the other MoE arms (DOTS1, OPENAI_MOE, ...)
     // and to avoid a silent narrowing of hparams.n_ff_exp to int
6435 const int64_t n_ff_exp = hparams.n_ff_exp;
6436
6437 layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
6438 layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, TENSOR_NOT_REQUIRED);
     // NOTE(review): gate_exps is optional (TENSOR_NOT_REQUIRED) while down/up are
     // required — presumably some exports omit the gate; confirm against converter
6439 layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, TENSOR_NOT_REQUIRED);
6440 layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff_exp, n_embd, n_expert}, 0);
6441 layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, 0);
6442
6443 // Shared expert (if present)
6444 if (hparams.n_ff_shexp > 0) {
6445 layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), { n_embd, hparams.n_ff_shexp}, 0);
6446 layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {hparams.n_ff_shexp, n_embd }, 0);
6447 layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), { n_embd, hparams.n_ff_shexp}, 0);
6448 }
6449 } else { // Dense layers
6450 layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
6451 layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
6452 layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
6453 }
6454 }
6455 } break;
6456 case LLM_ARCH_FALCON_H1:
6457 {
     // Falcon-H1: hybrid model — every layer carries both a Mamba-2 SSM mixer
     // and an attention block, plus a gated FFN with optional biases.
6458 // Common
6459 const int64_t hidden_size = hparams.n_embd; // hidden_size
6460
6461 // mamba2 Mixer SSM params
6462 const int64_t ssm_conv_kernel_size = hparams.ssm_d_conv; // ssm_conv_kernel_size
6463 const int64_t ssm_n_groups = hparams.ssm_n_group; // ssm_n_groups
6464 const int64_t ssm_state_size = hparams.ssm_d_state; // ssm_state_size
6465 const int64_t ssm_intermediate_size = hparams.ssm_d_inner; // TODO expand
6466 const int64_t ssm_num_heads = hparams.ssm_dt_rank; // ssm_num_heads
     // conv operates on [x; B; C] -> inner + 2 * groups * state
6467 const int64_t ssm_conv_dim = ssm_intermediate_size + 2 * ssm_n_groups * ssm_state_size;
     // in_proj produces [z; xBC; dt] -> inner + conv_dim + heads
6468 const int64_t ssm_projection_size = ssm_intermediate_size + ssm_conv_dim + ssm_num_heads;
6469
6470 // attn params
6471 const int64_t attn_num_attention_head = hparams.n_head(0); // rename to: attn_num_attention_head
6472 const int64_t attn_num_key_value_head = hparams.n_head_kv(0);
6473
6474 // ffn params
6475 const int64_t ffn_intermediate_size = hparams.n_ff(0);
6476
6477 // embeddings
6478 tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {hidden_size, n_vocab}, 0);
6479
6480 // output
6481 output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {hidden_size, n_vocab}, TENSOR_NOT_REQUIRED);
6482 output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {hidden_size}, 0);
6483
6484 // if output is NULL, init from the input tok embed
6485 if (output == NULL) {
6486 output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {hidden_size, n_vocab}, TENSOR_DUPLICATED);
6487 }
6488
6489 for (int i = 0; i < n_layer; ++i) {
6490 auto & layer = layers[i];
6491
6492 /*SSM LAYERS*/
6493 // ssm in
6494 layer.ssm_in = create_tensor(tn(LLM_TENSOR_SSM_IN, "weight", i), {hidden_size, ssm_projection_size}, 0);
6495 // ssm 1d conv
6496 layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), {ssm_conv_kernel_size, ssm_conv_dim}, 0);
6497 layer.ssm_conv1d_b = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "bias", i), {ssm_conv_dim}, TENSOR_NOT_REQUIRED);
6498 // ssm_dt
6499 layer.ssm_dt_b = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), {ssm_num_heads}, 0);
6500 // no "weight" suffix for these
6501 layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), {1, ssm_num_heads}, 0);
6502 layer.ssm_d = create_tensor(tn(LLM_TENSOR_SSM_D, i), {1, ssm_num_heads}, 0);
6503 // ssm_norm
     // grouped RMS norm over the SSM inner dim; optional in some exports
6504 layer.ssm_norm = create_tensor(tn(LLM_TENSOR_SSM_NORM, "weight", i), {ssm_intermediate_size / ssm_n_groups, ssm_n_groups}, TENSOR_NOT_REQUIRED);
6505 // out_proj
6506 layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), {ssm_intermediate_size, hidden_size}, 0);
6507
6508 /*ATTENTION LAYERS*/
6509 // attention layers (with optional bias)
6510 layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {hidden_size, n_embd_head_k * attn_num_attention_head}, 0);
6511 layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {hidden_size, attn_num_key_value_head * n_embd_head_k}, 0);
6512 layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {hidden_size, attn_num_key_value_head * n_embd_head_v}, 0);
6513 layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * attn_num_attention_head, hidden_size}, 0);
6514 layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {hidden_size}, TENSOR_NOT_REQUIRED);
6515 layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {attn_num_key_value_head * n_embd_head_k}, TENSOR_NOT_REQUIRED);
6516 layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {attn_num_key_value_head * n_embd_head_v}, TENSOR_NOT_REQUIRED);
6517 layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {hidden_size}, TENSOR_NOT_REQUIRED);
6518 layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {hidden_size}, 0);
6519
6520
6521 // feed forward (w/ optional biases)
     // note: ffn_norm name has no "weight" suffix for this arch
6522 layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, i), {hidden_size}, 0);
     // optional rope frequencies; only layer 0 owns the tensor, later layers are duplicates
6523 layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
6524 layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {hidden_size, ffn_intermediate_size}, 0);
6525 layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { ffn_intermediate_size, hidden_size}, 0);
6526 layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {hidden_size, ffn_intermediate_size}, 0);
6527
6528 layer.ffn_gate_b = create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), {ffn_intermediate_size}, TENSOR_NOT_REQUIRED);
6529 layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {hidden_size}, TENSOR_NOT_REQUIRED);
6530 layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {ffn_intermediate_size}, TENSOR_NOT_REQUIRED);
6531 }
6532 } break;
6533 case LLM_ARCH_HUNYUAN_MOE:
6534 {
     // Hunyuan MoE: every layer is MoE (expert width = n_ff) plus a shared
     // expert of width hparams.n_ff_shexp; Q/K norms are per-head.
6535 tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
6536
6537 // output
6538 output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
6539 output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
6540 // if output is NULL, init from the input tok embed
6541 if (output == NULL) {
6542 output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
6543 }
6544
6545 for (int i = 0; i < n_layer; ++i) {
6546 auto & layer = layers[i];
6547
6548 layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
6549
6550 layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
6551 layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
6552 layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
6553 layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
6554
6555 layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
6556 layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
6557
6558 layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
6559
     // experts use the full n_ff width (no separate n_ff_exp for this arch)
6560 layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
6561 layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
6562 layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert}, 0);
6563 layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
6564
6565 layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, hparams.n_ff_shexp}, 0);
6566 layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, hparams.n_ff_shexp}, 0);
6567 layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {hparams.n_ff_shexp, n_embd}, 0);
6568 }
6569 } break;
6570 case LLM_ARCH_HUNYUAN_DENSE:
6571 {
     // Hunyuan dense: same attention layout as HUNYUAN_MOE (incl. per-head
     // Q/K norms) but with a plain gated FFN instead of experts.
6572 tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
6573
6574 // output
6575 output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
6576 output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
6577 // if output is NULL, init from the input tok embed
6578 if (output == NULL) {
6579 output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
6580 }
6581
6582 for (int i = 0; i < n_layer; ++i) {
6583 auto & layer = layers[i];
6584
6585 layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
6586
6587 layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
6588 layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
6589 layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
6590 layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
6591
6592 layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
6593 layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
6594
6595 layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
6596
6597 layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
6598 layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
6599 layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
6600
6601 }
6602 } break;
6603 case LLM_ARCH_SMOLLM3:
6604 {
     // SmolLM3: standard LLaMA-style dense model (GQA attention + gated FFN).
6605 tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
6606
6607 // output
6608 output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
6609 output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
6610
6611 // if output is NULL, init from the input tok embed
6612 if (output == NULL) {
6613 output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
6614 }
6615
6616 for (int i = 0; i < n_layer; ++i) {
6617 auto & layer = layers[i];
6618
6619 layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
6620
6621 layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
6622 layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
6623 layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
6624 layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
6625
6626 layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
6627 layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
6628 layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
6629 layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
6630 }
6631 } break;
6632 case LLM_ARCH_OPENAI_MOE:
6633 {
     // gpt-oss style MoE: attention sinks, per-expert biases on all expert
     // matrices, and required biases on the attention projections.
6634 const int64_t n_ff_exp = hparams.n_ff_exp;
6635
6636 tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
6637
6638 // output
6639 output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
6640 output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
6641
6642 for (int i = 0; i < n_layer; ++i) {
6643 auto & layer = layers[i];
6644
6645 layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
6646 layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
6647
     // head dims expressed via n_rot for this arch
6648 layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_head * n_rot}, 0);
6649 layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_head_kv * n_rot}, 0);
6650 layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_head_kv * n_rot}, 0);
6651 layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_head * n_rot, n_embd}, 0);
6652
     // one learned attention-sink logit per head
6653 layer.attn_sinks = create_tensor(tn(LLM_TENSOR_ATTN_SINKS, "weight", i), {n_head}, 0);
6654
6655 layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), { n_embd, n_expert}, 0);
6656 layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
6657 layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
6658 layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
6659
6660 // bias
6661 layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_head * n_rot}, 0);
6662 layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_head_kv * n_rot}, 0);
6663 layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_head_kv * n_rot}, 0);
6664 layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
6665
     // per-expert biases; down-proj bias is per output dim {n_embd, n_expert}
6666 layer.ffn_gate_inp_b = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "bias", i), {n_expert}, 0);
6667 layer.ffn_gate_exps_b = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "bias", i), {n_ff_exp, n_expert}, 0);
6668 layer.ffn_down_exps_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "bias", i), { n_embd, n_expert}, 0);
6669 layer.ffn_up_exps_b = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "bias", i), {n_ff_exp, n_expert}, 0);
6670 }
6671 } break;
6672 case LLM_ARCH_LFM2:
6673 case LLM_ARCH_LFM2MOE:
6674 {
     // LFM2 / LFM2-MoE: hybrid layers — recurrent layers use a short
     // convolution mixer, the rest use attention; MoE kicks in after the
     // dense lead layers (LFM2MoE only; is_moe_layer is false for plain LFM2
     // when n_layer_dense_lead covers all layers — TODO confirm).
6675 tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
6676
     // LFM2 uses its own output-norm tensor name
6677 output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM_LFM2, "weight"), {n_embd}, 0);
6678 output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
6679
6680 if (output == NULL) {
6681 output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
6682 }
6683
6684 for (int i = 0; i < n_layer; ++i) {
6685 auto & layer = layers[i];
6686
6687 const bool is_moe_layer = i >= static_cast<int>(hparams.n_layer_dense_lead);
6688
6689 // ffn/moe is same for transformer and conv layers
6690 layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
6691 if (is_moe_layer) {
6692 GGML_ASSERT(n_expert && n_expert_used);
6693 layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
6694 layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, hparams.n_ff_exp, n_expert}, 0);
6695 layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {hparams.n_ff_exp, n_embd, n_expert}, 0);
6696 layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, hparams.n_ff_exp, n_expert}, 0);
6697 layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, 0);
6698 } else { // dense
6699 layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
6700 layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
6701 layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
6702 }
6703
6704 // for operator_norm
6705 layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
6706
     // attention layer vs. recurrent (short-conv) layer, decided per layer
6707 if (!hparams.is_recurrent(i)) {
6708 layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
6709 layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
6710 GGML_ASSERT(n_embd_v_gqa == n_embd_k_gqa);
6711
     // K/V widths can differ per layer, hence the per-layer hparams lookups
6712 layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
6713 layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, hparams.n_embd_k_gqa(i)}, 0);
6714 layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, hparams.n_embd_v_gqa(i)}, 0);
6715
6716 layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
6717 } else {
     // short convolution mixer: conv kernel + in/out projections
6718 layer.shortconv.conv = create_tensor(tn(LLM_TENSOR_SHORTCONV_CONV, "weight", i), {hparams.n_shortconv_l_cache, n_embd}, 0);
6719 layer.shortconv.in_proj = create_tensor(tn(LLM_TENSOR_SHORTCONV_INPROJ, "weight", i), {n_embd, 3 * n_embd}, 0);
6720 layer.shortconv.out_proj = create_tensor(tn(LLM_TENSOR_SHORTCONV_OUTPROJ, "weight", i), {n_embd, n_embd}, 0);
6721 }
6722 }
6723
6724 // for LFM2-ColBert-350M
     // optional dense projection head used by the embedding variant
6725 dense_2_out_layers = create_tensor(tn(LLM_TENSOR_DENSE_2_OUT, "weight"), {n_embd, hparams.n_embd_out()}, TENSOR_NOT_REQUIRED);
6726 } break;
6727 case LLM_ARCH_SMALLTHINKER:
6728 {
     // SmallThinker: all layers are MoE (no dense lead, no shared expert).
6729 tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);
6730
6731 // output
6732 output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
6733 output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
6734
6735 // if output is NULL, init from the input tok embed
6736 if (output == NULL) {
6737 output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
6738 }
6739
6740 for (int i = 0; i < n_layer; ++i) {
6741 auto & layer = layers[i];
6742
6743 layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0);
6744
6745 layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), { n_embd, n_embd_head_k * n_head }, 0);
6746 layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), { n_embd, n_embd_gqa }, 0);
6747 layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), { n_embd, n_embd_gqa }, 0);
6748 layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_k * n_head, n_embd }, 0);
6749
6750 layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), { n_embd }, 0);
6751
     // validate MoE hparams before creating 3D expert tensors
6752 GGML_ASSERT(n_expert > 0 && "n_expert must be > 0 for SMALLTHINKER");
6753 GGML_ASSERT(n_expert_used > 0 && "n_expert_used must be > 0 for SMALLTHINKER");
6754
6755 // MoE branch
6756 const int64_t n_ff_exp = hparams.n_ff_exp;
6757 layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), { n_embd, n_expert }, 0);
6758 layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert }, 0);
6759 layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff_exp, n_embd, n_expert }, 0);
6760 layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert }, 0);
6761 }
6762 } break;
6763 case LLM_ARCH_GROVEMOE:
6764 {
     // GroveMoE: regular routed experts plus "chunk" experts, where experts
     // are grouped into n_expert / n_group_experts chunks.
6765 tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
6766
6767 // output
6768 output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
6769 output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
6770 // if output is NULL, init from the input tok embed
6771 if (output == NULL) {
6772 output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
6773 }
6774
     // validated once up front (also guards the division by n_group_experts below)
6775 GGML_ASSERT(n_expert > 0 && "n_expert must be > 0 for GROVEMOE");
6776 GGML_ASSERT(n_expert_used > 0 && "n_expert_used must be > 0 for GROVEMOE");
6777 GGML_ASSERT(hparams.n_group_experts > 0 && "n_group_experts must be > 0 for GROVEMOE");
6778
6779 for (int i = 0; i < n_layer; ++i) {
6780 auto & layer = layers[i];
6781
6782 layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
6783
6784 layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
6785 layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
6786 layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
6787 layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
6788
6789 layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
6790 layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
6791
6792 layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
6793
6794 layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
6795
6796 // MoE branch
     // fall back to derived defaults when the hparams are unset (0)
6797 const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used;
6798 const int64_t n_ff_chexp = hparams.n_ff_chexp ? hparams.n_ff_chexp : n_embd_head_k;
6799 const int64_t n_chunk_expert = n_expert / hparams.n_group_experts;
6800
6801 layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
6802 layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
6803 layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
6804
     // chunk-expert FFNs (one per expert group)
6805 layer.ffn_gate_chexps = create_tensor(tn(LLM_TENSOR_FFN_GATE_CHEXPS, "weight", i), { n_embd, n_ff_chexp, n_chunk_expert}, 0);
6806 layer.ffn_down_chexps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_CHEXPS, "weight", i), {n_ff_chexp, n_embd, n_chunk_expert}, 0);
6807 layer.ffn_up_chexps = create_tensor(tn(LLM_TENSOR_FFN_UP_CHEXPS, "weight", i), { n_embd, n_ff_chexp, n_chunk_expert}, 0);
6808 }
6809 } break;
6810 case LLM_ARCH_APERTUS:
6811 {
     // Apertus: dense model with Q/K layernorms (weight + optional bias) and
     // an FFN without a gate tensor (up/down only).
6812 tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);
6813
6814 // output
6815 output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
6816 output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), { n_embd, n_vocab }, 0);
6817
6818 for (int i = 0; i < n_layer; ++i) {
6819 auto & layer = layers[i];
6820
6821 layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0);
6822
     // rope factor tensors depend on the trained scaling type; only layer 0
     // owns them, later layers are marked as duplicates
6823 if (hparams.rope_scaling_type_train == LLAMA_ROPE_SCALING_TYPE_LONGROPE) {
6824 layer.rope_long = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight", i), { n_rot/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
6825 layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), { n_rot/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
6826 } else {
6827 layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), { n_rot/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
6828 }
6829
6830 layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), { n_embd, n_embd_head_k * n_head }, 0);
6831 layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), { n_embd, n_embd_gqa }, 0);
6832 layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), { n_embd, n_embd_gqa }, 0);
6833 layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_k * n_head, n_embd }, 0);
6834
6835 // optional bias tensors
6836 layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), { n_embd }, TENSOR_NOT_REQUIRED);
6837 layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), { n_embd_gqa }, TENSOR_NOT_REQUIRED);
6838 layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), { n_embd_gqa }, TENSOR_NOT_REQUIRED);
6839 layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), { n_embd }, TENSOR_NOT_REQUIRED);
6840
6841 layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), { n_embd }, 0);
6842 layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd }, 0);
6843 layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), { n_embd, n_ff }, 0);
6844
6845 // Q and K layernorms for Apertus
6846 layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), { n_embd_head_k }, 0);
6847 layer.attn_q_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), { n_embd_head_k }, TENSOR_NOT_REQUIRED);
6848 layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), { n_embd_head_k }, 0);
6849 layer.attn_k_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), { n_embd_head_k }, TENSOR_NOT_REQUIRED);
6850 }
6851 } break;
6852 case LLM_ARCH_MINIMAX_M2:
6853 {
6854 tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
6855
6856 // output
6857 output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
6858 output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
6859
6860 for (int i = 0; i < n_layer; ++i) {
6861 auto & layer = layers[i];
6862
6863 layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), { n_embd, n_embd_head_k * n_head }, 0);
6864 layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), { n_embd, n_embd_gqa }, 0);
6865 layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), { n_embd, n_embd_gqa }, 0);
6866 layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_k * n_head, n_embd }, 0);
6867
6868 layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
6869 layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k * n_head}, 0);
6870 layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_k_gqa}, 0);
6871
6872 layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
6873
6874 layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
6875 layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
6876 layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff, n_embd, n_expert}, 0);
6877 layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
6878 layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, 0);
6879 }
6880 } break;
case LLM_ARCH_KIMI_LINEAR:
    {
        tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);

        // output
        output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
        output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);

        for (int i = 0; i < n_layer; ++i) {
            auto & layer = layers[i];

            layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);

            // Check for KDA specific tensors to determine layer type or if it's a mixed model
            // Assuming KDA layer if KDA tensors are present

            // KDA uses head_dim = 128 (from linear_attn_config.head_dim)
            // note: K and V head dims are taken from the same hparam here
            const int64_t n_embd_head_k_kda = hparams.n_embd_head_kda;
            const int64_t n_embd_head_v_kda = hparams.n_embd_head_kda;
            const int64_t ssm_d_conv = hparams.ssm_d_conv;

            // Try loading KDA specific tensors (using SSM_ prefix)
            // Conv1d weights: try 4D first, then 3D (quantization may remove trailing 1)
            // 4D: [d_conv, 1, d_inner, 1], 3D: [d_conv, 1, d_inner]
            // Both probes are optional: the presence/absence of ssm_q_conv is what
            // selects the KDA vs MLA branch below for this layer.
            layer.ssm_q_conv = create_tensor(tn(LLM_TENSOR_SSM_CONV1D_Q, "weight", i), {ssm_d_conv, 1, n_embd_head_k_kda * n_head, 1}, TENSOR_NOT_REQUIRED);
            if (!layer.ssm_q_conv) {
                layer.ssm_q_conv = create_tensor(tn(LLM_TENSOR_SSM_CONV1D_Q, "weight", i), {ssm_d_conv, 1, n_embd_head_k_kda * n_head}, TENSOR_NOT_REQUIRED);
            }

            if (layer.ssm_q_conv) {
                // KDA Layer - Conv1d weights may be 3D or 4D
                // unlike the q_conv probe, the 3D fallbacks for k/v are required (flag 0):
                // once a KDA layer is detected, its conv weights must all exist
                layer.ssm_k_conv = create_tensor(tn(LLM_TENSOR_SSM_CONV1D_K, "weight", i), {ssm_d_conv, 1, n_embd_head_k_kda * n_head, 1}, TENSOR_NOT_REQUIRED);
                if (!layer.ssm_k_conv) {
                    layer.ssm_k_conv = create_tensor(tn(LLM_TENSOR_SSM_CONV1D_K, "weight", i), {ssm_d_conv, 1, n_embd_head_k_kda * n_head}, 0);
                }
                layer.ssm_v_conv = create_tensor(tn(LLM_TENSOR_SSM_CONV1D_V, "weight", i), {ssm_d_conv, 1, n_embd_head_v_kda * n_head, 1}, TENSOR_NOT_REQUIRED);
                if (!layer.ssm_v_conv) {
                    layer.ssm_v_conv = create_tensor(tn(LLM_TENSOR_SSM_CONV1D_V, "weight", i), {ssm_d_conv, 1, n_embd_head_v_kda * n_head}, 0);
                }

                // q, k, v projections
                // Python: q_proj, k_proj, v_proj
                layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k_kda * n_head}, 0);
                layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_head_k_kda * n_head}, 0);
                layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_head_v_kda * n_head}, 0);

                // KDA specific projections
                // f_a_proj, f_b_proj (low-rank factorization: n_embd -> head_dim -> full width)
                layer.ssm_f_a = create_tensor(tn(LLM_TENSOR_SSM_F_A, "weight", i), {n_embd, n_embd_head_k_kda}, 0); // head_dim
                layer.ssm_f_b = create_tensor(tn(LLM_TENSOR_SSM_F_B, "weight", i), {n_embd_head_k_kda, n_embd_head_k_kda * n_head}, 0); // projection_size

                // b_proj (beta mixing coefficient)
                layer.ssm_beta = create_tensor(tn(LLM_TENSOR_SSM_BETA, "weight", i), {n_embd, n_head}, 0);

                // A_log - Shape in GGUF: [1, num_heads, 1, 1] (4D) or [1, num_heads] (2D after quantization) Note: -exp(A_log) is applied in convert_hf_to_gguf.py
                // same optional-4D-then-required-2D probing pattern as the conv weights above
                layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), {1, n_head, 1, 1}, TENSOR_NOT_REQUIRED);
                if (!layer.ssm_a) {
                    layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), {1, n_head}, 0);
                }

                // dt_bias - shape [n_embd_head_k_kda * n_head] = [4096]
                layer.ssm_dt_b = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), {n_embd_head_k_kda * n_head}, 0);

                // g_a_proj, g_b_proj (output gate)
                layer.ssm_g_a = create_tensor(tn(LLM_TENSOR_SSM_G_A, "weight", i), {n_embd, n_embd_head_k_kda}, 0);
                layer.ssm_g_b = create_tensor(tn(LLM_TENSOR_SSM_G_B, "weight", i), {n_embd_head_k_kda, n_embd_head_k_kda * n_head}, 0);

                // o_norm (reusing SSM_NORM)
                layer.ssm_o_norm = create_tensor(tn(LLM_TENSOR_SSM_NORM, "weight", i), {n_embd_head_k_kda}, 0); // FusedRMSNormGated

                // o_proj
                layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_v_kda * n_head, n_embd}, 0);

            } else {
                // MLA Layer - use MLA-specific head dimensions
                const int64_t q_lora_rank = hparams.n_lora_q;
                const int64_t kv_lora_rank = hparams.n_lora_kv;
                const int64_t n_embd_head_k_mla = hparams.n_embd_head_k_mla();
                const int64_t n_embd_head_v_mla = hparams.n_embd_head_v_mla();

                // attn_q_a_norm is optional and doubles as the "is Q compressed?" discriminator below
                layer.attn_q_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_A_NORM, "weight", i), {q_lora_rank}, TENSOR_NOT_REQUIRED);
                layer.attn_kv_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_NORM, "weight", i), {kv_lora_rank}, 0);

                if (layer.attn_q_a_norm) {
                    // low-rank (compressed) Q path: n_embd -> q_lora_rank -> full head width
                    layer.wq_a = create_tensor(tn(LLM_TENSOR_ATTN_Q_A, "weight", i), {n_embd, q_lora_rank}, 0);
                    layer.wq_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_B, "weight", i), {q_lora_rank, n_head * n_embd_head_k_mla}, 0);
                } else {
                    // Kimi MLA without Q compression: wq = [n_embd, n_head * n_embd_head_k_mla]
                    layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_head * n_embd_head_k_mla}, 0);
                }

                // Kimi: qk_rope_head_dim = 64 (actual RoPE dimension for MLA)
                // Note: hparams.n_rot may be 72 (from conversion) but actual is 64
                const int64_t qk_rope_head_dim = hparams.n_rot; // From config: qk_rope_head_dim
                layer.wkv_a_mqa = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_MQA, "weight", i), {n_embd, kv_lora_rank + qk_rope_head_dim}, 0);
                // Support Legacy GGUFs that don't split wkv_b (MLA KV cache disabled)
                layer.wkv_b = create_tensor(tn(LLM_TENSOR_ATTN_KV_B, "weight", i), {kv_lora_rank, n_head * (n_embd_head_k_mla - qk_rope_head_dim + n_embd_head_v_mla)}, TENSOR_NOT_REQUIRED);
                if (!layer.wkv_b) { // MLA KV cache enabled
                    // new-style GGUFs provide the split K/V up-projections instead
                    layer.wk_b = create_tensor(tn(LLM_TENSOR_ATTN_K_B, "weight", i), {n_embd_head_k_mla - qk_rope_head_dim, kv_lora_rank, n_head}, 0);
                    layer.wv_b = create_tensor(tn(LLM_TENSOR_ATTN_V_B, "weight", i), {kv_lora_rank, n_embd_head_v_mla, n_head}, 0);
                }
                layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_head * n_embd_head_v_mla, n_embd}, 0);
            }

            layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);

            // MoE intermediate size (different from dense FFN)
            const int64_t n_ff_exp = hparams.n_ff_exp;

            // Kimi uses n_layer_dense_lead to determine which layers use dense FFN vs MoE
            // first_k_dense_replace = 1 means layer 0 uses dense FFN, layers 1+ use MoE
            if (i < (int) hparams.n_layer_dense_lead) {
                // Dense FFN layer - use normal n_ff
                layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
                layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
                layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
            } else {
                // MoE layer - use n_ff_exp (1024) instead of n_ff (9216)
                layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
                layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, 0);
                layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
                layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, 0);

                // Shared experts use moe_intermediate_size * num_shared_experts
                // Kimi: shared_expert_intermediate_size = 1024 * 1 = 1024
                // Tensors are 2D: [n_embd, n_ff_shexp] or [n_ff_shexp, n_embd]
                // guarded against n_expert_shared == 0 so the shexp width never collapses to 0
                const int64_t n_ff_shexp_actual = n_ff_exp * (hparams.n_expert_shared > 0 ? hparams.n_expert_shared : 1);
                // shared-expert tensors are optional; the routed-expert bias is required
                layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_shexp_actual}, TENSOR_NOT_REQUIRED);
                layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {n_ff_shexp_actual, n_embd}, TENSOR_NOT_REQUIRED);
                layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff_shexp_actual}, TENSOR_NOT_REQUIRED);

                layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, 0);
            }
        }
    } break;
7016 case LLM_ARCH_COGVLM:
7017 {
7018 tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
7019
7020 // output
7021 output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
7022 output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
7023
7024 // if output is NULL, init from the input tok embed
7025 if (output == NULL) {
7026 output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
7027 }
7028
7029 for (int i = 0; i < n_layer; ++i) {
7030 auto & layer = layers[i];
7031
7032 layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
7033 layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd_head_k * n_head * 3}, 0);
7034 layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
7035
7036 layer.visexp_attn_wqkv = create_tensor(tn(LLM_TENSOR_VISEXP_ATTN_QKV, "weight", i), {n_embd, n_embd_head_k * n_head * 3}, 0);
7037 layer.visexp_attn_wo = create_tensor(tn(LLM_TENSOR_VISEXP_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
7038
7039 layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
7040
7041 layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
7042 layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
7043 layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
7044 layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
7045
7046 layer.visexp_ffn_gate = create_tensor(tn(LLM_TENSOR_VISEXP_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
7047 layer.visexp_ffn_down = create_tensor(tn(LLM_TENSOR_VISEXP_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
7048 layer.visexp_ffn_up = create_tensor(tn(LLM_TENSOR_VISEXP_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
7049 }
7050 } break;
case LLM_ARCH_PANGU_EMBED:
    {
        tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);

        // output
        output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
        output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);

        // if output is NULL, init from the input tok embed
        if (output == NULL) {
            output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
        }

        for (int i = 0; i < n_layer; ++i) {
            auto & layer = layers[i];

            layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);

            // weight tensors
            layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
            layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
            layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);

            // bias tensors (required for this arch, unlike the optional biases of most others)
            // bq spans the full query width; bk/bv span the grouped (GQA) width
            layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd_head_k * n_head}, 0);
            layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, 0);
            layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, 0);
            layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);

            layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);

            // optional rope factor tensors: only layer 0 owns them, later layers get duplicates
            if (hparams.rope_scaling_type_train == LLAMA_ROPE_SCALING_TYPE_LONGROPE) {
                layer.rope_long = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
                layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
            } else {
                layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
            }

            // dense gated FFN
            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
            layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
        }
    } break;
case LLM_ARCH_QWEN3NEXT:
    {
        tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);

        // output
        output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
        output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED);

        // if output is NULL, init from the input tok embed
        if (output == NULL) {
            output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, TENSOR_DUPLICATED);
        }

        // per-expert FFN size; fall back to an even split of n_ff when not set
        const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used;

        // Calculate dimensions from hyperparameters
        // (the SSM hparams are reused to carry the gated-delta-net head geometry)
        const int64_t head_k_dim = hparams.ssm_d_state;
        const int64_t head_v_dim = hparams.ssm_d_state;
        const int64_t n_k_heads = hparams.ssm_n_group;
        const int64_t n_v_heads = hparams.ssm_dt_rank;
        const int64_t key_dim = head_k_dim * n_k_heads;
        const int64_t value_dim = head_v_dim * n_v_heads;
        const int64_t conv_dim = key_dim * 2 + value_dim;   // conv acts on q + k + v

        // Calculate projection sizes
        const int64_t qkvz_dim = key_dim * 2 + value_dim * 2;   // fused q,k,v,z in-projection (legacy)
        const int64_t ba_dim = n_v_heads * 2;                   // fused beta+alpha per v-head

        for (int i = 0; i < n_layer; ++i) {
            auto & layer = layers[i];

            layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0);
            layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), { n_embd }, 0);

            if (!hparams.is_recurrent(i)) {
                // Attention layers
                // wq output is doubled: it carries the query and its per-head gate
                layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), { n_embd, n_embd_head_k * n_head * 2 }, 0);
                layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), { n_embd, n_embd_k_gqa }, 0);
                layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), { n_embd, n_embd_v_gqa }, 0);
                layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_k * n_head, n_embd }, 0);

                // Q/K normalization for attention layers
                layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), { n_embd_head_k }, 0);
                layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), { n_embd_head_k }, 0);
            } else {
                // Linear attention (gated delta net) specific tensors
                // Create tensors with calculated dimensions
                // note: ssm_in is used by legacy GGUF
                // either the fused ssm_in OR the split wqkv/wqkv_gate pair is present,
                // hence all three are TENSOR_NOT_REQUIRED
                layer.ssm_in = create_tensor(tn(LLM_TENSOR_SSM_IN, "weight", i), { n_embd, qkvz_dim }, TENSOR_NOT_REQUIRED);
                layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), { n_embd, key_dim * 2 + value_dim }, TENSOR_NOT_REQUIRED);
                layer.wqkv_gate = create_tensor(tn(LLM_TENSOR_ATTN_GATE, "weight", i), { n_embd, value_dim }, TENSOR_NOT_REQUIRED);
                layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), { hparams.ssm_d_conv, conv_dim }, 0);
                layer.ssm_dt = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), { hparams.ssm_dt_rank }, 0);
                layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A_NOSCAN, i), { hparams.ssm_dt_rank }, 0);
                layer.ssm_beta_alpha = create_tensor(tn(LLM_TENSOR_SSM_BETA_ALPHA, "weight", i), { n_embd, ba_dim }, 0);
                layer.ssm_norm = create_tensor(tn(LLM_TENSOR_SSM_NORM, "weight", i), { head_v_dim }, 0);
                layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), { value_dim, n_embd }, 0);
            }

            // MoE FFN (every layer, regardless of attention type)
            layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), { n_embd, n_expert }, 0);
            layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert }, 0);
            layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff_exp, n_embd, n_expert }, 0);
            layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert }, 0);

            // Shared experts
            layer.ffn_gate_inp_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP_SHEXP, "weight", i), { n_embd }, 0);
            layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), { n_embd, hparams.n_ff_shexp }, 0);
            layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), { n_embd, hparams.n_ff_shexp }, 0);
            layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { hparams.n_ff_shexp, n_embd }, 0);
        }
    } break;
case LLM_ARCH_QWEN35MOE:
    {
        tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);

        // output
        output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
        output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED);

        // if output is NULL, init from the input tok embed
        if (output == NULL) {
            output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, TENSOR_DUPLICATED);
        }

        // per-expert FFN size; fall back to an even split of n_ff when not set
        const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used;

        // Calculate dimensions from hyperparameters
        // (the SSM hparams are reused to carry the gated-delta-net head geometry)
        const int64_t head_k_dim = hparams.ssm_d_state;
        const int64_t head_v_dim = hparams.ssm_d_state;
        const int64_t n_k_heads = hparams.ssm_n_group;
        const int64_t n_v_heads = hparams.ssm_dt_rank;
        const int64_t key_dim = head_k_dim * n_k_heads;
        const int64_t value_dim = head_v_dim * n_v_heads;
        const int64_t conv_dim = key_dim * 2 + value_dim;   // conv acts on q + k + v

        for (int i = 0; i < n_layer; ++i) {
            auto & layer = layers[i];

            layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0);
            layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), { n_embd }, 0);

            if (!hparams.is_recurrent(i)) {
                // Attention layers
                // wq output is doubled: it carries the query and its per-head gate
                layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), { n_embd, n_embd_head_k * n_head * 2 }, 0);
                layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), { n_embd, n_embd_k_gqa }, 0);
                layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), { n_embd, n_embd_v_gqa }, 0);
                layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_k * n_head, n_embd }, 0);

                // Q/K normalization for attention layers
                layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), { n_embd_head_k }, 0);
                layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), { n_embd_head_k }, 0);
            } else {
                // Linear attention (gated delta net) specific tensors
                // Create tensors with calculated dimensions
                // note: unlike QWEN3NEXT, beta and alpha are separate tensors here
                layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), { n_embd, key_dim * 2 + value_dim }, TENSOR_NOT_REQUIRED);
                layer.wqkv_gate = create_tensor(tn(LLM_TENSOR_ATTN_GATE, "weight", i), { n_embd, value_dim }, TENSOR_NOT_REQUIRED);
                layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), { hparams.ssm_d_conv, conv_dim }, 0);
                layer.ssm_dt = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), { hparams.ssm_dt_rank }, 0);
                layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A_NOSCAN, i), { hparams.ssm_dt_rank }, 0);
                layer.ssm_beta = create_tensor(tn(LLM_TENSOR_SSM_BETA, "weight", i), { n_embd, n_v_heads }, 0);
                layer.ssm_alpha = create_tensor(tn(LLM_TENSOR_SSM_ALPHA, "weight", i), { n_embd, n_v_heads }, 0);
                layer.ssm_norm = create_tensor(tn(LLM_TENSOR_SSM_NORM, "weight", i), { head_v_dim }, 0);
                layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), { value_dim, n_embd }, 0);
            }

            // MoE FFN (every layer, regardless of attention type)
            layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), { n_embd, n_expert }, 0);
            layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert }, 0);
            layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff_exp, n_embd, n_expert }, 0);
            layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert }, 0);

            // Shared experts
            // fall back to the dense FFN width when n_ff_shexp is not set
            const int64_t n_ff_shexp = hparams.n_ff_shexp ? hparams.n_ff_shexp : n_ff;

            layer.ffn_gate_inp_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP_SHEXP, "weight", i), { n_embd }, 0);
            layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), { n_embd, n_ff_shexp }, 0);
            layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), { n_embd, n_ff_shexp }, 0);
            layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { n_ff_shexp, n_embd }, 0);
        }
    } break;
case LLM_ARCH_QWEN35:
    {
        tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);

        // output
        output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
        output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED);

        // if output is NULL, init from the input tok embed
        if (output == NULL) {
            output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, TENSOR_DUPLICATED);
        }

        // Calculate dimensions from hyperparameters
        // (the SSM hparams are reused to carry the gated-delta-net head geometry)
        const int64_t head_k_dim = hparams.ssm_d_state;
        const int64_t head_v_dim = hparams.ssm_d_state;
        const int64_t n_k_heads = hparams.ssm_n_group;
        const int64_t n_v_heads = hparams.ssm_dt_rank;
        const int64_t key_dim = head_k_dim * n_k_heads;
        const int64_t value_dim = head_v_dim * n_v_heads;
        const int64_t conv_dim = key_dim * 2 + value_dim;   // conv acts on q + k + v

        for (int i = 0; i < n_layer; ++i) {
            auto & layer = layers[i];

            layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0);
            layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), { n_embd }, 0);

            if (!hparams.is_recurrent(i)) {
                // Attention layers
                // wq output is doubled: it carries the query and its per-head gate
                layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), { n_embd, n_embd_head_k * n_head * 2 }, 0);
                layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), { n_embd, n_embd_k_gqa }, 0);
                layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), { n_embd, n_embd_v_gqa }, 0);
                layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_k * n_head, n_embd }, 0);

                // Q/K normalization for attention layers
                layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), { n_embd_head_k }, 0);
                layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), { n_embd_head_k }, 0);
            } else {
                // Linear attention (gated delta net) specific tensors
                // Create tensors with calculated dimensions
                layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), { n_embd, key_dim * 2 + value_dim }, TENSOR_NOT_REQUIRED);
                layer.wqkv_gate = create_tensor(tn(LLM_TENSOR_ATTN_GATE, "weight", i), { n_embd, value_dim }, TENSOR_NOT_REQUIRED);
                layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), { hparams.ssm_d_conv, conv_dim }, 0);
                layer.ssm_dt = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), { hparams.ssm_dt_rank }, 0);
                layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A_NOSCAN, i), { hparams.ssm_dt_rank }, 0);
                layer.ssm_beta = create_tensor(tn(LLM_TENSOR_SSM_BETA, "weight", i), { n_embd, n_v_heads }, 0);
                layer.ssm_alpha = create_tensor(tn(LLM_TENSOR_SSM_ALPHA, "weight", i), { n_embd, n_v_heads }, 0);
                layer.ssm_norm = create_tensor(tn(LLM_TENSOR_SSM_NORM, "weight", i), { head_v_dim }, 0);
                layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), { value_dim, n_embd }, 0);
            }

            // dense gated FFN (this is the non-MoE sibling of QWEN35MOE)
            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
            layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
        }
    } break;
case LLM_ARCH_MIMO2:
    {
        tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);

        // output
        output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
        output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);

        for (int i = 0; i < n_layer; ++i) {
            auto & layer = layers[i];
            // per-layer head/GQA geometry (shadows the model-wide values on purpose)
            uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(i);
            uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(i);
            uint32_t n_head = hparams.n_head(i);

            layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), { n_embd, n_embd_head_k * n_head }, 0);
            layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), { n_embd, n_embd_k_gqa }, 0);
            layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), { n_embd, n_embd_v_gqa }, 0);
            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_v * n_head, n_embd }, 0);

            layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
            // optional per-head attention sink logits
            layer.attn_sinks = create_tensor(tn(LLM_TENSOR_ATTN_SINKS, "weight", i), {n_head}, TENSOR_NOT_REQUIRED);

            layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);

            // non-MoE branch
            // every FFN tensor below is optional: each layer carries either the
            // dense set or the MoE set, never necessarily both
            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, TENSOR_NOT_REQUIRED);
            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, TENSOR_NOT_REQUIRED);
            layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, TENSOR_NOT_REQUIRED);

            // MoE branch
            int64_t n_ff_exp = hparams.n_ff_exp;
            layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, TENSOR_NOT_REQUIRED);
            layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, TENSOR_NOT_REQUIRED);
            layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, TENSOR_NOT_REQUIRED);
            layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, TENSOR_NOT_REQUIRED);
            layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, TENSOR_NOT_REQUIRED);
        }
    } break;
7329 case LLM_ARCH_STEP35:
7330 {
7331 tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
7332
7333 // output
7334 output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
7335 output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
7336
7337 // STEP35 supports per-layer partial RoPE dims; rope factors are stored as a single shared tensor
7338 // ("rope_freqs.weight") and ggml uses only the first (n_rot_l/2) entries per layer.
7339 uint32_t n_rot_max = 0;
7340 for (int i = 0; i < n_layer; ++i) {
7341 n_rot_max = std::max(n_rot_max, hparams.n_rot);
7342 }
7343 if (n_rot_max == 0) {
7344 n_rot_max = n_rot;
7345 }
7346
7347 for (int i = 0; i < n_layer; ++i) {
7348 auto & layer = layers[i];
7349
7350 const uint32_t n_head_l = hparams.n_head(i);
7351 const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(i);
7352 const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(i);
7353
7354 layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
7355 layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, TENSOR_NOT_REQUIRED);
7356 layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, TENSOR_NOT_REQUIRED);
7357
7358 // optional rope factors (llama3) / longrope tensors
7359 if (hparams.rope_scaling_type_train == LLAMA_ROPE_SCALING_TYPE_LONGROPE) {
7360 layer.rope_long = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight", i), {n_rot_max/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
7361 layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), {n_rot_max/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
7362 } else {
7363 layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot_max/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
7364 }
7365
7366 layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head_l}, 0);
7367 layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
7368 layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
7369 layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_v * n_head_l, n_embd}, 0);
7370
7371 // head-wise attention gate (Step35 self_attn.g_proj)
7372 layer.wqkv_gate = create_tensor(tn(LLM_TENSOR_ATTN_GATE, "weight", i), {n_embd, n_head_l}, TENSOR_NOT_REQUIRED);
7373
7374 layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
7375
7376 // dense MLP (leading dense blocks)
7377 layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, TENSOR_NOT_REQUIRED);
7378 layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, TENSOR_NOT_REQUIRED);
7379 layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, TENSOR_NOT_REQUIRED);
7380
7381 // MoE routed experts + selection bias (router_bias)
7382 const int64_t n_ff_exp = hparams.n_ff_exp;
7383 layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, TENSOR_NOT_REQUIRED);
7384 layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, TENSOR_NOT_REQUIRED);
7385 layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, TENSOR_NOT_REQUIRED);
7386 layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, TENSOR_NOT_REQUIRED);
7387 layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, TENSOR_NOT_REQUIRED);
7388
7389 // shared expert MLP
7390 layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, hparams.n_ff_shexp}, TENSOR_NOT_REQUIRED);
7391 layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, hparams.n_ff_shexp}, TENSOR_NOT_REQUIRED);
7392 layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {hparams.n_ff_shexp, n_embd}, TENSOR_NOT_REQUIRED);
7393 }
7394 } break;
7395 case LLM_ARCH_MAINCODER:
7396 {
7397 tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
7398
7399 // output
7400 output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
7401 output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
7402 // if output is NULL, init from the input tok embed
7403 if (output == NULL) {
7404 output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
7405 }
7406
7407 for (int i = 0; i < n_layer; ++i) {
7408 auto & layer = layers[i];
7409
7410 layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
7411
7412 layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
7413 layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
7414 layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
7415 layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
7416
7417 layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
7418 layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
7419
7420 layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
7421 layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
7422 layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
7423 layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
7424 }
7425 } break;
7426 default:
7427 throw std::runtime_error("unknown architecture");
7428 }
7429
7430 if (n_moved_tensors > 0) {
7431 LLAMA_LOG_DEBUG("%s: tensor '%s' (%s) (and %d others) cannot be used with preferred buffer type %s, using %s instead\n",
7432 __func__, first_moved_tensor->name, ggml_type_name(first_moved_tensor->type), n_moved_tensors - 1,
7433 ggml_backend_buft_name(first_moved_from_buft), ggml_backend_buft_name(first_moved_to_buft));
7434 }
7435 }
7436
7437 ml.done_getting_tensors();
7438
7439 ml.init_mappings(true, use_mlock ? &pimpl->mlock_mmaps : nullptr);
7440 pimpl->mappings.reserve(ml.mappings.size());
7441
7442 // create the backend buffers
7443 std::vector<std::pair<ggml_context *, llama_buf_map>> ctx_buf_maps;
7444 ctx_buf_maps.reserve(ctx_map.size());
7445
7446 // Ensure we have enough capacity for the maximum backend buffer we will potentially create
7447 const size_t n_max_backend_buffer = ctx_map.size() * ml.files.size();
7448 pimpl->ctxs_bufs.reserve(n_max_backend_buffer);
7449
7450 for (auto & [buft, ctx_ptr] : ctx_map) {
7451 ggml_context * ctx = ctx_ptr.get();
7452
7453 // skip contexts without tensors
7454 if (ggml_get_first_tensor(ctx) == nullptr) {
7455 continue;
7456 }
7457
7458 llama_buf_map buf_map;
7459 buf_map.reserve(n_max_backend_buffer);
7460
7461 // check if it is possible to use buffer_from_host_ptr with this buffer type
7462 ggml_backend_dev_t dev = ggml_backend_buft_get_device(buft);
7463 if (!dev) {
7464 // FIXME: workaround for CPU backend buft having a NULL device
7465 dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
7466 if (!dev) {
7467 throw std::runtime_error(format("%s: no CPU backend found", __func__));
7468 }
7469 }
7470 ggml_backend_dev_props props;
7471 ggml_backend_dev_get_props(dev, &props);
7472 bool buffer_from_host_ptr_supported = props.caps.buffer_from_host_ptr;
7473 bool is_default_buft = buft == ggml_backend_dev_buffer_type(dev);
7474
7475 std::vector<ggml_backend_buffer_ptr> bufs;
7476 if (ml.use_mmap && use_mmap_buffer && buffer_from_host_ptr_supported && is_default_buft) {
7477 GGML_ASSERT(!ml.no_alloc);
7478 for (uint32_t idx = 0; idx < ml.files.size(); idx++) {
7479 // only the mmap region containing the tensors in the model is mapped to the backend buffer
7480 // this is important for metal with apple silicon: if the entire model could be mapped to a metal buffer,
7481 // then we could just use metal for all layers
7482 // this allows using partial offloading when the model size exceeds the metal buffer size, but not the RAM size
7483 void * addr = nullptr;
7484 size_t first, last; // NOLINT
7485 ml.get_mapping_range(&first, &last, &addr, idx, ctx);
7486 if (first >= last) {
7487 continue;
7488 }
7489 const size_t max_size = ggml_get_max_tensor_size(ctx);
7490 ggml_backend_buffer_t buf = ggml_backend_dev_buffer_from_host_ptr(dev, (char *) addr + first, last - first, max_size);
7491 if (buf == nullptr) {
7492 throw std::runtime_error(format("unable to allocate %s buffer", ggml_backend_buft_name(buft)));
7493 }
7494 bufs.emplace_back(buf);
7495 buf_map.emplace(idx, buf);
7496 }
7497 } else {
7498 ggml_backend_buffer_t buf;
7499 if (ml.no_alloc) {
7500 buf = ggml_backend_buft_alloc_buffer(buft, /*size =*/ 0); // dummy buffer
7501 for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != nullptr; t = ggml_get_next_tensor(ctx, t)) {
7502 t->buffer = buf; // set dummy buffer for weights so that the backend scheduler won't try to allocate them
7503 }
7504 } else {
7505 buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft); // real buffer
7506 }
7507 if (buf == nullptr) {
7508 throw std::runtime_error(format("unable to allocate %s buffer", ggml_backend_buft_name(buft)));
7509 }
7510 if (use_mlock && ggml_backend_buffer_is_host(buf)) {
7511 pimpl->mlock_bufs.emplace_back(new llama_mlock);
7512 auto & mlock_buf = pimpl->mlock_bufs.back();
7513 mlock_buf->init (ggml_backend_buffer_get_base(buf));
7514 mlock_buf->grow_to(ggml_backend_buffer_get_size(buf));
7515 }
7516 bufs.emplace_back(buf);
7517 for (uint32_t idx = 0; idx < ml.files.size(); idx++) {
7518 buf_map.emplace(idx, buf);
7519 }
7520 }
7521 pimpl->ctxs_bufs.emplace_back(std::move(ctx_ptr), std::move(bufs));
7522
7523 for (auto & buf : buf_map) {
7524 // indicate that this buffer contains weights
7525 // this is used by ggml_backend_sched to improve op scheduling: ops that use a weight are preferably scheduled to the backend that contains the weight
7526 ggml_backend_buffer_set_usage(buf.second, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
7527 }
7528
7529 ctx_buf_maps.emplace_back(ctx, buf_map);
7530 }
7531
7532 if (llama_supports_gpu_offload()) {
7533 const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
7534
7535 int n_repeating = n_gpu;
7536 if (n_repeating > 0) {
7537 LLAMA_LOG_INFO("%s: offloading output layer to GPU\n", __func__);
7538 n_repeating--;
7539 }
7540 LLAMA_LOG_INFO("%s: offloading %d repeating layers to GPU\n", __func__, n_repeating);
7541
7542 const int max_backend_supported_layers = hparams.n_layer + 1;
7543 const int max_offloadable_layers = hparams.n_layer + 1;
7544
7545 LLAMA_LOG_INFO("%s: offloaded %d/%d layers to GPU\n", __func__, std::min(n_gpu_layers, max_offloadable_layers), max_backend_supported_layers);
7546 }
7547
7548 // print memory requirements per buffer type
7549 for (auto & [_, bufs] : pimpl->ctxs_bufs) {
7550 for (auto & buf: bufs) {
7551 LLAMA_LOG_INFO("%s: %12s model buffer size = %8.2f MiB\n",
7552 __func__, ggml_backend_buffer_name(buf.get()), ggml_backend_buffer_get_size(buf.get()) / 1024.0 / 1024.0);
7553 }
7554 }
7555
7556 // populate tensors_by_name
7557 for (auto & [ctx, _] : pimpl->ctxs_bufs) {
7558 for (auto * cur = ggml_get_first_tensor(ctx.get()); cur != NULL; cur = ggml_get_next_tensor(ctx.get(), cur)) {
7559 tensors_by_name.emplace_back(ggml_get_name(cur), cur);
7560 }
7561 }
7562
7563 if (ml.no_alloc) {
7564 return true;
7565 }
7566
7567 // load tensor data
7568 for (auto & [ctx, buf_map] : ctx_buf_maps) {
7569 if (!ml.load_all_data(ctx, buf_map, use_mlock ? &pimpl->mlock_mmaps : NULL, params.progress_callback, params.progress_callback_user_data)) {
7570 return false;
7571 }
7572 }
7573
7574 if (use_mmap_buffer) {
7575 for (auto & mapping : ml.mappings) {
7576 pimpl->mappings.emplace_back(std::move(mapping));
7577 }
7578 }
7579
7580 return true;
7581}
7582
7583std::string llama_model::arch_name() const {
7584 return llm_arch_name(arch);
7585}
7586
7587std::string llama_model::type_name() const {
7588 return llm_type_name(type);
7589}
7590
7591std::string llama_model::desc() const {
7592 return pimpl->desc_str;
7593}
7594
7595size_t llama_model::size() const {
7596 return pimpl->n_bytes;
7597}
7598
7599size_t llama_model::n_tensors() const {
7600 return tensors_by_name.size();
7601}
7602
7603size_t llama_model::n_devices() const {
7604 return devices.size();
7605}
7606
7607uint32_t llama_model::n_gpu_layers() const {
7608 return params.n_gpu_layers >= 0 ? params.n_gpu_layers : hparams.n_layer + 1;
7609}
7610
7611llama_split_mode llama_model::split_mode() const {
7612 return params.split_mode;
7613}
7614
7615std::map<ggml_backend_buffer_type_t, size_t> llama_model::memory_breakdown() const {
7616 std::map<ggml_backend_buffer_type_t, size_t> ret;
7617 for (const auto & [ctx, bufs] : pimpl->ctxs_bufs) {
7618 if (hparams.no_alloc) {
7619 GGML_ASSERT(bufs.size() == 1);
7620 ggml_backend_buffer_t buf = bufs[0].get();
7621 GGML_ASSERT(ggml_backend_buffer_get_base(buf) == nullptr);
7622 ggml_backend_buffer_type_t buft = ggml_backend_buffer_get_type(buf);
7623 ret[buft] += ggml_backend_alloc_ctx_tensors_from_buft_size(ctx.get(), buft);
7624 } else {
7625 for (const auto & buf : bufs) {
7626 // GGML_ASSERT(ggml_backend_buffer_get_base(buf.get()) != nullptr); // multi_buffer does not have a defined base
7627 ret[ggml_backend_buffer_get_type(buf.get())] += ggml_backend_buffer_get_size(buf.get());
7628 }
7629 }
7630 }
7631 return ret;
7632}
7633
7634uint64_t llama_model::n_elements() const {
7635 return pimpl->n_elements;
7636}
7637
7638void llama_model::print_info() const {
7639 const std::string rope_scaling_type = llama_rope_scaling_type_name(hparams.rope_scaling_type_train);
7640
7641 auto print_f = [](const std::function<uint32_t(uint32_t)> & f, uint32_t n) {
7642 bool is_var = false;
7643
7644 std::vector<uint32_t> v;
7645 for (uint32_t i = 0; i < n; ++i) {
7646 v.push_back(f(i));
7647 if (v[i] != v[0]) {
7648 is_var = true;
7649 }
7650 }
7651
7652 std::stringstream ss;
7653
7654 if (is_var) {
7655 ss << "[";
7656 for (uint32_t i = 0; i < n; ++i) {
7657 ss << v[i];
7658 if (i < n - 1) {
7659 ss << ", ";
7660 }
7661 }
7662 ss << "]";
7663 } else {
7664 ss << v[0];
7665 }
7666
7667 return ss.str();
7668 };
7669
7670 // hparams
7671 LLAMA_LOG_INFO("%s: arch = %s\n", __func__, arch_name().c_str());
7672 LLAMA_LOG_INFO("%s: vocab_only = %d\n", __func__, hparams.vocab_only);
7673 LLAMA_LOG_INFO("%s: no_alloc = %d\n", __func__, hparams.no_alloc);
7674
7675 if (!hparams.vocab_only) {
7676 LLAMA_LOG_INFO("%s: n_ctx_train = %u\n", __func__, hparams.n_ctx_train);
7677 LLAMA_LOG_INFO("%s: n_embd = %u\n", __func__, hparams.n_embd);
7678 LLAMA_LOG_INFO("%s: n_embd_inp = %u\n", __func__, hparams.n_embd_inp());
7679 LLAMA_LOG_INFO("%s: n_layer = %u\n", __func__, hparams.n_layer);
7680 LLAMA_LOG_INFO("%s: n_head = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_head(il); }, hparams.n_layer).c_str());
7681 LLAMA_LOG_INFO("%s: n_head_kv = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_head_kv(il); }, hparams.n_layer).c_str());
7682 LLAMA_LOG_INFO("%s: n_rot = %u\n", __func__, hparams.n_rot);
7683 LLAMA_LOG_INFO("%s: n_swa = %u\n", __func__, hparams.n_swa);
7684 LLAMA_LOG_INFO("%s: is_swa_any = %u\n", __func__, hparams.is_swa_any());
7685 LLAMA_LOG_INFO("%s: n_embd_head_k = %u\n", __func__, hparams.n_embd_head_k);
7686 LLAMA_LOG_INFO("%s: n_embd_head_v = %u\n", __func__, hparams.n_embd_head_v);
7687 LLAMA_LOG_INFO("%s: n_gqa = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_gqa(il); }, hparams.n_layer).c_str());
7688 LLAMA_LOG_INFO("%s: n_embd_k_gqa = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_embd_k_gqa(il); }, hparams.n_layer).c_str());
7689 LLAMA_LOG_INFO("%s: n_embd_v_gqa = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_embd_v_gqa(il); }, hparams.n_layer).c_str());
7690 LLAMA_LOG_INFO("%s: f_norm_eps = %.1e\n", __func__, hparams.f_norm_eps);
7691 LLAMA_LOG_INFO("%s: f_norm_rms_eps = %.1e\n", __func__, hparams.f_norm_rms_eps);
7692 LLAMA_LOG_INFO("%s: f_clamp_kqv = %.1e\n", __func__, hparams.f_clamp_kqv);
7693 LLAMA_LOG_INFO("%s: f_max_alibi_bias = %.1e\n", __func__, hparams.f_max_alibi_bias);
7694 LLAMA_LOG_INFO("%s: f_logit_scale = %.1e\n", __func__, hparams.f_logit_scale);
7695 LLAMA_LOG_INFO("%s: f_attn_scale = %.1e\n", __func__, hparams.f_attention_scale);
7696 LLAMA_LOG_INFO("%s: n_ff = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_ff(il); }, hparams.n_layer).c_str());
7697 LLAMA_LOG_INFO("%s: n_expert = %u\n", __func__, hparams.n_expert);
7698 LLAMA_LOG_INFO("%s: n_expert_used = %u\n", __func__, hparams.n_expert_used);
7699 LLAMA_LOG_INFO("%s: n_expert_groups = %d\n", __func__, hparams.n_expert_groups);
7700 LLAMA_LOG_INFO("%s: n_group_used = %d\n", __func__, hparams.n_group_used);
7701 LLAMA_LOG_INFO("%s: causal attn = %d\n", __func__, hparams.causal_attn);
7702 LLAMA_LOG_INFO("%s: pooling type = %d\n", __func__, hparams.pooling_type);
7703 LLAMA_LOG_INFO("%s: rope type = %d\n", __func__, hparams.rope_type);
7704 LLAMA_LOG_INFO("%s: rope scaling = %s\n", __func__, rope_scaling_type.c_str());
7705 LLAMA_LOG_INFO("%s: freq_base_train = %.1f\n", __func__, hparams.rope_freq_base_train);
7706 LLAMA_LOG_INFO("%s: freq_scale_train = %g\n", __func__, hparams.rope_freq_scale_train);
7707 if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) {
7708 LLAMA_LOG_INFO("%s: freq_base_swa = %.1f\n", __func__, hparams.rope_freq_base_train_swa);
7709 LLAMA_LOG_INFO("%s: freq_scale_swa = %g\n", __func__, hparams.rope_freq_scale_train_swa);
7710 }
7711 LLAMA_LOG_INFO("%s: n_ctx_orig_yarn = %u\n", __func__, hparams.n_ctx_orig_yarn);
7712 LLAMA_LOG_INFO("%s: rope_yarn_log_mul = %.4f\n", __func__, hparams.rope_yarn_log_mul);
7713 LLAMA_LOG_INFO("%s: rope_finetuned = %s\n", __func__, hparams.rope_finetuned ? "yes" : "unknown");
7714 // MRoPE (Multi-axis Rotary Position Embedding) sections
7715 if (const auto & s = hparams.rope_sections; s[0] || s[1] || s[2] || s[3]) {
7716 LLAMA_LOG_INFO("%s: mrope sections = [%d, %d, %d, %d]\n", __func__, s[0], s[1], s[2], s[3]);
7717 }
7718 if (!classifier_labels.empty()) {
7719 LLAMA_LOG_INFO("%s: n_cls_out = %u\n", __func__, hparams.n_cls_out);
7720
7721 size_t i = 0;
7722 for (auto label : classifier_labels) {
7723 LLAMA_LOG_INFO("%s: cls_label[%2zu] = %s\n", __func__, i++, label.c_str());
7724 }
7725 }
7726 }
7727
7728 if (arch == LLM_ARCH_MAMBA ||
7729 arch == LLM_ARCH_MAMBA2 ||
7730 arch == LLM_ARCH_JAMBA ||
7731 arch == LLM_ARCH_FALCON_H1 ||
7732 arch == LLM_ARCH_PLAMO2 ||
7733 arch == LLM_ARCH_GRANITE_HYBRID ||
7734 arch == LLM_ARCH_QWEN3NEXT ||
7735 arch == LLM_ARCH_QWEN35 ||
7736 arch == LLM_ARCH_QWEN35MOE ||
7737 arch == LLM_ARCH_NEMOTRON_H ||
7738 arch == LLM_ARCH_NEMOTRON_H_MOE) {
7739 LLAMA_LOG_INFO("%s: ssm_d_conv = %u\n", __func__, hparams.ssm_d_conv);
7740 LLAMA_LOG_INFO("%s: ssm_d_inner = %u\n", __func__, hparams.ssm_d_inner);
7741 LLAMA_LOG_INFO("%s: ssm_d_state = %u\n", __func__, hparams.ssm_d_state);
7742 LLAMA_LOG_INFO("%s: ssm_dt_rank = %u\n", __func__, hparams.ssm_dt_rank);
7743 LLAMA_LOG_INFO("%s: ssm_n_group = %u\n", __func__, hparams.ssm_n_group);
7744 LLAMA_LOG_INFO("%s: ssm_dt_b_c_rms = %d\n", __func__, hparams.ssm_dt_b_c_rms);
7745 }
7746
7747 LLAMA_LOG_INFO("%s: model type = %s\n", __func__, type_name().c_str());
7748 if (pimpl->n_elements >= 1e12) {
7749 LLAMA_LOG_INFO("%s: model params = %.2f T\n", __func__, pimpl->n_elements*1e-12);
7750 } else if (pimpl->n_elements >= 1e9) {
7751 LLAMA_LOG_INFO("%s: model params = %.2f B\n", __func__, pimpl->n_elements*1e-9);
7752 } else if (pimpl->n_elements >= 1e6) {
7753 LLAMA_LOG_INFO("%s: model params = %.2f M\n", __func__, pimpl->n_elements*1e-6);
7754 } else {
7755 LLAMA_LOG_INFO("%s: model params = %.2f K\n", __func__, pimpl->n_elements*1e-3);
7756 }
7757
7758 // general kv
7759 LLAMA_LOG_INFO("%s: general.name = %s\n", __func__, name.c_str());
7760
7761 if (arch == LLM_ARCH_DEEPSEEK) {
7762 LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead);
7763 LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
7764 LLAMA_LOG_INFO("%s: n_expert_shared = %d\n", __func__, hparams.n_expert_shared);
7765 LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
7766 }
7767
7768 if (arch == LLM_ARCH_DEEPSEEK2) {
7769 LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead);
7770 LLAMA_LOG_INFO("%s: n_lora_q = %d\n", __func__, hparams.n_lora_q);
7771 LLAMA_LOG_INFO("%s: n_lora_kv = %d\n", __func__, hparams.n_lora_kv);
7772 LLAMA_LOG_INFO("%s: n_embd_head_k_mla = %d\n", __func__, hparams.n_embd_head_k_mla());
7773 LLAMA_LOG_INFO("%s: n_embd_head_v_mla = %d\n", __func__, hparams.n_embd_head_v_mla());
7774 LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
7775 LLAMA_LOG_INFO("%s: n_expert_shared = %d\n", __func__, hparams.n_expert_shared);
7776 LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
7777 LLAMA_LOG_INFO("%s: expert_weights_norm = %d\n", __func__, hparams.expert_weights_norm);
7778 LLAMA_LOG_INFO("%s: expert_gating_func = %s\n", __func__, llama_expert_gating_func_name((llama_expert_gating_func_type) hparams.expert_gating_func));
7779 }
7780
7781 if (arch == LLM_ARCH_QWEN2MOE) {
7782 LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
7783 LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp);
7784 }
7785
7786 if (arch == LLM_ARCH_QWEN3MOE || arch == LLM_ARCH_OPENAI_MOE || arch == LLM_ARCH_QWEN3VLMOE || arch == LLM_ARCH_RND1) {
7787 LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
7788 }
7789
7790 if (arch == LLM_ARCH_MINICPM ||
7791 arch == LLM_ARCH_GRANITE ||
7792 arch == LLM_ARCH_GRANITE_MOE ||
7793 arch == LLM_ARCH_GRANITE_HYBRID ||
7794 arch == LLM_ARCH_NEMOTRON_H_MOE) {
7795 LLAMA_LOG_INFO("%s: f_embedding_scale = %f\n", __func__, hparams.f_embedding_scale);
7796 LLAMA_LOG_INFO("%s: f_residual_scale = %f\n", __func__, hparams.f_residual_scale);
7797 LLAMA_LOG_INFO("%s: f_attention_scale = %f\n", __func__, hparams.f_attention_scale);
7798 LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp);
7799 }
7800
7801 if (arch == LLM_ARCH_BAILINGMOE) {
7802 LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead);
7803 LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
7804 LLAMA_LOG_INFO("%s: n_expert_shared = %d\n", __func__, hparams.n_expert_shared);
7805 LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
7806 LLAMA_LOG_INFO("%s: expert_weights_norm = %d\n", __func__, hparams.expert_weights_norm);
7807 }
7808
7809 if (arch == LLM_ARCH_BAILINGMOE2) {
7810 LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead);
7811 LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
7812 LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp);
7813 LLAMA_LOG_INFO("%s: n_expert_shared = %d\n", __func__, hparams.n_expert_shared);
7814 LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
7815 LLAMA_LOG_INFO("%s: expert_weights_norm = %d\n", __func__, hparams.expert_weights_norm);
7816 LLAMA_LOG_INFO("%s: expert_gating_func = %s\n", __func__, llama_expert_gating_func_name((llama_expert_gating_func_type) hparams.expert_gating_func));
7817 LLAMA_LOG_INFO("%s: nextn_predict_layers = %d\n", __func__, hparams.nextn_predict_layers);
7818 }
7819
7820 if (arch == LLM_ARCH_SMALLTHINKER || arch == LLM_ARCH_LFM2MOE) {
7821 LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
7822 LLAMA_LOG_INFO("%s: expert_gating_func = %s\n", __func__, llama_expert_gating_func_name((llama_expert_gating_func_type) hparams.expert_gating_func));
7823 }
7824
7825 if (arch == LLM_ARCH_GROVEMOE) {
7826 LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
7827 LLAMA_LOG_INFO("%s: n_ff_chexp = %d\n", __func__, hparams.n_ff_chexp);
7828 LLAMA_LOG_INFO("%s: n_group_experts = %d\n", __func__, hparams.n_group_experts);
7829 LLAMA_LOG_INFO("%s: expert_group_scale = %.2f\n", __func__, hparams.expert_group_scale);
7830 }
7831
7832 vocab.print_info();
7833}
7834
7835ggml_backend_dev_t llama_model::dev_layer(int il) const {
7836 return pimpl->dev_layer.at(il).dev;
7837}
7838
7839ggml_backend_dev_t llama_model::dev_output() const {
7840 return pimpl->dev_output.dev;
7841}
7842
7843template<typename F>
7844static bool buft_supported(ggml_backend_buffer_type_t buft, ggml_backend_dev_t dev, F & fn) {
7845 ggml_init_params params = {
7846 /*.mem_size =*/ ggml_tensor_overhead()*8,
7847 /*.mem_buffer =*/ NULL,
7848 /*.no_alloc =*/ true,
7849 };
7850
7851 ggml_context_ptr ctx { ggml_init(params) };
7852 if (!ctx) {
7853 throw std::runtime_error(format("failed to create ggml context"));
7854 }
7855
7856 ggml_backend_buffer_ptr buf { ggml_backend_buft_alloc_buffer(buft, 0) };
7857 ggml_tensor * op_tensor = fn(ctx.get());
7858 for (int i = 0; i < GGML_MAX_SRC; i++) {
7859 if (op_tensor->src[i] != nullptr) {
7860 assert(op_tensor->src[i]->buffer == nullptr);
7861 op_tensor->src[i]->buffer = buf.get();
7862 }
7863 }
7864
7865 bool op_supported = ggml_backend_dev_supports_op(dev, op_tensor);
7866
7867 return op_supported;
7868}
7869
7870template<typename F>
7871static ggml_backend_buffer_type_t select_buft(const buft_list_t & buft_list, const F & fn) {
7872 for (const auto & cur : buft_list) {
7873 ggml_backend_dev_t cur_dev = cur.first;
7874 ggml_backend_buffer_type_t cur_buft = cur.second;
7875 if (buft_supported(cur_buft, cur_dev, fn)) {
7876 return cur_buft;
7877 }
7878 }
7879
7880 throw std::runtime_error(format("no suitable buffer type found"));
7881}
7882
7883ggml_backend_buffer_type_t llama_model::select_buft(int il) const {
7884 return ::select_buft(
7885 *pimpl->dev_layer.at(il).buft_list,
7886 [&](ggml_context * ctx) {
7887 ggml_tensor * cur = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hparams.n_embd);
7888 ggml_tensor * layer_dir = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hparams.n_embd);
7889 return ggml_add(ctx, cur, layer_dir);
7890 });
7891}
7892
7893bool llama_model::has_tensor_overrides() const {
7894 return pimpl->has_tensor_overrides;
7895}
7896
7897const ggml_tensor * llama_model::get_tensor(const char * name) const {
7898 auto it = std::find_if(tensors_by_name.begin(), tensors_by_name.end(),
7899 [name](const std::pair<std::string, ggml_tensor *> & it) {
7900 return it.first == name;
7901 });
7902 if (it == tensors_by_name.end()) {
7903 return nullptr;
7904 }
7905
7906 return it->second;
7907}
7908
7909float llama_model::get_rope_freq_base (const llama_cparams & cparams, int il) const {
7910 return hparams.is_swa(il) ? hparams.rope_freq_base_train_swa : cparams.rope_freq_base;
7911}
7912
7913float llama_model::get_rope_freq_scale(const llama_cparams & cparams, int il) const {
7914 return hparams.is_swa(il) ? hparams.rope_freq_scale_train_swa : cparams.rope_freq_scale;
7915}
7916
7917ggml_tensor * llama_model::get_rope_factors(const llama_cparams & cparams, int il) const {
7918 const uint32_t n_ctx_seq = cparams.n_ctx_seq;
7919
7920 // choose long/short freq factors based on the context size
7921 if (layers[il].rope_freqs != nullptr) {
7922 return layers[il].rope_freqs;
7923 }
7924
7925 if (n_ctx_seq > hparams.n_ctx_orig_yarn) {
7926 return layers[il].rope_long;
7927 }
7928
7929 return layers[il].rope_short;
7930}
7931
// Create the memory object used by a context for this model: a KV cache
// (optionally iSWA), a recurrent state cache, a hybrid of the two, or
// nullptr for architectures that use no memory at all.
// The caller takes ownership of the returned pointer.
llama_memory_i * llama_model::create_memory(const llama_memory_params & params, const llama_cparams & cparams) const {
    llama_memory_i * res;

    switch (arch) {
        // Models that need specific instantiation should be handled in the
        // switch statement
        case LLM_ARCH_BERT:
        case LLM_ARCH_JINA_BERT_V2:
        case LLM_ARCH_JINA_BERT_V3:
        case LLM_ARCH_NOMIC_BERT:
        case LLM_ARCH_NOMIC_BERT_MOE:
        case LLM_ARCH_NEO_BERT:
        case LLM_ARCH_WAVTOKENIZER_DEC:
        case LLM_ARCH_MODERN_BERT:
        case LLM_ARCH_GEMMA_EMBEDDING:
        case LLM_ARCH_DREAM:
        case LLM_ARCH_LLADA:
        case LLM_ARCH_LLADA_MOE:
        case LLM_ARCH_RND1:
            {
                // these architectures use no KV/recurrent memory
                res = nullptr;
            } break;
        // Models that need standard caching should rely on recurrent/hybrid
        // checks
        default:
            {
                if (llm_arch_is_recurrent(arch)) {
                    // pure recurrent architectures: state cache only, no KV cache
                    res = new llama_memory_recurrent(
                            *this,
                            GGML_TYPE_F32,
                            GGML_TYPE_F32,
                            cparams.offload_kqv,
                            std::max((uint32_t) 1, cparams.n_seq_max),
                            cparams.n_seq_max,
                            nullptr);
                } else if (llm_arch_is_hybrid(arch)) {

                    // The main difference between hybrid architectures is the
                    // layer filters, so pick the right one here
                    llama_memory_hybrid::layer_filter_cb filter_attn = nullptr;
                    llama_memory_hybrid::layer_filter_cb filter_recr = nullptr;
                    if (arch == LLM_ARCH_FALCON_H1) {
                        // every layer participates in both the attention and recurrent caches
                        filter_attn = [&](int32_t) { return true; };
                        filter_recr = [&](int32_t) { return true; };
                    } else if (arch == LLM_ARCH_NEMOTRON_H || arch == LLM_ARCH_NEMOTRON_H_MOE) {
                        // each layer is either attention or recurrent; n_ff(il) == 0
                        // additionally excludes FFN-only layers from both caches
                        filter_attn = [&](int32_t il) {
                            return !hparams.is_recurrent(il) && hparams.n_ff(il) == 0;
                        };
                        filter_recr = [&](int32_t il) {
                            return hparams.is_recurrent(il) && hparams.n_ff(il) == 0;
                        };
                    }

                    if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) {
                        // Use hybrid-iswa for hybrid models with SWA
                        res = new llama_memory_hybrid_iswa(
                                /* model             */ *this,
                                /* attn_type_k       */ params.type_k,
                                /* attn_type_v       */ params.type_v,
                                /* attn_v_trans      */ !cparams.flash_attn,
                                /* attn_swa_full     */ params.swa_full,
                                /* attn_kv_size      */ cparams.n_ctx,
                                /* attn_n_ubatch     */ cparams.n_ubatch,
                                /* attn_n_pad        */ 1,
                                /* recurrent_type_r  */ GGML_TYPE_F32,
                                /* recurrent_type_s  */ GGML_TYPE_F32,
                                /* recurrent_rs_size */ std::max((uint32_t) 1, cparams.n_seq_max),
                                /* n_seq_max         */ cparams.n_seq_max,
                                /* offload           */ cparams.offload_kqv,
                                /* unified           */ cparams.kv_unified,
                                /* filter_attn       */ std::move(filter_attn),
                                /* filter_recr       */ std::move(filter_recr));
                    } else {
                        res = new llama_memory_hybrid(
                                /* model             */ *this,
                                /* attn_type_k       */ params.type_k,
                                /* attn_type_v       */ params.type_v,
                                /* attn_v_trans      */ !cparams.flash_attn,
                                /* attn_kv_size      */ cparams.n_ctx,
                                /* attn_n_pad        */ 1,
                                /* attn_n_swa        */ hparams.n_swa,
                                /* attn_swa_type     */ hparams.swa_type,
                                /* recurrent_type_k  */ GGML_TYPE_F32,
                                /* recurrent_type_v  */ GGML_TYPE_F32,
                                /* recurrent_kv_size */ std::max((uint32_t) 1, cparams.n_seq_max),
                                /* n_seq_max         */ cparams.n_seq_max,
                                /* offload           */ cparams.offload_kqv,
                                /* unified           */ cparams.kv_unified,
                                /* filter_attn       */ std::move(filter_attn),
                                /* filter_recr       */ std::move(filter_recr));
                    }
                } else {
                    // standard attention-only architectures: plain KV cache or iSWA KV cache
                    llama_memory_i::layer_reuse_cb reuse = nullptr;

                    if (arch == LLM_ARCH_GEMMA3N) {
                        // layers at or past n_layer_kv_from_start reuse an earlier
                        // layer's cache; -1 means "no reuse" for the leading layers
                        reuse = [&](int32_t il) {
                            if (il >= (int32_t) hparams.n_layer_kv_from_start) {
                                return (int32_t) hparams.n_layer_kv_from_start - (hparams.is_swa(il) ? 2 : 1);
                            }

                            return -1;
                        };
                    }

                    if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) {
                        GGML_ASSERT(hparams.is_swa_any());

                        res = new llama_kv_cache_iswa(
                                *this,
                                params.type_k,
                                params.type_v,
                                !cparams.flash_attn,
                                cparams.offload_kqv,
                                params.swa_full,
                                cparams.kv_unified,
                                cparams.n_ctx_seq,
                                cparams.n_seq_max,
                                cparams.n_ubatch,
                                1,
                                nullptr,
                                reuse);
                    } else {
                        GGML_ASSERT(!hparams.is_swa_any());

                        res = new llama_kv_cache(
                                *this,
                                params.type_k,
                                params.type_v,
                                !cparams.flash_attn,
                                cparams.offload_kqv,
                                cparams.kv_unified,
                                cparams.n_ctx_seq,
                                cparams.n_seq_max,
                                1,
                                hparams.n_swa,
                                hparams.swa_type,
                                nullptr,
                                nullptr);
                    }
                }
            }
    }

    return res;
}
8077
8078ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
8079 std::unique_ptr<llm_graph_context> llm;
8080
8081 switch (arch) {
8082 case LLM_ARCH_LLAMA:
8083 {
8084 llm = std::make_unique<llm_build_llama<false>>(*this, params);
8085 } break;
8086 case LLM_ARCH_LLAMA4:
8087 {
8088 if (hparams.swa_type == LLAMA_SWA_TYPE_NONE) {
8089 llm = std::make_unique<llm_build_llama<false>>(*this, params);
8090 } else {
8091 llm = std::make_unique<llm_build_llama_iswa>(*this, params);
8092 }
8093 } break;
8094 case LLM_ARCH_LLAMA_EMBED:
8095 {
8096 llm = std::make_unique<llm_build_llama<true>>(*this, params);
8097 } break;
8098 case LLM_ARCH_MAINCODER:
8099 {
8100 llm = std::make_unique<llm_build_maincoder>(*this, params);
8101 } break;
8102 case LLM_ARCH_DECI:
8103 {
8104 llm = std::make_unique<llm_build_deci>(*this, params);
8105 } break;
8106 case LLM_ARCH_BAICHUAN:
8107 {
8108 llm = std::make_unique<llm_build_baichuan>(*this, params);
8109 } break;
8110 case LLM_ARCH_FALCON:
8111 {
8112 llm = std::make_unique<llm_build_falcon>(*this, params);
8113 } break;
8114 case LLM_ARCH_GROK:
8115 {
8116 llm = std::make_unique<llm_build_grok>(*this, params);
8117 } break;
8118 case LLM_ARCH_STARCODER:
8119 {
8120 llm = std::make_unique<llm_build_starcoder>(*this, params);
8121 } break;
8122 case LLM_ARCH_REFACT:
8123 {
8124 llm = std::make_unique<llm_build_refact>(*this, params);
8125 } break;
8126 case LLM_ARCH_BERT:
8127 case LLM_ARCH_JINA_BERT_V2:
8128 case LLM_ARCH_JINA_BERT_V3:
8129 case LLM_ARCH_NOMIC_BERT:
8130 case LLM_ARCH_NOMIC_BERT_MOE:
8131 {
8132 llm = std::make_unique<llm_build_bert>(*this, params);
8133 } break;
8134 case LLM_ARCH_MODERN_BERT:
8135 {
8136 llm = std::make_unique<llm_build_modern_bert>(*this, params);
8137 } break;
8138 case LLM_ARCH_NEO_BERT:
8139 {
8140 llm = std::make_unique<llm_build_neo_bert>(*this, params);
8141 } break;
8142 case LLM_ARCH_BLOOM:
8143 {
8144 llm = std::make_unique<llm_build_bloom>(*this, params);
8145 } break;
8146 case LLM_ARCH_MPT:
8147 {
8148 llm = std::make_unique<llm_build_mpt>(*this, params);
8149 } break;
8150 case LLM_ARCH_STABLELM:
8151 {
8152 llm = std::make_unique<llm_build_stablelm>(*this, params);
8153 } break;
8154 case LLM_ARCH_QWEN:
8155 {
8156 llm = std::make_unique<llm_build_qwen>(*this, params);
8157 } break;
8158 case LLM_ARCH_QWEN2:
8159 {
8160 llm = std::make_unique<llm_build_qwen2>(*this, params);
8161 } break;
8162 case LLM_ARCH_DREAM:
8163 {
8164 llm = std::make_unique<llm_build_dream>(*this, params);
8165 }
8166 break;
8167 case LLM_ARCH_LLADA:
8168 {
8169 llm = std::make_unique<llm_build_llada>(*this, params);
8170 }
8171 break;
8172 case LLM_ARCH_LLADA_MOE:
8173 {
8174 llm = std::make_unique<llm_build_llada_moe>(*this, params);
8175 }
8176 break;
8177 case LLM_ARCH_RND1:
8178 {
8179 llm = std::make_unique<llm_build_rnd1>(*this, params);
8180 }
8181 break;
8182 case LLM_ARCH_QWEN2VL:
8183 {
8184 llm = std::make_unique<llm_build_qwen2vl>(*this, params);
8185 } break;
8186 case LLM_ARCH_QWEN2MOE:
8187 {
8188 llm = std::make_unique<llm_build_qwen2moe>(*this, params);
8189 } break;
8190 case LLM_ARCH_QWEN3:
8191 {
8192 llm = std::make_unique<llm_build_qwen3>(*this, params);
8193 } break;
8194 case LLM_ARCH_QWEN3MOE:
8195 {
8196 llm = std::make_unique<llm_build_qwen3moe>(*this, params);
8197 } break;
8198 case LLM_ARCH_QWEN3VL:
8199 {
8200 llm = std::make_unique<llm_build_qwen3vl>(*this, params);
8201 } break;
8202 case LLM_ARCH_QWEN3VLMOE:
8203 {
8204 llm = std::make_unique<llm_build_qwen3vlmoe>(*this, params);
8205 } break;
8206 case LLM_ARCH_PHI2:
8207 {
8208 llm = std::make_unique<llm_build_phi2>(*this, params);
8209 } break;
8210 case LLM_ARCH_PHI3:
8211 case LLM_ARCH_PHIMOE:
8212 {
8213 if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) {
8214 llm = std::make_unique<llm_build_phi3<true>> (*this, params);
8215 } else {
8216 llm = std::make_unique<llm_build_phi3<false>>(*this, params);
8217 }
8218 } break;
8219 case LLM_ARCH_PLAMO:
8220 {
8221 llm = std::make_unique<llm_build_plamo>(*this, params);
8222 } break;
8223 case LLM_ARCH_PLAMO2:
8224 {
8225 llm = std::make_unique<llm_build_plamo2>(*this, params);
8226 } break;
8227 case LLM_ARCH_PLAMO3:
8228 {
8229 if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) {
8230 llm = std::make_unique<llm_build_plamo3<true>> (*this, params);
8231 } else {
8232 llm = std::make_unique<llm_build_plamo3<false>>(*this, params);
8233 }
8234 } break;
8235 case LLM_ARCH_GPT2:
8236 {
8237 llm = std::make_unique<llm_build_gpt2>(*this, params);
8238 } break;
8239 case LLM_ARCH_CODESHELL:
8240 {
8241 llm = std::make_unique<llm_build_codeshell>(*this, params);
8242 } break;
8243 case LLM_ARCH_ORION:
8244 {
8245 llm = std::make_unique<llm_build_orion>(*this, params);
8246 } break;
8247 case LLM_ARCH_INTERNLM2:
8248 {
8249 llm = std::make_unique<llm_build_internlm2>(*this, params);
8250 } break;
8251 case LLM_ARCH_MINICPM3:
8252 {
8253 llm = std::make_unique<llm_build_minicpm3>(*this, params);
8254 } break;
8255 case LLM_ARCH_GEMMA:
8256 {
8257 llm = std::make_unique<llm_build_gemma>(*this, params);
8258 } break;
8259 case LLM_ARCH_GEMMA2:
8260 {
8261 llm = std::make_unique<llm_build_gemma2_iswa>(*this, params);
8262 } break;
8263 case LLM_ARCH_GEMMA3:
8264 {
8265 if (hparams.swa_type == LLAMA_SWA_TYPE_STANDARD) {
8266 llm = std::make_unique<llm_build_gemma3<true>>(*this, params);
8267 } else {
8268 llm = std::make_unique<llm_build_gemma3<false>>(*this, params);
8269 }
8270 } break;
8271 case LLM_ARCH_GEMMA3N:
8272 {
8273 llm = std::make_unique<llm_build_gemma3n_iswa>(*this, params);
8274 } break;
8275 case LLM_ARCH_GEMMA_EMBEDDING:
8276 {
8277 llm = std::make_unique<llm_build_gemma_embedding>(*this, params);
8278 } break;
8279 case LLM_ARCH_STARCODER2:
8280 {
8281 llm = std::make_unique<llm_build_starcoder2>(*this, params);
8282 } break;
8283 case LLM_ARCH_MAMBA:
8284 case LLM_ARCH_MAMBA2:
8285 {
8286 llm = std::make_unique<llm_build_mamba>(*this, params);
8287 } break;
8288 case LLM_ARCH_JAMBA:
8289 {
8290 llm = std::make_unique<llm_build_jamba>(*this, params);
8291 } break;
8292 case LLM_ARCH_XVERSE:
8293 {
8294 llm = std::make_unique<llm_build_xverse>(*this, params);
8295 } break;
8296 case LLM_ARCH_COMMAND_R:
8297 {
8298 llm = std::make_unique<llm_build_command_r>(*this, params);
8299 } break;
8300 case LLM_ARCH_COHERE2:
8301 {
8302 llm = std::make_unique<llm_build_cohere2_iswa>(*this, params);
8303 } break;
8304 case LLM_ARCH_DBRX:
8305 {
8306 llm = std::make_unique<llm_build_dbrx>(*this, params);
8307 } break;
8308 case LLM_ARCH_OLMO:
8309 {
8310 llm = std::make_unique<llm_build_olmo>(*this, params);
8311 } break;
8312 case LLM_ARCH_OLMO2:
8313 {
8314 if (hparams.swa_type == LLAMA_SWA_TYPE_STANDARD) {
8315 llm = std::make_unique<llm_build_olmo2<true>>(*this, params);
8316 } else {
8317 llm = std::make_unique<llm_build_olmo2<false>>(*this, params);
8318 }
8319 } break;
8320 case LLM_ARCH_OLMOE:
8321 {
8322 llm = std::make_unique<llm_build_olmoe>(*this, params);
8323 } break;
8324 case LLM_ARCH_OPENELM:
8325 {
8326 llm = std::make_unique<llm_build_openelm>(*this, params);
8327 } break;
8328 case LLM_ARCH_GPTNEOX:
8329 {
8330 llm = std::make_unique<llm_build_gptneox>(*this, params);
8331 } break;
8332 case LLM_ARCH_ARCTIC:
8333 {
8334 llm = std::make_unique<llm_build_arctic>(*this, params);
8335 } break;
8336 case LLM_ARCH_DEEPSEEK:
8337 {
8338 llm = std::make_unique<llm_build_deepseek>(*this, params);
8339 } break;
8340 case LLM_ARCH_DEEPSEEK2:
8341 {
8342 llm = std::make_unique<llm_build_deepseek2>(*this, params);
8343 } break;
8344 case LLM_ARCH_CHATGLM:
8345 {
8346 llm = std::make_unique<llm_build_chatglm>(*this, params);
8347 } break;
8348 case LLM_ARCH_GLM4:
8349 {
8350 llm = std::make_unique<llm_build_glm4>(*this, params);
8351 } break;
8352 case LLM_ARCH_GLM4_MOE:
8353 {
8354 llm = std::make_unique<llm_build_glm4_moe>(*this, params);
8355 } break;
8356 case LLM_ARCH_BITNET:
8357 {
8358 llm = std::make_unique<llm_build_bitnet>(*this, params);
8359 } break;
8360 case LLM_ARCH_T5:
8361 {
8362 switch (params.gtype) {
8363 case LLM_GRAPH_TYPE_ENCODER:
8364 llm = std::make_unique<llm_build_t5_enc>(*this, params);
8365 break;
8366 case LLM_GRAPH_TYPE_DEFAULT:
8367 case LLM_GRAPH_TYPE_DECODER:
8368 llm = std::make_unique<llm_build_t5_dec>(*this, params);
8369 break;
8370 default:
8371 GGML_ABORT("invalid graph type");
8372 };
8373 } break;
8374 case LLM_ARCH_T5ENCODER:
8375 {
8376 llm = std::make_unique<llm_build_t5_enc>(*this, params);
8377 }
8378 break;
8379 case LLM_ARCH_JAIS:
8380 {
8381 llm = std::make_unique<llm_build_jais>(*this, params);
8382 } break;
8383 case LLM_ARCH_NEMOTRON:
8384 {
8385 llm = std::make_unique<llm_build_nemotron>(*this, params);
8386 } break;
8387 case LLM_ARCH_NEMOTRON_H:
8388 case LLM_ARCH_NEMOTRON_H_MOE:
8389 {
8390 llm = std::make_unique<llm_build_nemotron_h>(*this, params);
8391 } break;
8392 case LLM_ARCH_EXAONE:
8393 {
8394 llm = std::make_unique<llm_build_exaone>(*this, params);
8395 } break;
8396 case LLM_ARCH_EXAONE4:
8397 {
8398 if (hparams.swa_type == LLAMA_SWA_TYPE_STANDARD) {
8399 llm = std::make_unique<llm_build_exaone4<true>>(*this, params);
8400 } else {
8401 llm = std::make_unique<llm_build_exaone4<false>>(*this, params);
8402 }
8403 } break;
8404 case LLM_ARCH_EXAONE_MOE:
8405 {
8406 llm = std::make_unique<llm_build_exaone_moe>(*this, params);
8407 } break;
8408 case LLM_ARCH_RWKV6:
8409 {
8410 llm = std::make_unique<llm_build_rwkv6>(*this, params);
8411 } break;
8412 case LLM_ARCH_RWKV6QWEN2:
8413 {
8414 llm = std::make_unique<llm_build_rwkv6qwen2>(*this, params);
8415 } break;
8416 case LLM_ARCH_RWKV7:
8417 {
8418 llm = std::make_unique<llm_build_rwkv7>(*this, params);
8419 } break;
8420 case LLM_ARCH_ARWKV7:
8421 {
8422 llm = std::make_unique<llm_build_arwkv7>(*this, params);
8423 } break;
8424 case LLM_ARCH_GRANITE:
8425 case LLM_ARCH_GRANITE_MOE:
8426 case LLM_ARCH_MINICPM:
8427 {
8428 llm = std::make_unique<llm_build_granite>(*this, params);
8429 } break;
8430 case LLM_ARCH_GRANITE_HYBRID:
8431 {
8432 llm = std::make_unique<llm_build_granite_hybrid>(*this, params);
8433 } break;
8434 case LLM_ARCH_CHAMELEON:
8435 {
8436 llm = std::make_unique<llm_build_chameleon>(*this, params);
8437 } break;
8438 case LLM_ARCH_WAVTOKENIZER_DEC:
8439 {
8440 llm = std::make_unique<llm_build_wavtokenizer_dec>(*this, params);
8441 } break;
8442 case LLM_ARCH_PLM:
8443 {
8444 llm = std::make_unique<llm_build_plm>(*this, params);
8445 } break;
8446 case LLM_ARCH_BAILINGMOE:
8447 {
8448 llm = std::make_unique<llm_build_bailingmoe>(*this, params);
8449 } break;
8450 case LLM_ARCH_BAILINGMOE2:
8451 {
8452 llm = std::make_unique<llm_build_bailingmoe2>(*this, params);
8453 } break;
8454 case LLM_ARCH_SEED_OSS:
8455 {
8456 llm = std::make_unique<llm_build_seed_oss>(*this, params);
8457 } break;
8458 case LLM_ARCH_DOTS1:
8459 {
8460 llm = std::make_unique<llm_build_dots1>(*this, params);
8461 } break;
8462 case LLM_ARCH_ARCEE:
8463 {
8464 llm = std::make_unique<llm_build_arcee>(*this, params);
8465 } break;
8466 case LLM_ARCH_AFMOE:
8467 {
8468 llm = std::make_unique<llm_build_afmoe>(*this, params);
8469 } break;
8470 case LLM_ARCH_ERNIE4_5:
8471 {
8472 llm = std::make_unique<llm_build_ernie4_5>(*this, params);
8473 } break;
8474 case LLM_ARCH_ERNIE4_5_MOE:
8475 {
8476 llm = std::make_unique<llm_build_ernie4_5_moe>(*this, params);
8477 } break;
8478 case LLM_ARCH_HUNYUAN_MOE:
8479 {
8480 llm = std::make_unique<llm_build_hunyuan_moe>(*this, params);
8481 } break;
8482 case LLM_ARCH_HUNYUAN_DENSE:
8483 {
8484 llm = std::make_unique<llm_build_hunyuan_dense>(*this, params);
8485 } break;
8486 case LLM_ARCH_SMOLLM3:
8487 {
8488 llm = std::make_unique<llm_build_smollm3>(*this, params);
8489 } break;
8490 case LLM_ARCH_OPENAI_MOE:
8491 {
8492 llm = std::make_unique<llm_build_openai_moe_iswa>(*this, params);
8493 } break;
8494 case LLM_ARCH_FALCON_H1:
8495 {
8496 llm = std::make_unique<llm_build_falcon_h1>(*this, params);
8497 } break;
8498 case LLM_ARCH_LFM2:
8499 case LLM_ARCH_LFM2MOE:
8500 {
8501 llm = std::make_unique<llm_build_lfm2>(*this, params);
8502 } break;
8503 case LLM_ARCH_SMALLTHINKER:
8504 {
8505 if (hparams.swa_type == LLAMA_SWA_TYPE_STANDARD) {
8506 llm = std::make_unique<llm_build_smallthinker<true>> (*this, params);
8507 } else {
8508 llm = std::make_unique<llm_build_smallthinker<false>>(*this, params);
8509 }
8510 } break;
8511 case LLM_ARCH_GROVEMOE:
8512 {
8513 llm = std::make_unique<llm_build_grovemoe>(*this, params);
8514 } break;
8515 case LLM_ARCH_APERTUS:
8516 {
8517 llm = std::make_unique<llm_build_apertus>(*this, params);
8518 } break;
8519 case LLM_ARCH_MINIMAX_M2:
8520 {
8521 llm = std::make_unique<llm_build_minimax_m2>(*this, params);
8522 } break;
8523 case LLM_ARCH_COGVLM:
8524 {
8525 llm = std::make_unique<llm_build_cogvlm>(*this, params);
8526 } break;
8527 case LLM_ARCH_PANGU_EMBED:
8528 {
8529 llm = std::make_unique<llm_build_pangu_embedded>(*this, params);
8530 } break;
8531 case LLM_ARCH_QWEN3NEXT:
8532 {
8533 llm = std::make_unique<llm_build_qwen3next>(*this, params);
8534 } break;
8535 case LLM_ARCH_QWEN35:
8536 {
8537 llm = std::make_unique<llm_build_qwen35>(*this, params);
8538 } break;
8539 case LLM_ARCH_QWEN35MOE:
8540 {
8541 llm = std::make_unique<llm_build_qwen35moe>(*this, params);
8542 } break;
8543 case LLM_ARCH_MISTRAL3:
8544 {
8545 llm = std::make_unique<llm_build_mistral3>(*this, params);
8546 } break;
8547 case LLM_ARCH_MIMO2:
8548 {
8549 llm = std::make_unique<llm_build_mimo2_iswa>(*this, params);
8550 } break;
8551 case LLM_ARCH_KIMI_LINEAR:
8552 {
8553 llm = std::make_unique<llm_build_kimi_linear>(*this, params);
8554 } break;
8555 case LLM_ARCH_STEP35:
8556 {
8557 llm = std::make_unique<llm_build_step35_iswa>(*this, params);
8558 } break;
8559 default:
8560 GGML_ABORT("fatal error");
8561 }
8562
8563 // add on pooling layer
8564 llm->build_pooling(cls, cls_b, cls_out, cls_out_b);
8565
8566 // add backend sampling layers (if any)
8567 llm->build_sampling();
8568
8569 // if the gguf model was converted with --sentence-transformers-dense-modules
8570 // there will be two additional dense projection layers
8571 // dense linear projections are applied after pooling
8572 // TODO: move reranking logic here and generalize
8573 llm->build_dense_out(dense_2_out_layers, dense_3_out_layers);
8574
8575 llm->res->set_outputs();
8576
8577 return llm->res->get_gf();
8578}
8579
8580
8581//
8582// interface implementation
8583//
8584
// build the default llama_model_params
// NOTE: positional aggregate initialization — the order of the initializers
//       must match the field order of the struct declaration; the /*.field =*/
//       comments document which field each value targets
llama_model_params llama_model_default_params() {
    llama_model_params result = {
        /*.devices                    =*/ nullptr,
        /*.tensor_buft_overrides      =*/ nullptr,
        /*.n_gpu_layers               =*/ -1,
        /*.split_mode                 =*/ LLAMA_SPLIT_MODE_LAYER,
        /*.main_gpu                   =*/ 0,
        /*.tensor_split               =*/ nullptr,
        /*.progress_callback          =*/ nullptr,
        /*.progress_callback_user_data =*/ nullptr,
        /*.kv_overrides               =*/ nullptr,
        /*.vocab_only                 =*/ false,
        /*.use_mmap                   =*/ true,
        /*.use_direct_io              =*/ false,
        /*.use_mlock                  =*/ false,
        /*.check_tensors              =*/ false,
        /*.use_extra_bufts            =*/ true,
        /*.no_host                    =*/ false,
        /*.no_alloc                   =*/ false,
    };

    return result;
}
8608
8609const llama_vocab * llama_model_get_vocab(const llama_model * model) {
8610 return &model->vocab;
8611}
8612
// deprecated: thin forwarder kept for API compatibility — use llama_model_free
void llama_free_model(llama_model * model) {
    llama_model_free(model);
}
8616
// destroy a model and release everything it owns
// (safe to call with nullptr: `delete nullptr` is a no-op)
void llama_model_free(llama_model * model) {
    delete model;
}
8620
8621int32_t llama_model_n_ctx_train(const llama_model * model) {
8622 return model->hparams.n_ctx_train;
8623}
8624
8625int32_t llama_model_n_embd(const llama_model * model) {
8626 return model->hparams.n_embd;
8627}
8628
8629int32_t llama_model_n_embd_inp(const llama_model * model) {
8630 return model->hparams.n_embd_inp();
8631}
8632
8633int32_t llama_model_n_embd_out(const llama_model * model) {
8634 return model->hparams.n_embd_out();
8635}
8636
8637int32_t llama_model_n_layer(const llama_model * model) {
8638 return model->hparams.n_layer;
8639}
8640
8641int32_t llama_model_n_head(const llama_model * model) {
8642 return model->hparams.n_head();
8643}
8644
8645int32_t llama_model_n_head_kv(const llama_model * model) {
8646 return model->hparams.n_head_kv();
8647}
8648
8649int32_t llama_model_n_swa(const llama_model * model) {
8650 return model->hparams.n_swa;
8651}
8652
8653uint32_t llama_model_n_cls_out(const struct llama_model * model) {
8654 return model->hparams.n_cls_out;
8655}
8656
8657const char * llama_model_cls_label(const struct llama_model * model, uint32_t i) {
8658 if (i < model->classifier_labels.size()) {
8659 return model->classifier_labels[i].c_str();
8660 }
8661
8662 return nullptr;
8663}
8664
// deprecated: use llama_model_n_ctx_train instead
int32_t llama_n_ctx_train(const llama_model * model) {
    return llama_model_n_ctx_train(model);
}
8669
// deprecated: use llama_model_n_embd instead
int32_t llama_n_embd(const llama_model * model) {
    return llama_model_n_embd(model);
}
8674
// deprecated: use llama_model_n_layer instead
int32_t llama_n_layer(const llama_model * model) {
    return llama_model_n_layer(model);
}
8679
// deprecated: use llama_model_n_head instead
int32_t llama_n_head(const llama_model * model) {
    return llama_model_n_head(model);
}
8684
// map a model architecture to the RoPE (rotary position embedding) variant its
// attention layers use; LLAMA_ROPE_TYPE_NONE means the arch does not use RoPE
llama_rope_type llama_model_rope_type(const llama_model * model) {
    switch (model->arch) {
        // these models do not use RoPE
        case LLM_ARCH_CLIP:
        case LLM_ARCH_GPT2:
        case LLM_ARCH_GPTJ:
        case LLM_ARCH_MPT:
        case LLM_ARCH_REFACT:
        case LLM_ARCH_BLOOM:
        case LLM_ARCH_MAMBA:
        case LLM_ARCH_MAMBA2:
        case LLM_ARCH_JAMBA:
        case LLM_ARCH_JINA_BERT_V2:
        case LLM_ARCH_T5:
        case LLM_ARCH_T5ENCODER:
        case LLM_ARCH_JAIS:
        case LLM_ARCH_RWKV6:
        case LLM_ARCH_RWKV6QWEN2:
        case LLM_ARCH_RWKV7:
        case LLM_ARCH_ARWKV7:
        case LLM_ARCH_WAVTOKENIZER_DEC:
        case LLM_ARCH_NEMOTRON_H:
        case LLM_ARCH_NEMOTRON_H_MOE:
        case LLM_ARCH_KIMI_LINEAR:
            return LLAMA_ROPE_TYPE_NONE;

        // use what we call a normal RoPE, operating on pairs of consecutive head values
        case LLM_ARCH_LLAMA:
        case LLM_ARCH_LLADA:
        case LLM_ARCH_LLAMA4:
        case LLM_ARCH_DECI:
        case LLM_ARCH_BAICHUAN:
        case LLM_ARCH_STARCODER:
        case LLM_ARCH_INTERNLM2:
        case LLM_ARCH_MINICPM:
        case LLM_ARCH_XVERSE:
        case LLM_ARCH_COMMAND_R:
        case LLM_ARCH_COHERE2:
        case LLM_ARCH_OLMO:
        case LLM_ARCH_ARCTIC:
        case LLM_ARCH_DEEPSEEK:
        case LLM_ARCH_DEEPSEEK2:
        case LLM_ARCH_PLM:
        case LLM_ARCH_CHATGLM:
        case LLM_ARCH_GRANITE:
        case LLM_ARCH_GRANITE_MOE:
        case LLM_ARCH_GRANITE_HYBRID:
        case LLM_ARCH_CHAMELEON:
        case LLM_ARCH_BAILINGMOE:
        case LLM_ARCH_NEO_BERT:
        case LLM_ARCH_SMOLLM3:
        case LLM_ARCH_ARCEE:
        case LLM_ARCH_ERNIE4_5:
        case LLM_ARCH_ERNIE4_5_MOE:
        case LLM_ARCH_MISTRAL3:
        case LLM_ARCH_LLAMA_EMBED:
        case LLM_ARCH_MAINCODER:
            return LLAMA_ROPE_TYPE_NORM;

        // the pairs of head values are offset by n_rot/2
        case LLM_ARCH_FALCON:
        case LLM_ARCH_FALCON_H1:
        case LLM_ARCH_GROK:
        case LLM_ARCH_DBRX:
        case LLM_ARCH_BERT:
        case LLM_ARCH_JINA_BERT_V3:
        case LLM_ARCH_MODERN_BERT:
        case LLM_ARCH_NOMIC_BERT:
        case LLM_ARCH_NOMIC_BERT_MOE:
        case LLM_ARCH_STABLELM:
        case LLM_ARCH_BITNET:
        case LLM_ARCH_QWEN:
        case LLM_ARCH_QWEN2:
        case LLM_ARCH_DREAM:
        case LLM_ARCH_QWEN2MOE:
        case LLM_ARCH_QWEN3:
        case LLM_ARCH_QWEN3MOE:
        case LLM_ARCH_LLADA_MOE:
        case LLM_ARCH_RND1:
        case LLM_ARCH_OLMO2:
        case LLM_ARCH_OLMOE:
        case LLM_ARCH_PHI2:
        case LLM_ARCH_PHI3:
        case LLM_ARCH_PHIMOE:
        case LLM_ARCH_PLAMO:
        case LLM_ARCH_PLAMO2:
        case LLM_ARCH_PLAMO3:
        case LLM_ARCH_GEMMA:
        case LLM_ARCH_GEMMA2:
        case LLM_ARCH_GEMMA3:
        case LLM_ARCH_GEMMA3N:
        case LLM_ARCH_GEMMA_EMBEDDING:
        case LLM_ARCH_STARCODER2:
        case LLM_ARCH_OPENELM:
        case LLM_ARCH_GPTNEOX:
        case LLM_ARCH_CODESHELL:
        case LLM_ARCH_ORION:
        case LLM_ARCH_NEMOTRON:
        case LLM_ARCH_EXAONE:
        case LLM_ARCH_EXAONE4:
        case LLM_ARCH_EXAONE_MOE:
        case LLM_ARCH_MINICPM3:
        case LLM_ARCH_BAILINGMOE2:
        case LLM_ARCH_DOTS1:
        case LLM_ARCH_HUNYUAN_MOE:
        case LLM_ARCH_OPENAI_MOE:
        case LLM_ARCH_HUNYUAN_DENSE:
        case LLM_ARCH_LFM2:
        case LLM_ARCH_LFM2MOE:
        case LLM_ARCH_SMALLTHINKER:
        case LLM_ARCH_SEED_OSS:
        case LLM_ARCH_GROVEMOE:
        case LLM_ARCH_APERTUS:
        case LLM_ARCH_MINIMAX_M2:
        case LLM_ARCH_COGVLM:
        case LLM_ARCH_PANGU_EMBED:
        case LLM_ARCH_AFMOE:
        case LLM_ARCH_QWEN3NEXT:
        case LLM_ARCH_MIMO2:
        case LLM_ARCH_STEP35:
            return LLAMA_ROPE_TYPE_NEOX;

        case LLM_ARCH_QWEN2VL:
            return LLAMA_ROPE_TYPE_MROPE;
        case LLM_ARCH_QWEN3VL:
        case LLM_ARCH_QWEN3VLMOE:
        case LLM_ARCH_QWEN35:
        case LLM_ARCH_QWEN35MOE:
            return LLAMA_ROPE_TYPE_IMROPE;

        // GLM4 variants select MROPE at runtime based on the hyperparameters
        case LLM_ARCH_GLM4:
            return model->hparams.use_mrope() ? LLAMA_ROPE_TYPE_MROPE : LLAMA_ROPE_TYPE_NORM;
        case LLM_ARCH_GLM4_MOE:
            return model->hparams.use_mrope() ? LLAMA_ROPE_TYPE_MROPE : LLAMA_ROPE_TYPE_NEOX;

        // all model arches should be listed explicitly here
        case LLM_ARCH_UNKNOWN:
            GGML_ABORT("unknown architecture");
    }

    // not expected to be reached — every arch should be handled in the switch above
    return LLAMA_ROPE_TYPE_NONE;
}
8827
8828float llama_model_rope_freq_scale_train(const llama_model * model) {
8829 return model->hparams.rope_freq_scale_train;
8830}
8831
8832int32_t llama_model_meta_val_str(const llama_model * model, const char * key, char * buf, size_t buf_size) {
8833 const auto & it = model->gguf_kv.find(key);
8834 if (it == model->gguf_kv.end()) {
8835 if (buf_size > 0) {
8836 buf[0] = '\0';
8837 }
8838 return -1;
8839 }
8840 return snprintf(buf, buf_size, "%s", it->second.c_str());
8841}
8842
8843int32_t llama_model_meta_count(const llama_model * model) {
8844 return (int)model->gguf_kv.size();
8845}
8846
// translate a llama_model_meta_key enum value to its GGUF metadata key string;
// returns nullptr for values that have no associated key
const char * llama_model_meta_key_str(llama_model_meta_key key) {
    switch (key) {
        case LLAMA_MODEL_META_KEY_SAMPLING_SEQUENCE: return "general.sampling.sequence";
        case LLAMA_MODEL_META_KEY_SAMPLING_TOP_K: return "general.sampling.top_k";
        case LLAMA_MODEL_META_KEY_SAMPLING_TOP_P: return "general.sampling.top_p";
        case LLAMA_MODEL_META_KEY_SAMPLING_MIN_P: return "general.sampling.min_p";
        case LLAMA_MODEL_META_KEY_SAMPLING_XTC_PROBABILITY: return "general.sampling.xtc_probability";
        case LLAMA_MODEL_META_KEY_SAMPLING_XTC_THRESHOLD: return "general.sampling.xtc_threshold";
        case LLAMA_MODEL_META_KEY_SAMPLING_TEMP: return "general.sampling.temp";
        case LLAMA_MODEL_META_KEY_SAMPLING_PENALTY_LAST_N: return "general.sampling.penalty_last_n";
        case LLAMA_MODEL_META_KEY_SAMPLING_PENALTY_REPEAT: return "general.sampling.penalty_repeat";
        case LLAMA_MODEL_META_KEY_SAMPLING_MIROSTAT: return "general.sampling.mirostat";
        case LLAMA_MODEL_META_KEY_SAMPLING_MIROSTAT_TAU: return "general.sampling.mirostat_tau";
        case LLAMA_MODEL_META_KEY_SAMPLING_MIROSTAT_ETA: return "general.sampling.mirostat_eta";
        default: return nullptr;
    }
}
8864
8865int32_t llama_model_meta_key_by_index(const llama_model * model, int i, char * buf, size_t buf_size) {
8866 if (i < 0 || i >= (int)model->gguf_kv.size()) {
8867 if (buf_size > 0) {
8868 buf[0] = '\0';
8869 }
8870 return -1;
8871 }
8872 auto it = model->gguf_kv.begin();
8873 std::advance(it, i);
8874 return snprintf(buf, buf_size, "%s", it->first.c_str());
8875}
8876
8877int32_t llama_model_meta_val_str_by_index(const llama_model * model, int32_t i, char * buf, size_t buf_size) {
8878 if (i < 0 || i >= (int)model->gguf_kv.size()) {
8879 if (buf_size > 0) {
8880 buf[0] = '\0';
8881 }
8882 return -1;
8883 }
8884 auto it = model->gguf_kv.begin();
8885 std::advance(it, i);
8886 return snprintf(buf, buf_size, "%s", it->second.c_str());
8887}
8888
8889int32_t llama_model_desc(const llama_model * model, char * buf, size_t buf_size) {
8890 return snprintf(buf, buf_size, "%s", model->desc().c_str());
8891}
8892
// total size of the model as reported by llama_model::size()
uint64_t llama_model_size(const llama_model * model) {
    return model->size();
}
8896
8897const char * llama_model_chat_template(const llama_model * model, const char * name) {
8898 const auto key = name ? LLM_KV(model->arch, name)(LLM_KV_TOKENIZER_CHAT_TEMPLATE)
8899 : LLM_KV(model->arch)(LLM_KV_TOKENIZER_CHAT_TEMPLATE);
8900 const auto & it = model->gguf_kv.find(key);
8901 if (it == model->gguf_kv.end()) {
8902 // one-off fix for very popular models (so we are not flooded with issues)
8903 // do not extend this list unless absolutely necessary
8904 // Mistral-Small-2503 does not have built-in chat template
8905 llama_vocab_pre_type pre_type = model->vocab.get_pre_type();
8906 if (!name && pre_type == LLAMA_VOCAB_PRE_TYPE_TEKKEN && model->layers.size() == 40) {
8907 return "mistral-v7-tekken";
8908 }
8909
8910 return nullptr;
8911 }
8912
8913 return it->second.c_str();
8914}
8915
// total number of model parameters as reported by llama_model::n_elements()
uint64_t llama_model_n_params(const llama_model * model) {
    return model->n_elements();
}
8919
8920bool llama_model_has_encoder(const llama_model * model) {
8921 switch (model->arch) {
8922 case LLM_ARCH_T5: return true;
8923 case LLM_ARCH_T5ENCODER: return true;
8924 default: return false;
8925 }
8926}
8927
8928bool llama_model_has_decoder(const llama_model * model) {
8929 switch (model->arch) {
8930 case LLM_ARCH_T5ENCODER: return false;
8931 default: return true;
8932 }
8933}
8934
// decoder start token id from the model hyperparameters
llama_token llama_model_decoder_start_token(const llama_model * model) {
    return model->hparams.dec_start_token_id;
}
8938
// whether the model architecture is recurrent (delegates to llm_arch_is_recurrent)
bool llama_model_is_recurrent(const llama_model * model) {
    return llm_arch_is_recurrent(model->arch);
}
8942
// whether the model architecture is hybrid (delegates to llm_arch_is_hybrid)
bool llama_model_is_hybrid(const llama_model * model) {
    return llm_arch_is_hybrid(model->arch);
}
8946
// whether the model architecture is a diffusion model (delegates to llm_arch_is_diffusion)
bool llama_model_is_diffusion(const llama_model * model) {
    return llm_arch_is_diffusion(model->arch);
}
8950
// internal helper: expose the model's (name, tensor) list; the returned
// reference is owned by the model and valid for the model's lifetime
const std::vector<std::pair<std::string, ggml_tensor *>> & llama_internal_get_tensor_map(const llama_model * model) {
    return model->tensors_by_name;
}