archive llama.cpp-b8008.tar.gz
corpus lotr.txt map1_bromm.txt map1_dagna.txt map1_keldor.txt map1_skara.txt map1_thrain.txt
llama.cpp
.devops
nix apps.nix devshells.nix docker.nix jetson-support.nix nixpkgs-instances.nix package-gguf-py.nix package.nix python-scripts.nix scope.nix sif.nix
cann.Dockerfile cpu.Dockerfile cuda-new.Dockerfile cuda.Dockerfile intel.Dockerfile llama-cli-cann.Dockerfile llama-cpp-cuda.srpm.spec llama-cpp.srpm.spec musa.Dockerfile rocm.Dockerfile s390x.Dockerfile tools.sh vulkan.Dockerfile
.gemini settings.json
.github
ISSUE_TEMPLATE 010-bug-compilation.yml 011-bug-results.yml 019-bug-misc.yml 020-enhancement.yml 030-research.yml 040-refactor.yml config.yml
actions
get-tag-name action.yml
install-exe action.yml
linux-setup-spacemit action.yml
linux-setup-vulkan action.yml
unarchive-tar action.yml
windows-setup-cuda action.yml
windows-setup-rocm action.yml
workflows bench.yml.disabled build-cache.yml build-cmake-pkg.yml build-linux-cross.yml build.yml check-vendor.yml close-issue.yml copilot-setup-steps.yml docker.yml editorconfig.yml gguf-publish.yml labeler.yml pre-tokenizer-hashes.yml python-check-requirements.yml python-lint.yml python-type-check.yml release.yml server-metal.yml server-webui.yml server.yml update-ops-docs.yml winget.yml
labeler.yml pull_request_template.md
benches
dgx-spark aime25_openai__gpt-oss-120b-high_temp1.0_20251109_094547.html aime25_openai__gpt-oss-120b-high_temp1.0_20251109_094547.json aime25_openai__gpt-oss-120b-high_temp1.0_20251109_094547_allresults.json dgx-spark.md
mac-m2-ultra mac-m2-ultra.md
ci README-MUSA.md README.md run.sh
cmake arm64-apple-clang.cmake arm64-windows-llvm.cmake build-info.cmake common.cmake download-models.cmake git-vars.cmake license.cmake llama-config.cmake.in llama.pc.in riscv64-spacemit-linux-gnu-gcc.cmake x64-windows-llvm.cmake
common
jinja README.md caps.cpp caps.h lexer.cpp lexer.h parser.cpp parser.h runtime.cpp runtime.h string.cpp string.h utils.h value.cpp value.h
CMakeLists.txt arg.cpp arg.h base64.hpp build-info.cpp.in chat-parser-xml-toolcall.cpp chat-parser-xml-toolcall.h chat-parser.cpp chat-parser.h chat-peg-parser.cpp chat-peg-parser.h chat.cpp chat.h common.cpp common.h console.cpp console.h debug.cpp debug.h download.cpp download.h http.h json-partial.cpp json-partial.h json-schema-to-grammar.cpp json-schema-to-grammar.h llguidance.cpp log.cpp log.h ngram-cache.cpp ngram-cache.h ngram-map.cpp ngram-map.h ngram-mod.cpp ngram-mod.h peg-parser.cpp peg-parser.h preset.cpp preset.h regex-partial.cpp regex-partial.h sampling.cpp sampling.h speculative.cpp speculative.h unicode.cpp unicode.h
docs
android imported-into-android-studio.jpg
backend
VirtGPU configuration.md development.md
snapdragon CMakeUserPresets.json README.md developer.md windows.md
BLIS.md CANN.md CUDA-FEDORA.md OPENCL.md SYCL.md VirtGPU.md ZenDNN.md zDNN.md
development
llama-star idea-arch.key idea-arch.pdf
HOWTO-add-model.md debugging-tests.md parsing.md token_generation_performance_tips.md
multimodal MobileVLM.md gemma3.md glmedge.md granitevision.md llava.md minicpmo2.6.md minicpmo4.0.md minicpmv2.5.md minicpmv2.6.md minicpmv4.0.md minicpmv4.5.md
ops BLAS.csv CANN.csv CPU.csv CUDA.csv Metal.csv OpenCL.csv SYCL.csv Vulkan.csv WebGPU.csv ZenDNN.csv zDNN.csv
android.md build-riscv64-spacemit.md build-s390x.md build.md docker.md function-calling.md install.md llguidance.md multimodal.md ops.md preset.md speculative.md
examples
batched CMakeLists.txt README.md batched.cpp
batched.swift
Sources main.swift
.gitignore Makefile Package.swift README.md
convert-llama2c-to-ggml CMakeLists.txt README.md convert-llama2c-to-ggml.cpp
debug CMakeLists.txt README.md debug.cpp
deprecation-warning README.md deprecation-warning.cpp
diffusion CMakeLists.txt README.md diffusion-cli.cpp
embedding CMakeLists.txt README.md embedding.cpp
eval-callback CMakeLists.txt README.md eval-callback.cpp
gen-docs CMakeLists.txt gen-docs.cpp
gguf CMakeLists.txt gguf.cpp
gguf-hash
deps
rotate-bits package.json rotate-bits.h
sha1 package.json sha1.c sha1.h
sha256 package.json sha256.c sha256.h
xxhash clib.json xxhash.c xxhash.h
CMakeLists.txt README.md gguf-hash.cpp
idle CMakeLists.txt README.md idle.cpp
llama.android
app
src
main
java
com
example
llama MainActivity.kt MessageAdapter.kt
res
drawable bg_assistant_message.xml bg_user_message.xml ic_launcher_background.xml ic_launcher_foreground.xml outline_folder_open_24.xml outline_send_24.xml
layout activity_main.xml item_message_assistant.xml item_message_user.xml
mipmap-anydpi ic_launcher.xml ic_launcher_round.xml
mipmap-hdpi ic_launcher.webp ic_launcher_round.webp
mipmap-mdpi ic_launcher.webp ic_launcher_round.webp
mipmap-xhdpi ic_launcher.webp ic_launcher_round.webp
mipmap-xxhdpi ic_launcher.webp ic_launcher_round.webp
mipmap-xxxhdpi ic_launcher.webp ic_launcher_round.webp
values colors.xml strings.xml themes.xml
xml backup_rules.xml data_extraction_rules.xml
AndroidManifest.xml
.gitignore build.gradle.kts proguard-rules.pro
gradle
wrapper gradle-wrapper.jar gradle-wrapper.properties
libs.versions.toml
lib
src
androidTest
java
android
llama
cpp ExampleInstrumentedTest.kt
main
cpp CMakeLists.txt ai_chat.cpp logging.h
java
com
arm
aichat
gguf FileType.kt GgufMetadata.kt GgufMetadataReader.kt
internal
gguf GgufMetadataReaderImpl.kt
InferenceEngineImpl.kt
AiChat.kt InferenceEngine.kt
AndroidManifest.xml
test
java
android
llama
cpp ExampleUnitTest.kt
.gitignore build.gradle.kts consumer-rules.pro proguard-rules.pro
.gitignore build.gradle.kts gradle.properties gradlew settings.gradle.kts
llama.swiftui
llama.cpp.swift LibLlama.swift
llama.swiftui
Assets.xcassets
AppIcon.appiconset Contents.json
Contents.json
Models LlamaState.swift
Resources
models .gitignore
UI ContentView.swift DownloadButton.swift InputButton.swift LoadCustomButton.swift
llama_swiftuiApp.swift
llama.swiftui.xcodeproj
project.xcworkspace contents.xcworkspacedata
project.pbxproj
.gitignore README.md
lookahead CMakeLists.txt README.md lookahead.cpp
lookup CMakeLists.txt README.md lookup-create.cpp lookup-merge.cpp lookup-stats.cpp lookup.cpp
model-conversion
scripts
causal compare-embeddings-logits.sh compare-logits.py convert-model.sh modelcard.template run-casual-gen-embeddings-org.py run-converted-model-embeddings-logits.sh run-converted-model.sh run-org-model.py
embedding compare-embeddings-logits.sh convert-model.sh modelcard.template run-converted-model.sh run-original-model.py
utils __init__.py check-nmse.py common.py compare_tokens.py create-collection-add-model.sh curl-embedding-server.sh hf-add-model-to-collection.py hf-create-collection.py hf-create-model.py hf-upload-gguf-model.py inspect-converted-model.sh inspect-org-model.py perplexity-gen.sh perplexity-run-simple.sh perplexity-run.sh quantize.sh run-embedding-server.sh semantic_check.py tensor-info.py
.gitignore Makefile README.md requirements.txt
parallel CMakeLists.txt README.md parallel.cpp
passkey CMakeLists.txt README.md passkey.cpp
retrieval CMakeLists.txt README.md retrieval.cpp
save-load-state CMakeLists.txt save-load-state.cpp
simple CMakeLists.txt README.md simple.cpp
simple-chat CMakeLists.txt README.md simple-chat.cpp
simple-cmake-pkg .gitignore CMakeLists.txt README.md
speculative CMakeLists.txt README.md speculative.cpp
speculative-simple CMakeLists.txt README.md speculative-simple.cpp
sycl CMakeLists.txt README.md build.sh ls-sycl-device.cpp run-llama2.sh test.sh win-build-sycl.bat win-run-llama2.bat win-test.bat
training CMakeLists.txt README.md finetune.cpp
CMakeLists.txt convert_legacy_llama.py json_schema_pydantic_example.py json_schema_to_grammar.py llama.vim pydantic_models_to_grammar.py pydantic_models_to_grammar_examples.py reason-act.sh regex_to_grammar.py server-llama2-13B.sh server_embd.py ts-type-to-grammar.sh
ggml
cmake GitVars.cmake common.cmake ggml-config.cmake.in
include ggml-alloc.h ggml-backend.h ggml-blas.h ggml-cann.h ggml-cpp.h ggml-cpu.h ggml-cuda.h ggml-hexagon.h ggml-metal.h ggml-opencl.h ggml-opt.h ggml-rpc.h ggml-sycl.h ggml-virtgpu.h ggml-vulkan.h ggml-webgpu.h ggml-zdnn.h ggml-zendnn.h ggml.h gguf.h
src
ggml-blas CMakeLists.txt ggml-blas.cpp
ggml-cann CMakeLists.txt acl_tensor.cpp acl_tensor.h aclnn_ops.cpp aclnn_ops.h common.h ggml-cann.cpp
ggml-cpu
amx amx.cpp amx.h common.h mmq.cpp mmq.h
arch
arm cpu-feats.cpp quants.c repack.cpp
loongarch quants.c
powerpc cpu-feats.cpp quants.c
riscv cpu-feats.cpp quants.c repack.cpp
s390 cpu-feats.cpp quants.c
wasm quants.c
x86 cpu-feats.cpp quants.c repack.cpp
cmake FindSIMD.cmake
kleidiai kernels.cpp kernels.h kleidiai.cpp kleidiai.h
llamafile sgemm-ppc.h sgemm.cpp sgemm.h
spacemit ime.cpp ime.h ime1_kernels.cpp ime_kernels.h
CMakeLists.txt arch-fallback.h binary-ops.cpp binary-ops.h common.h ggml-cpu-impl.h ggml-cpu.c ggml-cpu.cpp hbm.cpp hbm.h ops.cpp ops.h quants.c quants.h repack.cpp repack.h simd-mappings.h traits.cpp traits.h unary-ops.cpp unary-ops.h vec.cpp vec.h
ggml-cuda
template-instances fattn-mma-f16-instance-ncols1_1-ncols2_16.cu fattn-mma-f16-instance-ncols1_1-ncols2_32.cu fattn-mma-f16-instance-ncols1_1-ncols2_8.cu fattn-mma-f16-instance-ncols1_16-ncols2_1.cu fattn-mma-f16-instance-ncols1_16-ncols2_2.cu fattn-mma-f16-instance-ncols1_16-ncols2_4.cu fattn-mma-f16-instance-ncols1_2-ncols2_16.cu fattn-mma-f16-instance-ncols1_2-ncols2_32.cu fattn-mma-f16-instance-ncols1_2-ncols2_4.cu fattn-mma-f16-instance-ncols1_2-ncols2_8.cu fattn-mma-f16-instance-ncols1_32-ncols2_1.cu fattn-mma-f16-instance-ncols1_32-ncols2_2.cu fattn-mma-f16-instance-ncols1_4-ncols2_16.cu fattn-mma-f16-instance-ncols1_4-ncols2_2.cu fattn-mma-f16-instance-ncols1_4-ncols2_4.cu fattn-mma-f16-instance-ncols1_4-ncols2_8.cu fattn-mma-f16-instance-ncols1_64-ncols2_1.cu fattn-mma-f16-instance-ncols1_8-ncols2_1.cu fattn-mma-f16-instance-ncols1_8-ncols2_2.cu fattn-mma-f16-instance-ncols1_8-ncols2_4.cu fattn-mma-f16-instance-ncols1_8-ncols2_8.cu fattn-tile-instance-dkq112-dv112.cu fattn-tile-instance-dkq128-dv128.cu fattn-tile-instance-dkq256-dv256.cu fattn-tile-instance-dkq40-dv40.cu fattn-tile-instance-dkq576-dv512.cu fattn-tile-instance-dkq64-dv64.cu fattn-tile-instance-dkq72-dv72.cu fattn-tile-instance-dkq80-dv80.cu fattn-tile-instance-dkq96-dv96.cu fattn-vec-instance-f16-f16.cu fattn-vec-instance-f16-q4_0.cu fattn-vec-instance-f16-q4_1.cu fattn-vec-instance-f16-q5_0.cu fattn-vec-instance-f16-q5_1.cu fattn-vec-instance-f16-q8_0.cu fattn-vec-instance-q4_0-f16.cu fattn-vec-instance-q4_0-q4_0.cu fattn-vec-instance-q4_0-q4_1.cu fattn-vec-instance-q4_0-q5_0.cu fattn-vec-instance-q4_0-q5_1.cu fattn-vec-instance-q4_0-q8_0.cu fattn-vec-instance-q4_1-f16.cu fattn-vec-instance-q4_1-q4_0.cu fattn-vec-instance-q4_1-q4_1.cu fattn-vec-instance-q4_1-q5_0.cu fattn-vec-instance-q4_1-q5_1.cu fattn-vec-instance-q4_1-q8_0.cu fattn-vec-instance-q5_0-f16.cu fattn-vec-instance-q5_0-q4_0.cu fattn-vec-instance-q5_0-q4_1.cu fattn-vec-instance-q5_0-q5_0.cu fattn-vec-instance-q5_0-q5_1.cu 
fattn-vec-instance-q5_0-q8_0.cu fattn-vec-instance-q5_1-f16.cu fattn-vec-instance-q5_1-q4_0.cu fattn-vec-instance-q5_1-q4_1.cu fattn-vec-instance-q5_1-q5_0.cu fattn-vec-instance-q5_1-q5_1.cu fattn-vec-instance-q5_1-q8_0.cu fattn-vec-instance-q8_0-f16.cu fattn-vec-instance-q8_0-q4_0.cu fattn-vec-instance-q8_0-q4_1.cu fattn-vec-instance-q8_0-q5_0.cu fattn-vec-instance-q8_0-q5_1.cu fattn-vec-instance-q8_0-q8_0.cu generate_cu_files.py mmf-instance-ncols_1.cu mmf-instance-ncols_10.cu mmf-instance-ncols_11.cu mmf-instance-ncols_12.cu mmf-instance-ncols_13.cu mmf-instance-ncols_14.cu mmf-instance-ncols_15.cu mmf-instance-ncols_16.cu mmf-instance-ncols_2.cu mmf-instance-ncols_3.cu mmf-instance-ncols_4.cu mmf-instance-ncols_5.cu mmf-instance-ncols_6.cu mmf-instance-ncols_7.cu mmf-instance-ncols_8.cu mmf-instance-ncols_9.cu mmq-instance-iq1_s.cu mmq-instance-iq2_s.cu mmq-instance-iq2_xs.cu mmq-instance-iq2_xxs.cu mmq-instance-iq3_s.cu mmq-instance-iq3_xxs.cu mmq-instance-iq4_nl.cu mmq-instance-iq4_xs.cu mmq-instance-mxfp4.cu mmq-instance-q2_k.cu mmq-instance-q3_k.cu mmq-instance-q4_0.cu mmq-instance-q4_1.cu mmq-instance-q4_k.cu mmq-instance-q5_0.cu mmq-instance-q5_1.cu mmq-instance-q5_k.cu mmq-instance-q6_k.cu mmq-instance-q8_0.cu
vendors cuda.h hip.h musa.h
CMakeLists.txt acc.cu acc.cuh add-id.cu add-id.cuh arange.cu arange.cuh argmax.cu argmax.cuh argsort.cu argsort.cuh binbcast.cu binbcast.cuh clamp.cu clamp.cuh common.cuh concat.cu concat.cuh conv-transpose-1d.cu conv-transpose-1d.cuh conv2d-dw.cu conv2d-dw.cuh conv2d-transpose.cu conv2d-transpose.cuh conv2d.cu conv2d.cuh convert.cu convert.cuh count-equal.cu count-equal.cuh cp-async.cuh cpy-utils.cuh cpy.cu cpy.cuh cross-entropy-loss.cu cross-entropy-loss.cuh cumsum.cu cumsum.cuh dequantize.cuh diag.cu diag.cuh diagmask.cu diagmask.cuh fattn-common.cuh fattn-mma-f16.cuh fattn-tile.cu fattn-tile.cuh fattn-vec.cuh fattn-wmma-f16.cu fattn-wmma-f16.cuh fattn.cu fattn.cuh fill.cu fill.cuh getrows.cu getrows.cuh ggml-cuda.cu gla.cu gla.cuh im2col.cu im2col.cuh mean.cu mean.cuh mma.cuh mmf.cu mmf.cuh mmid.cu mmid.cuh mmq.cu mmq.cuh mmvf.cu mmvf.cuh mmvq.cu mmvq.cuh norm.cu norm.cuh opt-step-adamw.cu opt-step-adamw.cuh opt-step-sgd.cu opt-step-sgd.cuh out-prod.cu out-prod.cuh pad.cu pad.cuh pad_reflect_1d.cu pad_reflect_1d.cuh pool2d.cu pool2d.cuh quantize.cu quantize.cuh reduce_rows.cuh roll.cu roll.cuh rope.cu rope.cuh scale.cu scale.cuh set-rows.cu set-rows.cuh set.cu set.cuh softcap.cu softcap.cuh softmax.cu softmax.cuh solve_tri.cu solve_tri.cuh ssm-conv.cu ssm-conv.cuh ssm-scan.cu ssm-scan.cuh sum.cu sum.cuh sumrows.cu sumrows.cuh top-k.cu top-k.cuh topk-moe.cu topk-moe.cuh tri.cu tri.cuh tsembd.cu tsembd.cuh unary.cu unary.cuh upscale.cu upscale.cuh vecdotq.cuh wkv.cu wkv.cuh
ggml-hexagon
htp CMakeLists.txt act-ops.c argsort-ops.c binary-ops.c cmake-toolchain.cmake cpy-ops.c flash-attn-ops.c get-rows-ops.c hex-dma.c hex-dma.h hex-dump.h hex-fastdiv.h hex-utils.h htp-ctx.h htp-msg.h htp-ops.h htp_iface.idl hvx-arith.h hvx-base.h hvx-copy.h hvx-div.h hvx-dump.h hvx-exp.h hvx-floor.h hvx-inverse.h hvx-reduce.h hvx-scale.h hvx-sigmoid.h hvx-sqrt.h hvx-types.h hvx-utils.h main.c matmul-ops.c rope-ops.c set-rows-ops.c softmax-ops.c sum-rows-ops.c unary-ops.c worker-pool.c worker-pool.h
CMakeLists.txt ggml-hexagon.cpp htp-drv.cpp htp-drv.h libdl.h libggml-htp.inf op-desc.h
ggml-hip CMakeLists.txt
ggml-metal CMakeLists.txt ggml-metal-common.cpp ggml-metal-common.h ggml-metal-context.h ggml-metal-context.m ggml-metal-device.cpp ggml-metal-device.h ggml-metal-device.m ggml-metal-impl.h ggml-metal-ops.cpp ggml-metal-ops.h ggml-metal.cpp ggml-metal.metal
ggml-musa CMakeLists.txt mudnn.cu mudnn.cuh
ggml-opencl
kernels add.cl add_id.cl argsort.cl clamp.cl concat.cl conv2d.cl conv2d_f16_f32.cl cpy.cl cvt.cl diag_mask_inf.cl div.cl embed_kernel.py expm1.cl fill.cl flash_attn_f16.cl flash_attn_f32.cl flash_attn_f32_f16.cl gelu.cl gemm_moe_mxfp4_f32.cl gemv_moe_mxfp4_f32.cl gemv_noshuffle.cl gemv_noshuffle_general.cl gemv_noshuffle_general_q8_0_f32.cl get_rows.cl glu.cl group_norm.cl im2col_f16.cl im2col_f32.cl mean.cl mul.cl mul_mat_Ab_Bi_8x4.cl mul_mat_f16_f32.cl mul_mm_f16_f32_kq_kqv.cl mul_mm_f16_f32_l4_lm.cl mul_mm_f32_f32_l4_lm.cl mul_mm_q6_k_f32_l4_lm.cl mul_mm_q8_0_f32_8x4.cl mul_mm_q8_0_f32_l4_lm.cl mul_mv_f16_f16.cl mul_mv_f16_f32.cl mul_mv_f16_f32_1row.cl mul_mv_f16_f32_l4.cl mul_mv_f32_f32.cl mul_mv_id_mxfp4_f32.cl mul_mv_id_mxfp4_f32_flat.cl mul_mv_id_q4_0_f32_8x_flat.cl mul_mv_id_q8_0_f32.cl mul_mv_id_q8_0_f32_flat.cl mul_mv_mxfp4_f32.cl mul_mv_mxfp4_f32_flat.cl mul_mv_q4_0_f32.cl mul_mv_q4_0_f32_1d_16x_flat.cl mul_mv_q4_0_f32_1d_8x_flat.cl mul_mv_q4_0_f32_8x_flat.cl mul_mv_q4_0_f32_v.cl mul_mv_q4_k_f32.cl mul_mv_q6_k_f32.cl mul_mv_q6_k_f32_flat.cl mul_mv_q8_0_f32.cl mul_mv_q8_0_f32_flat.cl norm.cl pad.cl relu.cl repeat.cl rms_norm.cl rope.cl scale.cl set_rows.cl sigmoid.cl silu.cl softmax_4_f16.cl softmax_4_f32.cl softmax_f16.cl softmax_f32.cl softplus.cl solve_tri.cl sqr.cl sqrt.cl ssm_conv.cl sub.cl sum_rows.cl tanh.cl transpose.cl tri.cl tsembd.cl upscale.cl
CMakeLists.txt ggml-opencl.cpp
ggml-rpc CMakeLists.txt ggml-rpc.cpp
ggml-sycl
dpct helper.hpp
CMakeLists.txt add-id.cpp add-id.hpp backend.hpp binbcast.cpp binbcast.hpp common.cpp common.hpp concat.cpp concat.hpp conv.cpp conv.hpp convert.cpp convert.hpp count-equal.cpp count-equal.hpp cpy.cpp cpy.hpp dequantize.hpp dmmv.cpp dmmv.hpp element_wise.cpp element_wise.hpp gemm.hpp getrows.cpp getrows.hpp ggml-sycl.cpp gla.cpp gla.hpp im2col.cpp im2col.hpp mmq.cpp mmq.hpp mmvq.cpp mmvq.hpp norm.cpp norm.hpp outprod.cpp outprod.hpp pad.cpp pad.hpp pad_reflect_1d.cpp pad_reflect_1d.hpp presets.hpp quantize.hpp quants.hpp repeat_back.cpp repeat_back.hpp roll.cpp roll.hpp rope.cpp rope.hpp set.cpp set.hpp set_rows.cpp set_rows.hpp softmax.cpp softmax.hpp ssm_conv.cpp ssm_conv.hpp sycl_hw.cpp sycl_hw.hpp tsembd.cpp tsembd.hpp vecdotq.hpp wkv.cpp wkv.hpp
ggml-virtgpu
backend
shared api_remoting.h apir_backend.gen.h apir_backend.h apir_cs.h apir_cs_ggml.h apir_cs_rpc.h
CMakeLists.txt apir_cs_ggml-rpc-back.cpp backend-convert.h backend-dispatched-backend.cpp backend-dispatched-buffer-type.cpp backend-dispatched-buffer.cpp backend-dispatched-device.cpp backend-dispatched.cpp backend-dispatched.gen.h backend-dispatched.h backend-virgl-apir.h backend.cpp
include apir_hw.h
CMakeLists.txt apir_cs_ggml-rpc-front.cpp ggml-backend-buffer-type.cpp ggml-backend-buffer.cpp ggml-backend-device.cpp ggml-backend-reg.cpp ggml-backend.cpp ggml-remoting.h ggmlremoting_functions.yaml regenerate_remoting.py virtgpu-apir.h virtgpu-forward-backend.cpp virtgpu-forward-buffer-type.cpp virtgpu-forward-buffer.cpp virtgpu-forward-device.cpp virtgpu-forward-impl.h virtgpu-forward.gen.h virtgpu-shm.cpp virtgpu-shm.h virtgpu-utils.cpp virtgpu-utils.h virtgpu.cpp virtgpu.h
ggml-vulkan
cmake host-toolchain.cmake.in
vulkan-shaders
feature-tests bfloat16.comp coopmat.comp coopmat2.comp integer_dot.comp
CMakeLists.txt abs.comp acc.comp add.comp add1.comp add_id.comp arange.comp argmax.comp argsort.comp argsort_large.comp ceil.comp clamp.comp concat.comp contig_copy.comp conv2d_dw.comp conv2d_mm.comp conv_transpose_1d.comp copy.comp copy_from_quant.comp copy_to_quant.comp copy_transpose.comp cos.comp count_equal.comp count_experts.comp cumsum.comp cumsum_multipass1.comp cumsum_multipass2.comp dequant_f32.comp dequant_funcs.glsl dequant_funcs_cm2.glsl dequant_head.glsl dequant_iq1_m.comp dequant_iq1_s.comp dequant_iq2_s.comp dequant_iq2_xs.comp dequant_iq2_xxs.comp dequant_iq3_s.comp dequant_iq3_xxs.comp dequant_iq4_nl.comp dequant_iq4_xs.comp dequant_mxfp4.comp dequant_q2_k.comp dequant_q3_k.comp dequant_q4_0.comp dequant_q4_1.comp dequant_q4_k.comp dequant_q5_0.comp dequant_q5_1.comp dequant_q5_k.comp dequant_q6_k.comp dequant_q8_0.comp diag.comp diag_mask_inf.comp div.comp exp.comp fill.comp flash_attn.comp flash_attn_base.glsl flash_attn_cm1.comp flash_attn_cm2.comp flash_attn_mask_opt.comp flash_attn_split_k_reduce.comp floor.comp geglu.comp geglu_erf.comp geglu_quick.comp gelu.comp gelu_erf.comp gelu_quick.comp generic_binary_head.glsl generic_head.glsl generic_unary_head.glsl get_rows.comp get_rows_quant.comp glu_head.glsl glu_main.glsl group_norm.comp hardsigmoid.comp hardswish.comp im2col.comp im2col_3d.comp l2_norm.comp leaky_relu.comp log.comp mul.comp mul_mat_split_k_reduce.comp mul_mat_vec.comp mul_mat_vec_base.glsl mul_mat_vec_iface.glsl mul_mat_vec_iq1_m.comp mul_mat_vec_iq1_s.comp mul_mat_vec_iq2_s.comp mul_mat_vec_iq2_xs.comp mul_mat_vec_iq2_xxs.comp mul_mat_vec_iq3_s.comp mul_mat_vec_iq3_xxs.comp mul_mat_vec_nc.comp mul_mat_vec_p021.comp mul_mat_vec_q2_k.comp mul_mat_vec_q3_k.comp mul_mat_vec_q4_k.comp mul_mat_vec_q5_k.comp mul_mat_vec_q6_k.comp mul_mat_vecq.comp mul_mat_vecq_funcs.glsl mul_mm.comp mul_mm_cm2.comp mul_mm_funcs.glsl mul_mm_id_funcs.glsl mul_mmq.comp mul_mmq_funcs.glsl mul_mmq_shmem_types.glsl multi_add.comp neg.comp norm.comp 
opt_step_adamw.comp opt_step_sgd.comp pad.comp pool2d.comp quantize_q8_1.comp reglu.comp relu.comp repeat.comp repeat_back.comp rms_norm.comp rms_norm_back.comp rms_norm_partials.comp roll.comp rope_funcs.glsl rope_head.glsl rope_multi.comp rope_neox.comp rope_norm.comp rope_params.glsl rope_vision.comp round.comp rte.glsl scale.comp sigmoid.comp silu.comp silu_back.comp sin.comp soft_max.comp soft_max_back.comp soft_max_large1.comp soft_max_large2.comp soft_max_large3.comp soft_max_large_common.glsl softplus.comp solve_tri.comp sqrt.comp square.comp ssm_conv.comp ssm_scan.comp step.comp sub.comp sum_rows.comp sum_rows.glsl swiglu.comp swiglu_oai.comp tanh.comp timestep_embedding.comp topk_argsort.comp topk_moe.comp topk_nary_search.comp tri.comp trunc.comp types.glsl upscale.comp utils.glsl vulkan-shaders-gen.cpp wkv6.comp wkv7.comp xielu.comp
CMakeLists.txt ggml-vulkan.cpp
ggml-webgpu
wgsl-shaders argmax.wgsl argsort.wgsl argsort_merge.wgsl binary.wgsl common_decls.tmpl cpy.tmpl.wgsl cumsum.wgsl embed_wgsl.py flash_attn.wgsl get_rows.tmpl.wgsl glu.tmpl.wgsl memset.wgsl mul_mat.tmpl.wgsl mul_mat_decls.tmpl mul_mat_reg_tile.tmpl.wgsl mul_mat_subgroup_matrix.tmpl.wgsl mul_mat_vec.tmpl.wgsl pad.wgsl rms_norm.wgsl rope.tmpl.wgsl scale.tmpl.wgsl set_rows.wgsl soft_max.tmpl.wgsl sum_rows.wgsl unary.wgsl
CMakeLists.txt ggml-webgpu-shader-lib.hpp ggml-webgpu.cpp pre_wgsl.hpp
ggml-zdnn .gitignore CMakeLists.txt common.hpp ggml-zdnn.cpp mmf.cpp mmf.hpp utils.cpp utils.hpp
ggml-zendnn CMakeLists.txt ggml-zendnn.cpp
CMakeLists.txt ggml-alloc.c ggml-backend-dl.cpp ggml-backend-dl.h ggml-backend-impl.h ggml-backend-reg.cpp ggml-backend.cpp ggml-common.h ggml-impl.h ggml-opt.cpp ggml-quants.c ggml-quants.h ggml-threading.cpp ggml-threading.h ggml.c ggml.cpp gguf.cpp
.gitignore CMakeLists.txt
gguf-py
examples reader.py writer.py
gguf
scripts gguf_convert_endian.py gguf_dump.py gguf_editor_gui.py gguf_hash.py gguf_new_metadata.py gguf_set_metadata.py
__init__.py constants.py gguf.py gguf_reader.py gguf_writer.py lazy.py metadata.py py.typed quants.py tensor_mapping.py utility.py vocab.py
tests __init__.py test_metadata.py test_quants.py
LICENSE README.md pyproject.toml
grammars README.md arithmetic.gbnf c.gbnf chess.gbnf english.gbnf japanese.gbnf json.gbnf json_arr.gbnf list.gbnf
include llama-cpp.h llama.h
licenses LICENSE-jsonhpp
media llama0-banner.png llama0-logo.png llama1-banner.png llama1-icon-transparent.png llama1-icon-transparent.svg llama1-icon.png llama1-icon.svg llama1-logo.png llama1-logo.svg matmul.png matmul.svg
models
templates Apertus-8B-Instruct.jinja ByteDance-Seed-OSS.jinja CohereForAI-c4ai-command-r-plus-tool_use.jinja CohereForAI-c4ai-command-r7b-12-2024-tool_use.jinja GLM-4.6.jinja Kimi-K2-Instruct.jinja Kimi-K2-Thinking.jinja MiMo-VL.jinja MiniMax-M2.jinja Mistral-Small-3.2-24B-Instruct-2506.jinja NVIDIA-Nemotron-3-Nano-30B-A3B-BF16.jinja NVIDIA-Nemotron-Nano-v2.jinja NousResearch-Hermes-2-Pro-Llama-3-8B-tool_use.jinja NousResearch-Hermes-3-Llama-3.1-8B-tool_use.jinja Qwen-QwQ-32B.jinja Qwen-Qwen2.5-7B-Instruct.jinja Qwen-Qwen3-0.6B.jinja Qwen3-Coder.jinja README.md deepseek-ai-DeepSeek-R1-Distill-Llama-8B.jinja deepseek-ai-DeepSeek-R1-Distill-Qwen-32B.jinja deepseek-ai-DeepSeek-V3.1.jinja fireworks-ai-llama-3-firefunction-v2.jinja google-gemma-2-2b-it.jinja ibm-granite-granite-3.3-2B-Instruct.jinja llama-cpp-deepseek-r1.jinja llama-cpp-lfm2.jinja llama-cpp-rwkv-world.jinja meetkai-functionary-medium-v3.1.jinja meetkai-functionary-medium-v3.2.jinja meta-llama-Llama-3.1-8B-Instruct.jinja meta-llama-Llama-3.2-3B-Instruct.jinja meta-llama-Llama-3.3-70B-Instruct.jinja microsoft-Phi-3.5-mini-instruct.jinja mistralai-Ministral-3-14B-Reasoning-2512.jinja mistralai-Mistral-Nemo-Instruct-2407.jinja moonshotai-Kimi-K2.jinja openai-gpt-oss-120b.jinja unsloth-Apriel-1.5.jinja unsloth-mistral-Devstral-Small-2507.jinja upstage-Solar-Open-100B.jinja
.editorconfig ggml-vocab-aquila.gguf ggml-vocab-baichuan.gguf ggml-vocab-bert-bge.gguf ggml-vocab-bert-bge.gguf.inp ggml-vocab-bert-bge.gguf.out ggml-vocab-command-r.gguf ggml-vocab-command-r.gguf.inp ggml-vocab-command-r.gguf.out ggml-vocab-deepseek-coder.gguf ggml-vocab-deepseek-coder.gguf.inp ggml-vocab-deepseek-coder.gguf.out ggml-vocab-deepseek-llm.gguf ggml-vocab-deepseek-llm.gguf.inp ggml-vocab-deepseek-llm.gguf.out ggml-vocab-falcon.gguf ggml-vocab-falcon.gguf.inp ggml-vocab-falcon.gguf.out ggml-vocab-gpt-2.gguf ggml-vocab-gpt-2.gguf.inp ggml-vocab-gpt-2.gguf.out ggml-vocab-gpt-neox.gguf ggml-vocab-llama-bpe.gguf ggml-vocab-llama-bpe.gguf.inp ggml-vocab-llama-bpe.gguf.out ggml-vocab-llama-spm.gguf ggml-vocab-llama-spm.gguf.inp ggml-vocab-llama-spm.gguf.out ggml-vocab-mpt.gguf ggml-vocab-mpt.gguf.inp ggml-vocab-mpt.gguf.out ggml-vocab-nomic-bert-moe.gguf ggml-vocab-phi-3.gguf ggml-vocab-phi-3.gguf.inp ggml-vocab-phi-3.gguf.out ggml-vocab-qwen2.gguf ggml-vocab-qwen2.gguf.inp ggml-vocab-qwen2.gguf.out ggml-vocab-refact.gguf ggml-vocab-refact.gguf.inp ggml-vocab-refact.gguf.out ggml-vocab-starcoder.gguf ggml-vocab-starcoder.gguf.inp ggml-vocab-starcoder.gguf.out
pocs
vdot CMakeLists.txt q8dot.cpp vdot.cpp
CMakeLists.txt
requirements requirements-all.txt requirements-compare-llama-bench.txt requirements-convert_hf_to_gguf.txt requirements-convert_hf_to_gguf_update.txt requirements-convert_legacy_llama.txt requirements-convert_llama_ggml_to_gguf.txt requirements-convert_lora_to_gguf.txt requirements-gguf_editor_gui.txt requirements-pydantic.txt requirements-server-bench.txt requirements-test-tokenizer-random.txt requirements-tool_bench.txt
scripts
apple validate-apps.sh validate-ios.sh validate-macos.sh validate-tvos.sh validate-visionos.sh
jinja jinja-tester.py requirements.txt
snapdragon
adb llama-cli.farf run-bench.sh run-cli.sh run-completion.sh run-mtmd.sh run-tool.sh
qdc
tests test_bench.py
readme.md requirements.txt
windows run-bench.ps1 run-cli.ps1 run-tool.ps1 setup-build.ps1
bench-models.sh build-info.sh check-requirements.sh compare-commits.sh compare-llama-bench.py compare-logprobs.py create_ops_docs.py debug-test.sh fetch_server_test_models.py gen-authors.sh gen-unicode-data.py get-flags.mk get-hellaswag.sh get-pg.sh get-wikitext-103.sh get-wikitext-2.sh get-winogrande.sh get_chat_template.py hf.sh install-oneapi.bat pr2wt.sh serve-static.js server-bench.py sync-ggml-am.sh sync-ggml.last sync-ggml.sh sync_vendor.py tool_bench.py tool_bench.sh verify-checksum-models.py xxd.cmake
src
models afmoe.cpp apertus.cpp arcee.cpp arctic.cpp arwkv7.cpp baichuan.cpp bailingmoe.cpp bailingmoe2.cpp bert.cpp bitnet.cpp bloom.cpp chameleon.cpp chatglm.cpp codeshell.cpp cogvlm.cpp cohere2-iswa.cpp command-r.cpp dbrx.cpp deci.cpp deepseek.cpp deepseek2.cpp dots1.cpp dream.cpp ernie4-5-moe.cpp ernie4-5.cpp exaone-moe.cpp exaone.cpp exaone4.cpp falcon-h1.cpp falcon.cpp gemma-embedding.cpp gemma.cpp gemma2-iswa.cpp gemma3.cpp gemma3n-iswa.cpp glm4-moe.cpp glm4.cpp gpt2.cpp gptneox.cpp granite-hybrid.cpp granite.cpp graph-context-mamba.cpp grok.cpp grovemoe.cpp hunyuan-dense.cpp hunyuan-moe.cpp internlm2.cpp jais.cpp jamba.cpp kimi-linear.cpp lfm2.cpp llada-moe.cpp llada.cpp llama-iswa.cpp llama.cpp maincoder.cpp mamba.cpp mimo2-iswa.cpp minicpm3.cpp minimax-m2.cpp mistral3.cpp models.h modern-bert.cpp mpt.cpp nemotron-h.cpp nemotron.cpp neo-bert.cpp olmo.cpp olmo2.cpp olmoe.cpp openai-moe-iswa.cpp openelm.cpp orion.cpp pangu-embedded.cpp phi2.cpp phi3.cpp plamo.cpp plamo2.cpp plamo3.cpp plm.cpp qwen.cpp qwen2.cpp qwen2moe.cpp qwen2vl.cpp qwen3.cpp qwen35.cpp qwen35moe.cpp qwen3moe.cpp qwen3next.cpp qwen3vl-moe.cpp qwen3vl.cpp refact.cpp rnd1.cpp rwkv6-base.cpp rwkv6.cpp rwkv6qwen2.cpp rwkv7-base.cpp rwkv7.cpp seed-oss.cpp smallthinker.cpp smollm3.cpp stablelm.cpp starcoder.cpp starcoder2.cpp step35-iswa.cpp t5-dec.cpp t5-enc.cpp wavtokenizer-dec.cpp xverse.cpp
CMakeLists.txt llama-adapter.cpp llama-adapter.h llama-arch.cpp llama-arch.h llama-batch.cpp llama-batch.h llama-chat.cpp llama-chat.h llama-context.cpp llama-context.h llama-cparams.cpp llama-cparams.h llama-grammar.cpp llama-grammar.h llama-graph.cpp llama-graph.h llama-hparams.cpp llama-hparams.h llama-impl.cpp llama-impl.h llama-io.cpp llama-io.h llama-kv-cache-iswa.cpp llama-kv-cache-iswa.h llama-kv-cache.cpp llama-kv-cache.h llama-kv-cells.h llama-memory-hybrid-iswa.cpp llama-memory-hybrid-iswa.h llama-memory-hybrid.cpp llama-memory-hybrid.h llama-memory-recurrent.cpp llama-memory-recurrent.h llama-memory.cpp llama-memory.h llama-mmap.cpp llama-mmap.h llama-model-loader.cpp llama-model-loader.h llama-model-saver.cpp llama-model-saver.h llama-model.cpp llama-model.h llama-quant.cpp llama-quant.h llama-sampler.cpp llama-sampler.h llama-vocab.cpp llama-vocab.h llama.cpp unicode-data.cpp unicode-data.h unicode.cpp unicode.h
tests
peg-parser simple-tokenize.cpp simple-tokenize.h test-basic.cpp test-gbnf-generation.cpp test-json-parser.cpp test-json-serialization.cpp test-unicode.cpp tests.h
.gitignore CMakeLists.txt get-model.cpp get-model.h run-json-schema-to-grammar.mjs test-alloc.cpp test-arg-parser.cpp test-autorelease.cpp test-backend-ops.cpp test-backend-sampler.cpp test-barrier.cpp test-c.c test-chat-parser.cpp test-chat-peg-parser.cpp test-chat-template.cpp test-chat.cpp test-double-float.cpp test-gbnf-validator.cpp test-gguf.cpp test-grammar-integration.cpp test-grammar-llguidance.cpp test-grammar-parser.cpp test-jinja.cpp test-json-partial.cpp test-json-schema-to-grammar.cpp test-llama-grammar.cpp test-log.cpp test-lora-conversion-inference.sh test-model-load-cancel.cpp test-mtmd-c-api.c test-opt.cpp test-peg-parser.cpp test-quantize-fns.cpp test-quantize-perf.cpp test-quantize-stats.cpp test-regex-partial.cpp test-rope.cpp test-sampling.cpp test-state-restore-fragmented.cpp test-thread-safety.cpp test-tokenizer-0.cpp test-tokenizer-0.py test-tokenizer-0.sh test-tokenizer-1-bpe.cpp test-tokenizer-1-spm.cpp test-tokenizer-random.py test-tokenizers-repo.sh testing.h
tools
batched-bench CMakeLists.txt README.md batched-bench.cpp
cli CMakeLists.txt README.md cli.cpp
completion CMakeLists.txt README.md completion.cpp
cvector-generator CMakeLists.txt README.md completions.txt cvector-generator.cpp mean.hpp negative.txt pca.hpp positive.txt
export-lora CMakeLists.txt README.md export-lora.cpp
fit-params CMakeLists.txt README.md fit-params.cpp
gguf-split CMakeLists.txt README.md gguf-split.cpp tests.sh
imatrix CMakeLists.txt README.md imatrix.cpp
llama-bench CMakeLists.txt README.md llama-bench.cpp
mtmd
legacy-models convert_image_encoder_to_gguf.py glmedge-convert-image-encoder-to-gguf.py glmedge-surgery.py llava_surgery.py llava_surgery_v2.py minicpmv-convert-image-encoder-to-gguf.py minicpmv-surgery.py
models cogvlm.cpp conformer.cpp glm4v.cpp internvl.cpp kimik25.cpp kimivl.cpp llama4.cpp llava.cpp minicpmv.cpp mobilenetv5.cpp models.h pixtral.cpp qwen2vl.cpp qwen3vl.cpp siglip.cpp whisper-enc.cpp youtuvl.cpp
CMakeLists.txt README.md clip-graph.h clip-impl.h clip-model.h clip.cpp clip.h deprecation-warning.cpp mtmd-audio.cpp mtmd-audio.h mtmd-cli.cpp mtmd-helper.cpp mtmd-helper.h mtmd.cpp mtmd.h requirements.txt test-1.jpeg test-2.mp3 tests.sh
perplexity CMakeLists.txt README.md perplexity.cpp
quantize CMakeLists.txt README.md quantize.cpp tests.sh
rpc CMakeLists.txt README.md rpc-server.cpp
server
bench README.md bench.py prometheus.yml requirements.txt script.js
public index.html.gz loading.html
public_legacy colorthemes.css completion.js favicon.ico index-new.html index.html index.js json-schema-to-grammar.mjs loading.html prompt-formats.js style.css system-prompts.js theme-beeninorder.css theme-ketivah.css theme-mangotango.css theme-playground.css theme-polarnight.css theme-snowstorm.css
public_simplechat datautils.mjs index.html readme.md simplechat.css simplechat.js simplechat_screens.webp ui.mjs
tests
unit test_basic.py test_chat_completion.py test_compat_anthropic.py test_compat_oai_responses.py test_completion.py test_ctx_shift.py test_embedding.py test_infill.py test_lora.py test_rerank.py test_router.py test_security.py test_sleep.py test_slot_save.py test_speculative.py test_template.py test_tokenize.py test_tool_call.py test_vision_api.py
.gitignore README.md conftest.py pytest.ini requirements.txt tests.sh utils.py
themes
buttons-top README.md buttons_top.png favicon.ico index.html
wild README.md favicon.ico index.html llama_cpp.png llamapattern.png wild.png
README.md
webui
.storybook ModeWatcherDecorator.svelte TooltipProviderDecorator.svelte main.ts preview.ts vitest.setup.ts
docs
architecture high-level-architecture-simplified.md high-level-architecture.md
flows chat-flow.md conversations-flow.md data-flow-simplified-model-mode.md data-flow-simplified-router-mode.md database-flow.md models-flow.md server-flow.md settings-flow.md
scripts dev.sh install-git-hooks.sh post-build.sh
src
lib
components
app
chat
ChatAttachments ChatAttachmentPreview.svelte ChatAttachmentThumbnailFile.svelte ChatAttachmentThumbnailImage.svelte ChatAttachmentsList.svelte ChatAttachmentsViewAll.svelte
ChatForm
ChatFormActions ChatFormActionFileAttachments.svelte ChatFormActionRecord.svelte ChatFormActionSubmit.svelte ChatFormActions.svelte
ChatForm.svelte ChatFormFileInputInvisible.svelte ChatFormHelperText.svelte ChatFormTextarea.svelte
ChatMessages ChatMessage.svelte ChatMessageActions.svelte ChatMessageAssistant.svelte ChatMessageBranchingControls.svelte ChatMessageEditForm.svelte ChatMessageStatistics.svelte ChatMessageSystem.svelte ChatMessageThinkingBlock.svelte ChatMessageUser.svelte ChatMessages.svelte
ChatScreen ChatScreen.svelte ChatScreenDragOverlay.svelte ChatScreenHeader.svelte ChatScreenProcessingInfo.svelte
ChatSettings ChatSettings.svelte ChatSettingsFields.svelte ChatSettingsFooter.svelte ChatSettingsImportExportTab.svelte ChatSettingsParameterSourceIndicator.svelte
ChatSidebar ChatSidebar.svelte ChatSidebarActions.svelte ChatSidebarConversationItem.svelte ChatSidebarSearch.svelte handle-mobile-sidebar-item-click.ts
dialogs DialogChatAttachmentPreview.svelte DialogChatAttachmentsViewAll.svelte DialogChatError.svelte DialogChatSettings.svelte DialogConfirmation.svelte DialogConversationSelection.svelte DialogConversationTitleUpdate.svelte DialogEmptyFileAlert.svelte DialogModelInformation.svelte DialogModelNotAvailable.svelte
misc ActionButton.svelte ActionDropdown.svelte BadgeChatStatistic.svelte BadgeInfo.svelte BadgeModality.svelte CodePreviewDialog.svelte ConversationSelection.svelte CopyToClipboardIcon.svelte KeyboardShortcutInfo.svelte MarkdownContent.svelte RemoveButton.svelte SearchInput.svelte SyntaxHighlightedCode.svelte
models ModelBadge.svelte ModelsSelector.svelte
server ServerErrorSplash.svelte ServerLoadingSplash.svelte ServerStatus.svelte
index.ts
ui
alert alert-description.svelte alert-title.svelte alert.svelte index.ts
alert-dialog alert-dialog-action.svelte alert-dialog-cancel.svelte alert-dialog-content.svelte alert-dialog-description.svelte alert-dialog-footer.svelte alert-dialog-header.svelte alert-dialog-overlay.svelte alert-dialog-title.svelte alert-dialog-trigger.svelte index.ts
badge badge.svelte index.ts
button button.svelte index.ts
card card-action.svelte card-content.svelte card-description.svelte card-footer.svelte card-header.svelte card-title.svelte card.svelte index.ts
checkbox checkbox.svelte index.ts
collapsible collapsible-content.svelte collapsible-trigger.svelte collapsible.svelte index.ts
dialog dialog-close.svelte dialog-content.svelte dialog-description.svelte dialog-footer.svelte dialog-header.svelte dialog-overlay.svelte dialog-title.svelte dialog-trigger.svelte index.ts
dropdown-menu dropdown-menu-checkbox-item.svelte dropdown-menu-content.svelte dropdown-menu-group-heading.svelte dropdown-menu-group.svelte dropdown-menu-item.svelte dropdown-menu-label.svelte dropdown-menu-radio-group.svelte dropdown-menu-radio-item.svelte dropdown-menu-separator.svelte dropdown-menu-shortcut.svelte dropdown-menu-sub-content.svelte dropdown-menu-sub-trigger.svelte dropdown-menu-trigger.svelte index.ts
input index.ts input.svelte
label index.ts label.svelte
popover index.ts popover-close.svelte popover-content.svelte popover-portal.svelte popover-trigger.svelte popover.svelte
scroll-area index.ts scroll-area-scrollbar.svelte scroll-area.svelte
select index.ts select-content.svelte select-group-heading.svelte select-group.svelte select-item.svelte select-label.svelte select-scroll-down-button.svelte select-scroll-up-button.svelte select-separator.svelte select-trigger.svelte
separator index.ts separator.svelte
sheet index.ts sheet-close.svelte sheet-content.svelte sheet-description.svelte sheet-footer.svelte sheet-header.svelte sheet-overlay.svelte sheet-title.svelte sheet-trigger.svelte
sidebar constants.ts context.svelte.ts index.ts sidebar-content.svelte sidebar-footer.svelte sidebar-group-action.svelte sidebar-group-content.svelte sidebar-group-label.svelte sidebar-group.svelte sidebar-header.svelte sidebar-input.svelte sidebar-inset.svelte sidebar-menu-action.svelte sidebar-menu-badge.svelte sidebar-menu-button.svelte sidebar-menu-item.svelte sidebar-menu-skeleton.svelte sidebar-menu-sub-button.svelte sidebar-menu-sub-item.svelte sidebar-menu-sub.svelte sidebar-menu.svelte sidebar-provider.svelte sidebar-rail.svelte sidebar-separator.svelte sidebar-trigger.svelte sidebar.svelte
skeleton index.ts skeleton.svelte
switch index.ts switch.svelte
table index.ts table-body.svelte table-caption.svelte table-cell.svelte table-footer.svelte table-head.svelte table-header.svelte table-row.svelte table.svelte
textarea index.ts textarea.svelte
tooltip index.ts tooltip-content.svelte tooltip-trigger.svelte
utils.ts
constants auto-scroll.ts binary-detection.ts default-context.ts floating-ui-constraints.ts icons.ts input-classes.ts latex-protection.ts literal-html.ts localstorage-keys.ts max-bundle-size.ts precision.ts processing-info.ts settings-config.ts supported-file-types.ts table-html-restorer.ts tooltip-config.ts viewport.ts
enums attachment.ts chat.ts files.ts index.ts model.ts server.ts
hooks is-mobile.svelte.ts use-model-change-validation.svelte.ts use-processing-state.svelte.ts
markdown enhance-code-blocks.ts enhance-links.ts literal-html.ts table-html-restorer.ts
services chat.ts database.ts index.ts models.ts parameter-sync.spec.ts parameter-sync.ts props.ts
stores chat.svelte.ts conversations.svelte.ts models.svelte.ts persisted.svelte.ts server.svelte.ts settings.svelte.ts
types api.d.ts chat.d.ts database.d.ts index.ts models.d.ts settings.d.ts
utils api-headers.ts api-key-validation.ts attachment-display.ts attachment-type.ts audio-recording.ts autoresize-textarea.ts branching.ts browser-only.ts clipboard.ts config-helpers.ts conversation-utils.ts convert-files-to-extra.ts file-preview.ts file-type.ts formatters.ts index.ts is-ime-composing.ts latex-protection.ts modality-file-validation.ts model-names.ts pdf-processing.ts portal-to-body.ts precision.ts process-uploaded-files.ts svg-to-png.ts syntax-highlight-language.ts text-files.ts text.ts webp-to-png.ts
routes
chat
[id] +page.svelte +page.ts
+error.svelte +layout.svelte +page.svelte +page.ts
styles katex-custom.scss
app.css app.d.ts app.html
static favicon.svg loading.html
tests
client
components TestWrapper.svelte
page.svelte.test.ts
e2e demo.test.ts
stories
fixtures
assets 1.jpg beautiful-flowers-lotus.webp example.pdf hf-logo.svg
ai-tutorial.ts api-docs.ts blog-post.ts data-analysis.ts empty.ts math-formulas.ts readme.ts storybook-mocks.ts
ChatForm.stories.svelte ChatMessage.stories.svelte ChatSettings.stories.svelte ChatSidebar.stories.svelte Introduction.mdx MarkdownContent.stories.svelte
unit clipboard.test.ts latex-protection.test.ts model-names.test.ts
.gitignore .npmrc .prettierignore .prettierrc README.md components.json eslint.config.js package-lock.json package.json playwright.config.ts svelte.config.js tsconfig.json vite.config.ts vitest-setup-client.ts
CMakeLists.txt README-dev.md README.md chat-llama2.sh chat.mjs chat.sh server-common.cpp server-common.h server-context.cpp server-context.h server-http.cpp server-http.h server-models.cpp server-models.h server-queue.cpp server-queue.h server-task.cpp server-task.h server.cpp
tokenize CMakeLists.txt tokenize.cpp
tts CMakeLists.txt README.md convert_pt_to_hf.py tts-outetts.py tts.cpp
CMakeLists.txt
vendor
cpp-httplib CMakeLists.txt LICENSE httplib.cpp httplib.h
miniaudio miniaudio.h
nlohmann json.hpp json_fwd.hpp
sheredom subprocess.h
stb stb_image.h
.clang-format .clang-tidy .dockerignore .ecrc .editorconfig .flake8 .gitignore .gitmodules .pre-commit-config.yaml AGENTS.md AUTHORS CLAUDE.md CMakeLists.txt CMakePresets.json CODEOWNERS CONTRIBUTING.md LICENSE Makefile README.md SECURITY.md convert_hf_to_gguf.py convert_hf_to_gguf_update.py convert_llama_ggml_to_gguf.py convert_lora_to_gguf.py flake.lock flake.nix mypy.ini poetry.lock pyproject.toml pyrightconfig.json requirements.txt
maps map1.h map1.txt
papers 2310.11703v2.pdf 2405.14159v2.pdf
prompts lotr.h lotr.txt
.gitignore Dockerfile Makefile README.md compile_flags.txt context.c game.c makext.mk mapeditor.html maps.h minunit.h models.h models.txt nonstd.h npc.c termbox2.h vectordb.c vectordb.h
llama.cpp/src/llama-model.cpp raw
   1#include "llama-model.h"
   2
   3#include "llama-impl.h"
   4#include "llama-mmap.h"
   5#include "llama-cparams.h"
   6#include "llama-model-loader.h"
   7
   8#include "llama-kv-cache.h"
   9#include "llama-kv-cache-iswa.h"
  10#include "llama-memory-hybrid.h"
  11#include "llama-memory-hybrid-iswa.h"
  12#include "llama-memory-recurrent.h"
  13
  14#include "ggml-cpp.h"
  15
  16#include "models/models.h"
  17
  18#include <algorithm>
  19#include <cassert>
  20#include <cfloat>
  21#include <cstring>
  22#include <cmath>
  23#include <functional>
  24#include <map>
  25#include <regex>
  26#include <sstream>
  27#include <stdexcept>
  28
// Returns the human-readable size label for a model type (e.g. "7B", "8x22B",
// "30B.A3B" for MoE active-parameter variants). Unrecognized types map to "?B".
// The returned pointer is a string literal and never needs to be freed.
const char * llm_type_name(llm_type type) {
    switch (type) {
        case LLM_TYPE_14M:           return "14M";
        case LLM_TYPE_17M:           return "17M";
        case LLM_TYPE_22M:           return "22M";
        case LLM_TYPE_33M:           return "33M";
        case LLM_TYPE_47M:           return "47M";
        case LLM_TYPE_60M:           return "60M";
        case LLM_TYPE_70M:           return "70M";
        case LLM_TYPE_80M:           return "80M";
        case LLM_TYPE_109M:          return "109M";
        case LLM_TYPE_137M:          return "137M";
        case LLM_TYPE_140M:          return "140M";
        case LLM_TYPE_149M:          return "149M";
        case LLM_TYPE_160M:          return "160M";
        case LLM_TYPE_190M:          return "190M";
        case LLM_TYPE_220M:          return "220M";
        case LLM_TYPE_250M:          return "250M";
        case LLM_TYPE_256M:          return "256M";
        case LLM_TYPE_270M:          return "270M";
        case LLM_TYPE_335M:          return "335M";
        case LLM_TYPE_350M:          return "350M";
        case LLM_TYPE_360M:          return "360M";
        case LLM_TYPE_395M:          return "395M";
        case LLM_TYPE_410M:          return "410M";
        case LLM_TYPE_450M:          return "450M";
        case LLM_TYPE_475M:          return "475M";
        case LLM_TYPE_558M:          return "558M";
        case LLM_TYPE_700M:          return "700M";
        case LLM_TYPE_770M:          return "770M";
        case LLM_TYPE_780M:          return "780M";
        case LLM_TYPE_950M:          return "950M";
        case LLM_TYPE_0_3B:          return "0.3B";
        case LLM_TYPE_0_5B:          return "0.5B";
        case LLM_TYPE_0_6B:          return "0.6B";
        case LLM_TYPE_1B:            return "1B";
        case LLM_TYPE_1_2B:          return "1.2B";
        case LLM_TYPE_1_3B:          return "1.3B";
        case LLM_TYPE_1_4B:          return "1.4B";
        case LLM_TYPE_1_5B:          return "1.5B";
        case LLM_TYPE_1_6B:          return "1.6B";
        case LLM_TYPE_1_7B:          return "1.7B";
        case LLM_TYPE_1_8B:          return "1.8B";
        case LLM_TYPE_2B:            return "2B";
        case LLM_TYPE_2_6B:          return "2.6B";
        case LLM_TYPE_2_8B:          return "2.8B";
        case LLM_TYPE_2_9B:          return "2.9B";
        case LLM_TYPE_3B:            return "3B";
        case LLM_TYPE_4B:            return "4B";
        case LLM_TYPE_6B:            return "6B";
        case LLM_TYPE_6_9B:          return "6.9B";
        case LLM_TYPE_7B:            return "7B";
        case LLM_TYPE_8B:            return "8B";
        case LLM_TYPE_9B:            return "9B";
        case LLM_TYPE_11B:           return "11B";
        case LLM_TYPE_12B:           return "12B";
        case LLM_TYPE_13B:           return "13B";
        case LLM_TYPE_14B:           return "14B";
        case LLM_TYPE_15B:           return "15B";
        case LLM_TYPE_16B:           return "16B";
        case LLM_TYPE_20B:           return "20B";
        case LLM_TYPE_26B:           return "26B";
        case LLM_TYPE_27B:           return "27B";
        case LLM_TYPE_30B:           return "30B";
        case LLM_TYPE_32B:           return "32B";
        case LLM_TYPE_34B:           return "34B";
        case LLM_TYPE_35B:           return "35B";
        case LLM_TYPE_36B:           return "36B";
        case LLM_TYPE_40B:           return "40B";
        case LLM_TYPE_65B:           return "65B";
        case LLM_TYPE_70B:           return "70B";
        case LLM_TYPE_120B:          return "120B";
        case LLM_TYPE_142B:          return "142B";
        case LLM_TYPE_236B:          return "236B";
        case LLM_TYPE_290B:          return "290B";
        case LLM_TYPE_314B:          return "314B";
        case LLM_TYPE_405B:          return "405B";
        case LLM_TYPE_671B:          return "671B";
        // named sizes map to approximate parameter counts
        case LLM_TYPE_SMALL:         return "0.1B";
        case LLM_TYPE_MEDIUM:        return "0.4B";
        case LLM_TYPE_LARGE:         return "0.8B";
        case LLM_TYPE_XL:            return "1.5B";
        case LLM_TYPE_A1_7B:         return "A1.7B";
        case LLM_TYPE_A2_7B:         return "A2.7B";
        // mixture-of-experts layouts: <experts>x<expert size> or <total>.A<active>
        case LLM_TYPE_8x7B:          return "8x7B";
        case LLM_TYPE_8x22B:         return "8x22B";
        case LLM_TYPE_16x12B:        return "16x12B";
        case LLM_TYPE_16x3_8B:       return "16x3.8B";
        case LLM_TYPE_10B_128x3_66B: return "10B+128x3.66B";
        case LLM_TYPE_57B_A14B:      return "57B.A14B";
        case LLM_TYPE_17B_16E:       return "17Bx16E (Scout)";
        case LLM_TYPE_17B_128E:      return "17Bx128E (Maverick)";
        case LLM_TYPE_A13B:          return "A13B";
        case LLM_TYPE_7B_A1B:        return "7B.A1B";
        case LLM_TYPE_8B_A1B:        return "8B.A1B";
        case LLM_TYPE_16B_A1B:       return "16B.A1B";
        case LLM_TYPE_21B_A3B:       return "21B.A3B";
        case LLM_TYPE_30B_A3B:       return "30B.A3B";
        case LLM_TYPE_31B_A3_5B:     return "31B.A3.5B";
        case LLM_TYPE_35B_A3B:       return "35B.A3B";
        case LLM_TYPE_48B_A3B:       return "48B.A3B";
        case LLM_TYPE_80B_A3B:       return "80B.A3B";
        case LLM_TYPE_100B_A6B:      return "100B.A6B";
        case LLM_TYPE_102B_A12B:     return "102B.A12B";
        case LLM_TYPE_106B_A12B:     return "106B.A12B";
        case LLM_TYPE_196B_A11B:     return "196B.A11B";
        case LLM_TYPE_230B_A10B:     return "230B.A10B";
        case LLM_TYPE_235B_A22B:     return "235B.A22B";
        case LLM_TYPE_300B_A47B:     return "300B.A47B";
        case LLM_TYPE_310B_A15B:     return "310B.A15B";
        case LLM_TYPE_355B_A32B:     return "355B.A32B";
        case LLM_TYPE_E2B:           return "E2B";
        case LLM_TYPE_E4B:           return "E4B";
        default:                     return "?B";
    }
}
 145
 146static const char * llama_expert_gating_func_name(llama_expert_gating_func_type type) {
 147    switch (type) {
 148        case LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX: return "softmax";
 149        case LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID: return "sigmoid";
 150        default:                                    return "unknown";
 151    }
 152}
 153
// display names for the supported RoPE scaling modes, keyed by scaling type
// NOTE: LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED has no entry in this table
static const std::map<llama_rope_scaling_type, const char *> LLAMA_ROPE_SCALING_TYPES = {
    { LLAMA_ROPE_SCALING_TYPE_NONE,       "none"       },
    { LLAMA_ROPE_SCALING_TYPE_LINEAR,     "linear"     },
    { LLAMA_ROPE_SCALING_TYPE_YARN,       "yarn"       },
    { LLAMA_ROPE_SCALING_TYPE_LONGROPE,   "longrope"   },
};
 160
 161std::string llama_rope_scaling_type_name(llama_rope_scaling_type rope_scaling_type) {
 162    return LLAMA_ROPE_SCALING_TYPES.at(rope_scaling_type);
 163}
 164
 165static llama_rope_scaling_type llama_rope_scaling_type_from_string(const std::string & name) {
 166    for (const auto & kv : LLAMA_ROPE_SCALING_TYPES) {
 167        if (kv.second == name) {
 168            return (llama_rope_scaling_type) kv.first;
 169        }
 170    }
 171
 172    return LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED;
 173}
 174
// checks if the weight tensor can be used with the specified buffer type and device
//
// Builds a minimal dummy op node applying `op` to `w` - with placeholder input
// tensors using a fixed stand-in batch of 512 rows/tokens where one is needed -
// places `w` in a zero-size buffer of type `buft`, and asks `dev` whether it
// supports the resulting op. Throws std::runtime_error if the scratch ggml
// context cannot be created; aborts for ops with no test case below.
// Precondition: w->buffer must be null (the weight is not yet allocated).
static bool weight_buft_supported(const llama_hparams & hparams, ggml_tensor * w, ggml_op op, ggml_backend_buffer_type_t buft, ggml_backend_dev_t dev) {
    GGML_ASSERT(w != nullptr);

    // no op requested - any buffer type is acceptable
    if (op == GGML_OP_NONE) {
        return true;
    }

    // scratch context holding only tensor metadata (no_alloc: no data buffers)
    ggml_init_params params = {
        /*.mem_size   =*/ ggml_tensor_overhead()*8,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ true,
    };
    ggml_context_ptr ctx_ptr { ggml_init(params) };
    if (!ctx_ptr) {
        throw std::runtime_error(format("failed to create ggml context"));
    }
    ggml_context * ctx = ctx_ptr.get();

    ggml_tensor * op_tensor = nullptr;

    switch (op) {
        case GGML_OP_GET_ROWS:
            {
                ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 512);
                op_tensor = ggml_get_rows(ctx, w, b);
            } break;
        case GGML_OP_MUL_MAT:
            {
                ggml_tensor * b = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, w->ne[0], 512, w->ne[2], w->ne[3]);
                op_tensor = ggml_mul_mat(ctx, w, b);
            } break;
        case GGML_OP_MUL_MAT_ID:
            {
                int n_expert_used = hparams.n_expert_used;
                ggml_tensor * b = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, w->ne[0], n_expert_used, 512);
                ggml_tensor * ids = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, n_expert_used, 512);
                op_tensor = ggml_mul_mat_id(ctx, w, b, ids);
            } break;
        case GGML_OP_ADD:
            {
                ggml_tensor * a = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, w->ne[0], w->ne[1], w->ne[2], w->ne[3]);
                op_tensor = ggml_add(ctx, a, w);
            } break;
        case GGML_OP_ADD_ID:
            {
                int n_expert_used = hparams.n_expert_used;
                ggml_tensor * a = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, w->ne[0], n_expert_used, 512);
                ggml_tensor * c = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, n_expert_used, 512);
                op_tensor = ggml_add_id(ctx, a, w, c);
            } break;
        case GGML_OP_MUL:
            {
                ggml_tensor * a = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, w->ne[0], w->ne[1], w->ne[2], w->ne[3]);
                op_tensor = ggml_mul(ctx, a, w);
            } break;
        case GGML_OP_DIV:
            {
                ggml_tensor * a = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, w->ne[0]);
                op_tensor = ggml_div(ctx, a, w);
            } break;
        case GGML_OP_ROPE:
            {
                // w acts as the rope frequency-factors tensor here
                int n_embd_head = hparams.n_embd_head_v;
                int n_head = hparams.n_head();
                ggml_tensor * a = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, n_embd_head, n_head, 512);
                ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 512);
                op_tensor = ggml_rope_ext(
                    ctx, a, b, w,
                    0, 0, 0, 0, 0,
                    0, 0, 0, 0
                );

            } break;
        case GGML_OP_SSM_CONV:
            {
                const int64_t n_seq_tokens = 512;
                const int64_t n_seqs       = 3;
                ggml_tensor * conv_x = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, w->ne[0] - 1 + n_seq_tokens, w->ne[1], n_seqs);
                op_tensor = ggml_ssm_conv(ctx, conv_x, w);
            } break;
        case GGML_OP_SSM_SCAN:
            {
                // w is ssm_a, which is used to distinguish Mamba-1 and Mamba-2
                const int64_t d_state      = w->ne[0] == 1 ? hparams.ssm_d_state : w->ne[0];
                const int64_t n_head       = w->ne[1];
                const int64_t head_dim     = hparams.ssm_d_inner / n_head;
                const int64_t n_group      = hparams.ssm_n_group ? hparams.ssm_n_group : 1;
                const int64_t n_seq_tokens = 512;
                const int64_t n_seqs       = 3;
                ggml_tensor * s   = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, d_state, head_dim, n_head, n_seqs);
                ggml_tensor * x   = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, head_dim, n_head, n_seq_tokens, n_seqs);
                ggml_tensor * dt  = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, n_head, n_seq_tokens, n_seqs);
                ggml_tensor * B   = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, d_state, n_group, n_seq_tokens, n_seqs);
                ggml_tensor * C   = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, d_state, n_group, n_seq_tokens, n_seqs);
                ggml_tensor * ids = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n_seqs);
                op_tensor = ggml_ssm_scan(ctx, s, x, dt, w, B, C, ids);
            } break;
        case GGML_OP_RWKV_WKV6:
            {
                // FIXME -- arbitrary placeholder dimensions, not derived from hparams
                const int64_t S = 123;
                const int64_t H = 123;
                const int64_t n_tokens = 123;
                const int64_t n_seqs = 123;
                ggml_tensor  * k = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, S, H, n_tokens);
                ggml_tensor  * v = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, S, H, n_tokens);
                ggml_tensor  * r = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, S, H, n_tokens);
                ggml_tensor  * tf = w;
                ggml_tensor  * td = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, S, H, n_tokens);
                ggml_tensor  * state = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, S, n_seqs, S, H);
                op_tensor = ggml_rwkv_wkv6(ctx, k, v, r, tf, td, state);
            } break;
        case GGML_OP_IM2COL:
            {
                const int n_embd_inp = hparams.n_embd_inp();
                ggml_tensor * b = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, n_embd_inp, w->ne[1], 1, 1);
                op_tensor = ggml_im2col(ctx, w, b, 1, 0, 0, 0, 1, 0, false, GGML_TYPE_F16);
            } break;
        case GGML_OP_SCALE:
            {
                op_tensor = ggml_scale(ctx, w, 1.0f);
            } break;
        default:
            GGML_ABORT("%s: missing test for op %s for tensor %s", __func__, ggml_op_name(op), w->name);
    }

    // create a temporary dummy buffer for the weight so that supports_op can check the buffer type
    GGML_ASSERT(w->buffer == nullptr);
    w->buffer = ggml_backend_buft_alloc_buffer(buft, 0);
    bool op_supported = ggml_backend_dev_supports_op(dev, op_tensor);
    // restore the precondition: the weight remains unallocated after the probe
    ggml_backend_buffer_free(w->buffer);
    w->buffer = nullptr;

    return op_supported;
}
 311
 312// lists of buffer types used for each layer
 313using buft_list_t = std::vector<std::pair<ggml_backend_dev_t, ggml_backend_buffer_type_t>>;
 314
 315// find the first buffer type in the list that can use the tensor
 316static ggml_backend_buffer_type_t select_weight_buft(const llama_hparams & hparams, ggml_tensor * tensor, ggml_op op, const buft_list_t & buft_list) {
 317    GGML_ASSERT(!buft_list.empty());
 318    for (const auto & cur : buft_list) {
 319        ggml_backend_dev_t cur_dev = cur.first;
 320        ggml_backend_buffer_type_t cur_buft = cur.second;
 321        if (weight_buft_supported(hparams, tensor, op, cur_buft, cur_dev)) {
 322            return cur_buft;
 323        }
 324    }
 325
 326    return nullptr;
 327}
 328
 329// CPU: ACCEL -> GPU host -> CPU extra -> CPU
 330static buft_list_t make_cpu_buft_list(const std::vector<ggml_backend_dev_t> & devices, bool use_extra_bufts, bool no_host) {
 331    buft_list_t buft_list;
 332
 333    // add ACCEL buffer types
 334    for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
 335        ggml_backend_dev_t dev = ggml_backend_dev_get(i);
 336        if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_ACCEL) {
 337            auto * buft = ggml_backend_dev_buffer_type(dev);
 338            // skip
 339            if (buft != ggml_backend_cpu_buffer_type()) {
 340                buft_list.emplace_back(dev, buft);
 341            }
 342        }
 343    }
 344
 345    // add a host buffer type
 346    // storing the tensors in a host buffer is useful when the processing of large batches
 347    // is offloaded to a GPU device, since it reduces the time spent on data transfers
 348    // generally, this will be done using the first device in the list
 349    // a better approach would be to handle this on a weight-by-weight basis using the offload_op
 350    // function of the device to determine if it would benefit from being stored in a host buffer
 351    if (!no_host) {
 352        for (auto * dev : devices) {
 353            ggml_backend_buffer_type_t buft = ggml_backend_dev_host_buffer_type(dev);
 354            if (buft) {
 355                buft_list.emplace_back(dev, buft);
 356                break;
 357            }
 358        }
 359    }
 360
 361    // add extra buffer types
 362    if (use_extra_bufts) {
 363        auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
 364        if (cpu_dev == nullptr) {
 365            throw std::runtime_error(format("%s: no CPU backend found", __func__));
 366        }
 367
 368        auto * cpu_reg = ggml_backend_dev_backend_reg(cpu_dev);
 369        auto ggml_backend_dev_get_extra_bufts_fn = (ggml_backend_dev_get_extra_bufts_t)
 370            ggml_backend_reg_get_proc_address(cpu_reg, "ggml_backend_dev_get_extra_bufts");
 371        if (ggml_backend_dev_get_extra_bufts_fn) {
 372            ggml_backend_buffer_type_t * extra_bufts = ggml_backend_dev_get_extra_bufts_fn(cpu_dev);
 373            while (extra_bufts && *extra_bufts) {
 374                buft_list.emplace_back(cpu_dev, *extra_bufts);
 375                ++extra_bufts;
 376            }
 377        }
 378    }
 379
 380    // add the CPU buffer type
 381    for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
 382        ggml_backend_dev_t dev = ggml_backend_dev_get(i);
 383        if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_CPU) {
 384            buft_list.emplace_back(dev, ggml_backend_dev_buffer_type(dev));
 385        }
 386    }
 387
 388    return buft_list;
 389}
 390
 391// GPU: split if LLAMA_SPLIT_MODE_ROW -> GPU
 392static buft_list_t make_gpu_buft_list(ggml_backend_dev_t dev, llama_split_mode split_mode, const float * tensor_split) {
 393    buft_list_t buft_list;
 394
 395    // add the device split buffer type if requested and available
 396    if (split_mode == LLAMA_SPLIT_MODE_ROW) {
 397        ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(dev);
 398        auto ggml_backend_split_buffer_type_fn = (ggml_backend_split_buffer_type_t)
 399            ggml_backend_reg_get_proc_address(reg, "ggml_backend_split_buffer_type");
 400        if (ggml_backend_split_buffer_type_fn) {
 401            size_t dev_index = [&]() {
 402                auto * reg = ggml_backend_dev_backend_reg(dev);
 403                for (size_t i = 0; i < ggml_backend_reg_dev_count(reg); ++i) {
 404                    if (ggml_backend_reg_dev_get(reg, i) == dev) {
 405                        return i;
 406                    }
 407                }
 408                throw std::runtime_error(format("device %s not found in its backend reg", ggml_backend_dev_name(dev)));
 409            }();
 410            auto * buft = ggml_backend_split_buffer_type_fn(dev_index, tensor_split);
 411            if (buft != nullptr) {
 412                buft_list.emplace_back(dev, buft);
 413            }
 414        }
 415    }
 416
 417    // add the device default buffer type
 418    buft_list.emplace_back(dev, ggml_backend_dev_buffer_type(dev));
 419
 420    // add the device extra buffer type (if any)
 421    ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(dev);
 422    auto ggml_backend_dev_get_extra_bufts_fn = (ggml_backend_dev_get_extra_bufts_t)
 423        ggml_backend_reg_get_proc_address(reg, "ggml_backend_dev_get_extra_bufts");
 424
 425    if (ggml_backend_dev_get_extra_bufts_fn) {
 426        ggml_backend_buffer_type_t * extra_bufts = ggml_backend_dev_get_extra_bufts_fn(dev);
 427        while (extra_bufts && *extra_bufts) {
 428            buft_list.emplace_back(dev, *extra_bufts);
 429            ++extra_bufts;
 430        }
 431    }
 432
 433    return buft_list;
 434}
 435
// Private implementation state of llama_model (pimpl).
struct llama_model::impl {
    impl() = default;
    ~impl() = default;

    // element count reported by the model loader (see load_stats)
    uint64_t n_elements = 0;

    // byte count reported by the model loader (see load_stats)
    size_t n_bytes = 0;

    // model description string
    std::string desc_str;

    // model memory mapped files
    llama_mmaps mappings;

    // objects representing data potentially being locked in memory
    llama_mlocks mlock_bufs;
    llama_mlocks mlock_mmaps;

    // contexts where the model tensors metadata is stored as well as the corresponding buffers:
    std::vector<std::pair<ggml_context_ptr, std::vector<ggml_backend_buffer_ptr>>> ctxs_bufs;

    // prioritized buffer-type candidates: one list for CPU tensors,
    // one list per GPU device (built by make_cpu_buft_list / make_gpu_buft_list)
    buft_list_t cpu_buft_list;
    std::map<ggml_backend_dev_t, buft_list_t> gpu_buft_list;

    // a device together with the buffer-type candidates to try on it
    struct layer_dev {
        ggml_backend_dev_t dev;
        buft_list_t * buft_list;
    };

    // device placement for the input, the output, and each layer of the model
    layer_dev dev_input = {};
    layer_dev dev_output = {};
    std::vector<layer_dev> dev_layer;

    // true when the creation params contain at least one tensor buffer-type override
    bool has_tensor_overrides;
};
 470
 471llama_model::llama_model(const llama_model_params & params) : params(params), pimpl(std::make_unique<impl>()) {
 472    pimpl->has_tensor_overrides = params.tensor_buft_overrides && params.tensor_buft_overrides[0].pattern;
 473}
 474
 475llama_model::~llama_model() {
 476    for (auto * lora : loras) {
 477        delete lora;
 478    }
 479}
 480
// copy the element/byte counters gathered by the loader into the model state
void llama_model::load_stats(llama_model_loader & ml) {
    pimpl->n_elements = ml.n_elements;
    pimpl->n_bytes = ml.n_bytes;
}
 485
 486void llama_model::load_arch(llama_model_loader & ml) {
 487    arch = ml.get_arch();
 488    if (arch == LLM_ARCH_UNKNOWN) {
 489        throw std::runtime_error("unknown model architecture: '" + ml.get_arch_name() + "'");
 490    }
 491}
 492
 493void llama_model::load_hparams(llama_model_loader & ml) {
 494    const gguf_context * ctx = ml.meta.get();
 495
 496    // get metadata as string
 497    for (int i = 0; i < gguf_get_n_kv(ctx); i++) {
 498        gguf_type type = gguf_get_kv_type(ctx, i);
 499        if (type == GGUF_TYPE_ARRAY) {
 500            continue;
 501        }
 502        const char * name = gguf_get_key(ctx, i);
 503        const std::string value = gguf_kv_to_str(ctx, i);
 504        gguf_kv.emplace(name, value);
 505    }
 506
 507    // get general kv
 508    ml.get_key(LLM_KV_GENERAL_NAME, name, false);
 509
 510    // everything past this point is not vocab-related
 511    // for CLIP models, we only need to load tensors, no hparams
 512    if (hparams.vocab_only || ml.get_arch() == LLM_ARCH_CLIP) {
 513        return;
 514    }
 515
 516    ml.get_key(LLM_KV_CONTEXT_LENGTH,          hparams.n_ctx_train);
 517    ml.get_key(LLM_KV_EMBEDDING_LENGTH,        hparams.n_embd);
 518    ml.get_key(LLM_KV_EMBEDDING_LENGTH_OUT,    hparams.n_embd_out_impl, false);
 519    ml.get_key(LLM_KV_BLOCK_COUNT,             hparams.n_layer);
 520    ml.get_key(LLM_KV_EXPERT_COUNT,            hparams.n_expert,        false);
 521    ml.get_key(LLM_KV_EXPERT_USED_COUNT,       hparams.n_expert_used,   false);
 522    ml.get_key(LLM_KV_EXPERT_GROUP_COUNT,      hparams.n_expert_groups, false);
 523    ml.get_key(LLM_KV_EXPERT_GROUP_USED_COUNT, hparams.n_group_used,    false);
 524
 525    if (arch == LLM_ARCH_WAVTOKENIZER_DEC) {
 526        ml.get_key(LLM_KV_FEATURES_LENGTH,  hparams.n_embd);
 527        ml.get_key(LLM_KV_EMBEDDING_LENGTH, hparams.n_embd_out_impl);
 528
 529        ml.get_key(LLM_KV_POSNET_EMBEDDING_LENGTH, hparams.posnet.n_embd);
 530        ml.get_key(LLM_KV_POSNET_BLOCK_COUNT,      hparams.posnet.n_layer);
 531
 532        ml.get_key(LLM_KV_CONVNEXT_EMBEDDING_LENGTH, hparams.convnext.n_embd);
 533        ml.get_key(LLM_KV_CONVNEXT_BLOCK_COUNT,      hparams.convnext.n_layer);
 534    }
 535
 536    GGML_ASSERT(hparams.n_expert <= LLAMA_MAX_EXPERTS);
 537    GGML_ASSERT(hparams.n_expert_used <= hparams.n_expert);
 538    if (hparams.n_expert > 0) {
 539        GGML_ASSERT(hparams.n_expert_used > 0);
 540        GGML_ASSERT(hparams.n_expert_groups < hparams.n_expert);
 541        if (hparams.n_expert_groups > 1) {
 542            GGML_ASSERT(hparams.n_expert % hparams.n_expert_groups == 0);
 543            GGML_ASSERT(hparams.n_group_used > 0);
 544            GGML_ASSERT(hparams.n_group_used < hparams.n_expert_groups);
 545        }
 546    } else {
 547        GGML_ASSERT(hparams.n_expert_used == 0);
 548        GGML_ASSERT(hparams.n_expert_groups == 0);
 549    }
 550
 551    std::fill(hparams.n_head_arr.begin(),    hparams.n_head_arr.end(),    0);
 552    std::fill(hparams.n_head_kv_arr.begin(), hparams.n_head_kv_arr.end(), 0);
 553    std::fill(hparams.n_ff_arr.begin(),      hparams.n_ff_arr.end(),      0);
 554    std::fill(
 555        hparams.recurrent_layer_arr.begin(),
 556        hparams.recurrent_layer_arr.end(),
 557        llm_arch_is_recurrent(ml.get_arch()));
 558
 559    std::fill(hparams.rope_sections.begin(), hparams.rope_sections.end(), 0);
 560    std::fill(hparams.swa_layers.begin(), hparams.swa_layers.end(), 0);
 561
 562    std::fill(hparams.xielu_alpha_n.begin(), hparams.xielu_alpha_n.end(), 0.0f);
 563    std::fill(hparams.xielu_alpha_p.begin(), hparams.xielu_alpha_p.end(), 0.0f);
 564    std::fill(hparams.xielu_beta.begin(), hparams.xielu_beta.end(), 0.0f);
 565    std::fill(hparams.xielu_eps.begin(), hparams.xielu_eps.end(), 0.0f);
 566    std::fill(hparams.swiglu_clamp_exp.begin(),   hparams.swiglu_clamp_exp.end(),   0.0f);
 567    std::fill(hparams.swiglu_clamp_shexp.begin(), hparams.swiglu_clamp_shexp.end(), 0.0f);
 568
 569    ml.get_key_or_arr(LLM_KV_FEED_FORWARD_LENGTH,  hparams.n_ff_arr,   hparams.n_layer, false);
 570    ml.get_key_or_arr(LLM_KV_ATTENTION_HEAD_COUNT, hparams.n_head_arr, hparams.n_layer, false);
 571
 572    // n_head_kv is optional, default to n_head
 573    hparams.n_head_kv_arr = hparams.n_head_arr;
 574
 575    ml.get_key_or_arr(LLM_KV_ATTENTION_HEAD_COUNT_KV, hparams.n_head_kv_arr, hparams.n_layer, false);
 576
 577    bool rope_finetuned = false;
 578    ml.get_key(LLM_KV_ROPE_SCALING_FINETUNED, rope_finetuned, false);
 579    hparams.rope_finetuned = rope_finetuned;
 580
 581    hparams.n_ctx_orig_yarn = hparams.n_ctx_train;
 582    ml.get_key(LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, hparams.n_ctx_orig_yarn, false);
 583
 584    // rope_freq_base (optional)
 585    hparams.rope_freq_base_train = 10000.0f;
 586    ml.get_key(LLM_KV_ROPE_FREQ_BASE, hparams.rope_freq_base_train, false);
 587
 588    std::string rope_scaling("linear");
 589    ml.get_key(LLM_KV_ROPE_SCALING_TYPE, rope_scaling, false);
 590    hparams.rope_scaling_type_train = llama_rope_scaling_type_from_string(rope_scaling);
 591    GGML_ASSERT(hparams.rope_scaling_type_train != LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED);
 592
 593    // TODO: Handle SWA metadata similarly when models start implementing it
 594    // rope_freq_scale (inverse of the kv) is optional
 595    float ropescale = 0.0f;
 596    if (!ml.get_key(LLM_KV_ROPE_SCALING_FACTOR, ropescale, false)) {
 597        // try the old key name
 598        ml.get_key(LLM_KV_ROPE_SCALE_LINEAR, ropescale, false);
 599    }
 600    hparams.rope_freq_scale_train = ropescale == 0.0f ? 1.0f : 1.0f/ropescale;
 601
 602    ml.get_key(LLM_KV_ROPE_SCALING_ATTN_FACTOR, hparams.rope_attn_factor, false);
 603
 604    // non-transformer models do not have attention heads
 605    if (hparams.n_head() > 0) {
 606        // gpt-neox n_rot = rotary_pct * (n_embd / n_head)
 607        // gpt-j n_rot = rotary_dim
 608
 609        hparams.n_embd_head_k = hparams.n_embd / hparams.n_head();
 610        ml.get_key(LLM_KV_ATTENTION_KEY_LENGTH, hparams.n_embd_head_k, false);
 611
 612        hparams.n_embd_head_v = hparams.n_embd / hparams.n_head();
 613        ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH, hparams.n_embd_head_v, false);
 614
 615        // sanity check for n_rot (optional)
 616        hparams.n_rot = hparams.n_embd_head_k;
 617
 618        ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot, false);
 619
 620        if (arch == LLM_ARCH_LLAMA || arch == LLM_ARCH_DECI || arch == LLM_ARCH_FALCON || arch == LLM_ARCH_LLAMA_EMBED) {
 621            if (hparams.n_rot != hparams.n_embd_head_k) {
 622                throw std::runtime_error(format("invalid n_rot: %u, expected %u", hparams.n_rot, hparams.n_embd_head_k));
 623            }
 624        }
 625    } else {
 626        hparams.n_rot = 0;
 627        hparams.n_embd_head_k = 0;
 628        hparams.n_embd_head_v = 0;
 629    }
 630
 631    // for differentiating model types
 632    uint32_t n_vocab = 0;
 633    ml.get_key(LLM_KV_VOCAB_SIZE, n_vocab, false) || ml.get_arr_n(LLM_KV_TOKENIZER_LIST, n_vocab, false);
 634
 635    // for classifier models
 636    ml.get_arr(LLM_KV_CLASSIFIER_OUTPUT_LABELS, classifier_labels, false);
 637    if (!classifier_labels.empty()) {
 638        hparams.n_cls_out = classifier_labels.size();
 639    }
 640
 641    // arch-specific KVs
 642    switch (arch) {
 643        case LLM_ARCH_LLAMA:
 644        case LLM_ARCH_LLAMA_EMBED:
 645            {
 646                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
 647
 648                if (hparams.n_expert == 8) {
 649                    switch (hparams.n_layer) {
 650                        case 32: type = LLM_TYPE_8x7B; break;
 651                        case 56: type = LLM_TYPE_8x22B; break;
 652                        default: type = LLM_TYPE_UNKNOWN;
 653                    }
 654                } else {
 655                    switch (hparams.n_layer) {
 656                        case 16: type = LLM_TYPE_1B; break; // Llama 3.2 1B
 657                        case 22: type = LLM_TYPE_1B; break;
 658                        case 26: type = LLM_TYPE_3B; break;
 659                        case 28: type = LLM_TYPE_3B; break; // Llama 3.2 3B
 660                        case 30: type = LLM_TYPE_256M; break; // smoldocling 256M
 661                        // granite uses a vocab with len 49152
 662                        case 32: type = n_vocab == 49152 ? LLM_TYPE_3B : (n_vocab < 40000 ? LLM_TYPE_7B : LLM_TYPE_8B); break;
 663                        case 36: type = LLM_TYPE_8B; break; // granite
 664                        case 40: type = LLM_TYPE_13B; break;
 665                        case 48: type = LLM_TYPE_34B; break;
 666                        case 60: type = LLM_TYPE_30B; break;
 667                        case 80: type = hparams.n_head() == hparams.n_head_kv() ? LLM_TYPE_65B : LLM_TYPE_70B; break;
 668                        default: type = LLM_TYPE_UNKNOWN;
 669                    }
 670                }
 671            } break;
 672        case LLM_ARCH_LLAMA4:
 673            {
 674                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
 675                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH,  hparams.n_ff_exp);
 676                ml.get_key(LLM_KV_INTERLEAVE_MOE_LAYER_STEP,   hparams.n_moe_layer_step);
 677
 678                const bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
 679                if (found_swa && hparams.n_swa == 0) {
 680                    hparams.swa_type             = LLAMA_SWA_TYPE_NONE;
 681                    hparams.n_no_rope_layer_step = hparams.n_layer; // always use rope
 682                } else {
 683                    hparams.swa_type                = LLAMA_SWA_TYPE_CHUNKED;
 684                    hparams.n_swa                   = 8192;
 685                    hparams.n_attn_temp_floor_scale = 8192;
 686                    hparams.f_attn_temp_scale       = 0.1f;
 687                    hparams.f_attn_temp_offset      = 1.0f;
 688                    hparams.set_swa_pattern(4);   // pattern: 3 chunked - 1 full
 689
 690                    hparams.rope_freq_base_train_swa  = hparams.rope_freq_base_train;
 691                    hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
 692                    ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
 693                }
 694
 695                switch (hparams.n_expert) {
 696                    case 0: {
 697                        // MobileLLM (no MoE)
 698                        switch (hparams.n_embd) {
 699                            case 2048: type = LLM_TYPE_140M; break;
 700                            case 4096: type = LLM_TYPE_360M; break;
 701                            case 6144: type = LLM_TYPE_950M; break;
 702                            default:   type = LLM_TYPE_UNKNOWN;
 703                        }
 704                    } break;
 705                    case 16:  type = LLM_TYPE_17B_16E; break;
 706                    case 128: type = LLM_TYPE_17B_128E; break;
 707                    default:  type = LLM_TYPE_UNKNOWN;
 708                }
 709
 710                hparams.use_kq_norm = type != LLM_TYPE_17B_128E;
 711            } break;
 712        case LLM_ARCH_ARCEE:
 713            {
 714                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
 715
 716                // Arcee uses the same structure as Llama
 717                switch (hparams.n_layer) {
 718                    case 36: type = LLM_TYPE_4B; break;
 719                    default: type = LLM_TYPE_UNKNOWN;
 720                }
 721            } break;
 722        case LLM_ARCH_AFMOE:
 723            {
 724                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
 725                ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT,   hparams.n_layer_dense_lead);
 726                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH,  hparams.n_ff_exp);
 727                ml.get_key(LLM_KV_EXPERT_SHARED_COUNT,         hparams.n_expert_shared);
 728                ml.get_key(LLM_KV_EXPERT_GATING_FUNC,          hparams.expert_gating_func, false);
 729                ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE,        hparams.expert_weights_scale, false);
 730                ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM,         hparams.expert_weights_norm, false);
 731                ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW,    hparams.n_swa, false);
 732
 733                // Set up interleaved sliding window attention (ISWA)
 734                // Pattern: 3 sliding - 1 full (global_attn_every_n_layers = 4)
 735                if (hparams.n_swa > 0) {
 736                    hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
 737                    hparams.set_swa_pattern(4);
 738
 739                    hparams.rope_freq_base_train_swa  = hparams.rope_freq_base_train;
 740                    hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
 741                    ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
 742                } else {
 743                    hparams.swa_type = LLAMA_SWA_TYPE_NONE;
 744                }
 745
 746                // Default to sigmoid if not set
 747                if (hparams.expert_gating_func == LLAMA_EXPERT_GATING_FUNC_TYPE_NONE) {
 748                    hparams.expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID;
 749                }
 750
 751                switch (hparams.n_layer) {
 752                    case 56: type = LLM_TYPE_6B; break;
 753                    case 32: type = LLM_TYPE_26B; break;
 754                    default: type = LLM_TYPE_UNKNOWN;
 755                }
 756            } break;
 757        case LLM_ARCH_DECI:
 758            {
 759                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
 760                switch (hparams.n_layer) {
 761                    case 32: type = LLM_TYPE_7B; break;
 762                    case 80: type = LLM_TYPE_70B; break;
 763                    case 162: type = LLM_TYPE_405B; break;
 764                    default: type = LLM_TYPE_UNKNOWN;
 765                }
 766            } break;
 767        case LLM_ARCH_MINICPM:
 768            {
 769                // Backward-compatible defaults for older MiniCPM GGUFs
 770                hparams.f_embedding_scale = 12.0f;
 771                hparams.f_residual_scale  = 1.4f / sqrtf(float(hparams.n_layer));
 772                hparams.f_logit_scale     = hparams.n_embd ? (256.0f / float(hparams.n_embd)) : 1.0f;
 773
 774                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
 775
 776                // Optional KV reads, override defaults if present in newer GGUF exports
 777                ml.get_key(LLM_KV_EMBEDDING_SCALE, hparams.f_embedding_scale, /*required=*/false);
 778                ml.get_key(LLM_KV_RESIDUAL_SCALE, hparams.f_residual_scale, /*required=*/false);
 779                ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale, /*required=*/false);
 780
 781                // MiniCPM uses rope by default, unlike Granite which uses it as a switch
 782                hparams.rope_finetuned = true;
 783
 784                switch (hparams.n_layer) {
 785                    case 52: type = LLM_TYPE_1B; break;
 786                    case 40: type = LLM_TYPE_2B; break;
 787                    default: type = LLM_TYPE_UNKNOWN;
 788                }
 789            } break;
 790        case LLM_ARCH_MINICPM3:
 791            {
 792                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
 793                ml.get_key(LLM_KV_ATTENTION_Q_LORA_RANK,       hparams.n_lora_q);
 794                ml.get_key(LLM_KV_ATTENTION_KV_LORA_RANK,      hparams.n_lora_kv);
 795
 796                switch (hparams.n_layer) {
 797                    case 62: type = LLM_TYPE_4B; break;
 798                    default: type = LLM_TYPE_UNKNOWN;
 799                }
 800            } break;
 801        case LLM_ARCH_GROK:
 802            {
 803                // defaults for old GGUFs
 804                hparams.yarn_beta_fast = 8.0f;
 805                hparams.f_logit_scale = 0.5773502691896257f;
 806                hparams.f_embedding_scale = 78.38367176906169f;
 807                hparams.f_attn_out_scale = 0.08838834764831845f;
 808                hparams.f_attn_logit_softcapping = 30.0f;
 809                hparams.f_router_logit_softcapping = 30.0f;
 810                // no final_logit_softcapping in grok-1
 811                hparams.f_final_logit_softcapping = 0.0f;
 812
 813                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS,  hparams.f_norm_rms_eps);
 814                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH,   hparams.n_ff_exp, false);
 815                ml.get_key(LLM_KV_LOGIT_SCALE,                  hparams.f_logit_scale, false);
 816                ml.get_key(LLM_KV_EMBEDDING_SCALE,              hparams.f_embedding_scale, false);
 817                ml.get_key(LLM_KV_ATTENTION_OUTPUT_SCALE,       hparams.f_attn_out_scale, false);
 818                ml.get_key(LLM_KV_ATTN_LOGIT_SOFTCAPPING,       hparams.f_attn_logit_softcapping, false);
 819                ml.get_key(LLM_KV_ROUTER_LOGIT_SOFTCAPPING,     hparams.f_router_logit_softcapping, false);
 820                ml.get_key(LLM_KV_FINAL_LOGIT_SOFTCAPPING,      hparams.f_final_logit_softcapping, false);
 821
 822                ml.get_key(LLM_KV_ATTENTION_TEMPERATURE_LENGTH,  hparams.attn_temp_length, false);
 823                ml.get_key(LLM_KV_ROPE_SCALING_YARN_EXT_FACTOR,  hparams.yarn_ext_factor, false);
 824                ml.get_key(LLM_KV_ROPE_SCALING_YARN_ATTN_FACTOR, hparams.yarn_attn_factor, false);
 825                ml.get_key(LLM_KV_ROPE_SCALING_YARN_BETA_FAST,   hparams.yarn_beta_fast, false);
 826                ml.get_key(LLM_KV_ROPE_SCALING_YARN_BETA_SLOW,   hparams.yarn_beta_slow, false);
 827
 828                switch (hparams.n_layer) {
 829                    case 64: type = LLM_TYPE_314B; break;
 830                    default: type = LLM_TYPE_UNKNOWN;
 831                }
 832            } break;
 833        case LLM_ARCH_FALCON:
 834            {
 835                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
 836
 837                switch (hparams.n_layer) {
 838                    case 32: type = LLM_TYPE_7B; break;
 839                    case 60: type = LLM_TYPE_40B; break;
 840                    default: type = LLM_TYPE_UNKNOWN;
 841                }
 842            } break;
 843        case LLM_ARCH_BAICHUAN:
 844            {
 845                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
 846                switch (hparams.n_layer) {
 847                    case 32: type = LLM_TYPE_7B; break;
 848                    case 40: type = LLM_TYPE_13B; break;
 849                    default: type = LLM_TYPE_UNKNOWN;
 850                }
 851
 852                if (type == LLM_TYPE_13B) {
 853                    // TODO: become GGUF KV parameter
 854                    hparams.f_max_alibi_bias = 8.0f;
 855                }
 856            } break;
 857        case LLM_ARCH_STARCODER:
 858            {
 859                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
 860                switch (hparams.n_layer) {
 861                    case 24: type = LLM_TYPE_1B; break;
 862                    case 36: type = LLM_TYPE_3B; break;
 863                    case 42: type = LLM_TYPE_7B; break;
 864                    case 40: type = LLM_TYPE_15B; break;
 865                    default: type = LLM_TYPE_UNKNOWN;
 866                }
 867            } break;
 868        case LLM_ARCH_REFACT:
 869            {
 870                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
 871                switch (hparams.n_layer) {
 872                    case 32: type = LLM_TYPE_1B; break;
 873                    default: type = LLM_TYPE_UNKNOWN;
 874                }
 875
 876                // TODO: become GGUF KV parameter
 877                hparams.f_max_alibi_bias = 8.0f;
 878            } break;
 879        case LLM_ARCH_BERT:
 880            {
 881                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS,    hparams.f_norm_eps);
 882                ml.get_key(LLM_KV_ATTENTION_CAUSAL,           hparams.causal_attn);
 883                ml.get_key(LLM_KV_POOLING_TYPE,               hparams.pooling_type, false);
 884
 885                switch (hparams.n_layer) {
 886                    case 3:
 887                        type = LLM_TYPE_17M; break; // bge-micro
 888                    case 6:
 889                        type = LLM_TYPE_22M; break; // MiniLM-L6
 890                    case 12:
 891                        switch (hparams.n_embd) {
 892                            case 384: type = LLM_TYPE_33M; break; // MiniLM-L12, bge-small
 893                            case 768: type = LLM_TYPE_109M; break; // bge-base
 894                            default: type = LLM_TYPE_UNKNOWN;
 895                        } break;
 896                    case 24:
 897                        type = LLM_TYPE_335M; break; // bge-large
 898                    default: type = LLM_TYPE_UNKNOWN;
 899                }
 900            } break;
 901        case LLM_ARCH_MODERN_BERT:
 902            {
 903                const bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
 904                if (found_swa && hparams.n_swa > 0) {
 905                    uint32_t swa_period = 3;
 906                    hparams.swa_type = LLAMA_SWA_TYPE_SYMMETRIC;
 907
 908                    ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa);
 909                    ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, swa_period, false);
 910                    hparams.set_swa_pattern(swa_period);
 911                } else {
 912                    hparams.swa_type = LLAMA_SWA_TYPE_NONE;
 913                }
 914
 915                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
 916                ml.get_key(LLM_KV_ATTENTION_CAUSAL,        hparams.causal_attn);
 917                ml.get_key(LLM_KV_POOLING_TYPE,            hparams.pooling_type, false);
 918
 919                switch (hparams.n_layer) {
 920                    case 12:
 921                        type = LLM_TYPE_47M; break; // granite-embedding-small
 922                    case 22:
 923                        type = LLM_TYPE_149M; break; // modern-bert-base
 924                    case 28:
 925                        type = LLM_TYPE_395M; break; // modern-bert-large
 926                    default: type = LLM_TYPE_UNKNOWN;
 927                }
 928            } break;
 929        case LLM_ARCH_JINA_BERT_V2:
 930            {
 931                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS,    hparams.f_norm_eps);
 932                ml.get_key(LLM_KV_ATTENTION_CAUSAL,           hparams.causal_attn);
 933                ml.get_key(LLM_KV_POOLING_TYPE,               hparams.pooling_type, false);
 934                hparams.f_max_alibi_bias = 8.0f;
 935
 936                switch (hparams.n_layer) {
 937                    case 4:  type = LLM_TYPE_33M;  break; // jina-embeddings-small
 938                    case 12: type = LLM_TYPE_137M; break; // jina-embeddings-base
 939                    default: type = LLM_TYPE_UNKNOWN;
 940                }
 941            } break;
 942        case LLM_ARCH_JINA_BERT_V3:
 943            {
 944                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS,    hparams.f_norm_eps);
 945                ml.get_key(LLM_KV_ATTENTION_CAUSAL,           hparams.causal_attn);
 946                ml.get_key(LLM_KV_POOLING_TYPE,               hparams.pooling_type, false);
 947
 948                switch (hparams.n_layer) {
 949                    case 24:
 950                        type = LLM_TYPE_558M; break;
 951                    default: type = LLM_TYPE_UNKNOWN;
 952                }
 953            } break;
 954        case LLM_ARCH_NOMIC_BERT:
 955        case LLM_ARCH_NOMIC_BERT_MOE:
 956            {
 957                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS,    hparams.f_norm_eps);
 958                ml.get_key(LLM_KV_ATTENTION_CAUSAL,           hparams.causal_attn);
 959                ml.get_key(LLM_KV_POOLING_TYPE,               hparams.pooling_type);
 960                ml.get_key(LLM_KV_MOE_EVERY_N_LAYERS,         hparams.moe_every_n_layers, 0);
 961
 962                if (hparams.n_layer == 12 && hparams.n_embd == 768) {
 963                    if (arch == LLM_ARCH_NOMIC_BERT) {
 964                        type = LLM_TYPE_137M;
 965                    } else if (arch == LLM_ARCH_NOMIC_BERT_MOE && hparams.moe_every_n_layers == 2) {
 966                        type = LLM_TYPE_475M;
 967                    }
 968                }
 969            } break;
 970        case LLM_ARCH_NEO_BERT:
 971            {
 972                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
 973                ml.get_key(LLM_KV_ATTENTION_CAUSAL,            hparams.causal_attn);
 974                ml.get_key(LLM_KV_POOLING_TYPE,                hparams.pooling_type);
 975
 976                if (hparams.n_layer == 28) {
 977                    type = LLM_TYPE_250M;
 978                }
 979            } break;
 980        case LLM_ARCH_BLOOM:
 981            {
 982                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
 983
 984                switch (hparams.n_layer) {
 985                    case 24: type = LLM_TYPE_1B; break;
 986                    case 30:
 987                        switch (hparams.n_embd) {
 988                            case 2560: type = LLM_TYPE_3B; break;
 989                            case 4096: type = LLM_TYPE_7B; break;
 990                            default: type = LLM_TYPE_UNKNOWN;
 991                        } break;
 992                    default: type = LLM_TYPE_UNKNOWN;
 993                }
 994
 995                // TODO: become GGUF KV parameter
 996                hparams.f_max_alibi_bias = 8.0f;
 997            } break;
 998        case LLM_ARCH_MPT:
 999            {
1000                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS,  hparams.f_norm_eps);
1001                ml.get_key(LLM_KV_ATTENTION_CLAMP_KQV,      hparams.f_clamp_kqv, false);
1002                ml.get_key(LLM_KV_ATTENTION_MAX_ALIBI_BIAS, hparams.f_max_alibi_bias);
1003
1004                switch (hparams.n_layer) {
1005                    case 32: type = LLM_TYPE_7B; break;
1006                    case 48: type = LLM_TYPE_30B; break;
1007                    default: type = LLM_TYPE_UNKNOWN;
1008                }
1009            } break;
1010        case LLM_ARCH_STABLELM:
1011            {
1012                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
1013
1014                switch (hparams.n_layer) {
1015                    case 24: type = LLM_TYPE_1B; break;
1016                    case 32: type = LLM_TYPE_3B; break;
1017                    case 40: type = LLM_TYPE_12B; break;
1018                    default: type = LLM_TYPE_UNKNOWN;
1019               }
1020            } break;
1021        case LLM_ARCH_QWEN:
1022            {
1023                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1024
1025                switch (hparams.n_layer) {
1026                    case 32: type = LLM_TYPE_7B; break;
1027                    case 40: type = LLM_TYPE_13B; break;
1028                    default: type = LLM_TYPE_UNKNOWN;
1029                }
1030            } break;
1031        case LLM_ARCH_QWEN2VL:
1032            {
1033                ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS, hparams.rope_sections, 4, true);
1034            }
1035            // fall through
1036        case LLM_ARCH_QWEN2:
1037            {
1038                ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
1039                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1040                switch (hparams.n_layer) {
1041                    case 24: type = hparams.n_embd == 1024 ? LLM_TYPE_0_5B : LLM_TYPE_1B; break;
1042                    case 28: type = hparams.n_embd == 1536 ? LLM_TYPE_1_5B : LLM_TYPE_7B; break;
1043                    case 32: type = LLM_TYPE_7B; break;
1044                    case 36: type = LLM_TYPE_3B; break;
1045                    case 40: type = hparams.n_head() == 20 ? LLM_TYPE_4B : LLM_TYPE_13B; break;
1046                    case 48: type = LLM_TYPE_14B; break;
1047                    case 64: type = LLM_TYPE_32B; break;
1048                    case 80: type = LLM_TYPE_70B; break;
1049                    default: type = LLM_TYPE_UNKNOWN;
1050                }
1051            } break;
1052        case LLM_ARCH_DREAM:
1053            {
1054                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1055                // Dream models are primarily 7B with 28 layers
1056                switch (hparams.n_layer) {
1057                    case 28:
1058                        type = LLM_TYPE_7B;
1059                        break;
1060                    default:
1061                        type = LLM_TYPE_UNKNOWN;
1062                }
1063                // Set non-causal attention for diffusion models
1064                hparams.causal_attn = false;
1065            }
1066            break;
1067        case LLM_ARCH_LLADA:
1068            {
1069                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1070                // LLaDA-8B has 32 layers, similar to LLaMA but for diffusion
1071                switch (hparams.n_layer) {
1072                    case 32:
1073                        type = LLM_TYPE_8B;
1074                        break;
1075                    default:
1076                        type = LLM_TYPE_UNKNOWN;
1077                }
1078                // Set non-causal attention for diffusion models
1079                hparams.causal_attn = false;
1080            }
1081            break;
1082        case LLM_ARCH_LLADA_MOE:
1083            {
1084                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false);
1085
1086                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1087                // diffusion language model uses non-causal attention
1088                hparams.causal_attn = false;
1089                switch (hparams.n_layer) {
1090                    case 16: type = LLM_TYPE_A1_7B; break;
1091                    default: type = LLM_TYPE_UNKNOWN;
1092                }
1093            } break;
1094        case LLM_ARCH_RND1:
1095            {
1096                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false);
1097
1098                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1099                switch (hparams.n_layer) {
1100                    case 48: type = LLM_TYPE_30B_A3B; break;
1101                    default: type = LLM_TYPE_UNKNOWN;
1102                }
1103                // Set non-causal attention for diffusion models
1104                hparams.causal_attn = false;
1105            } break;
        case LLM_ARCH_QWEN2MOE:
            {
                // optional MoE widths: routed-expert FFN and shared-expert FFN
                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH,        hparams.n_ff_exp, false);
                ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, false);

                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                switch (hparams.n_layer) {
                    case 24: type = LLM_TYPE_A2_7B; break;
                    case 28: type = LLM_TYPE_57B_A14B; break;
                    default: type = LLM_TYPE_UNKNOWN;
                }
            } break;
1118        case LLM_ARCH_QWEN3:
1119            {
1120                ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
1121                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1122                switch (hparams.n_layer) {
1123                    case 28: type = hparams.n_embd == 1024 ? LLM_TYPE_0_6B : LLM_TYPE_1_7B; break;
1124                    case 36: type = hparams.n_embd == 2560 ? LLM_TYPE_4B : LLM_TYPE_8B; break;
1125                    case 40: type = LLM_TYPE_14B; break;
1126                    case 64: type = LLM_TYPE_32B; break;
1127                    default: type = LLM_TYPE_UNKNOWN;
1128                }
1129            } break;
        case LLM_ARCH_MAINCODER:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                // only the 1B (32-layer) variant is recognized
                switch (hparams.n_layer) {
                    case 32: type = LLM_TYPE_1B; break;
                    default: type = LLM_TYPE_UNKNOWN;
                }
            } break;
        case LLM_ARCH_QWEN3VL:
            {
                // optional: number of deepstack layers for the vision pathway
                ml.get_key(LLM_KV_NUM_DEEPSTACK_LAYERS, hparams.n_deepstack_layers, false);
                // rope dimension sections: exactly 4 entries, required for this arch
                ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS, hparams.rope_sections, 4, true);
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                switch (hparams.n_layer) {
                    case 28: type = LLM_TYPE_1_7B; break;
                    // 36 layers is ambiguous: 4B vs 8B differ in embedding width
                    case 36: type = hparams.n_embd == 2560 ? LLM_TYPE_4B : LLM_TYPE_8B; break;
                    case 64: type = LLM_TYPE_32B; break;
                    default: type = LLM_TYPE_UNKNOWN;
                }
            } break;
1150        case LLM_ARCH_QWEN3MOE:
1151            {
1152                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH,        hparams.n_ff_exp, false);
1153
1154                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1155                switch (hparams.n_layer) {
1156                    case 48: type = LLM_TYPE_30B_A3B; break;
1157                    case 94: type = LLM_TYPE_235B_A22B; break;
1158                    default: type = LLM_TYPE_UNKNOWN;
1159                }
1160            } break;
        case LLM_ARCH_QWEN3VLMOE:
            {
                // optional: number of deepstack layers for the vision pathway
                ml.get_key(LLM_KV_NUM_DEEPSTACK_LAYERS, hparams.n_deepstack_layers, false);
                // rope dimension sections: exactly 4 entries, required for this arch
                ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS, hparams.rope_sections, 4, true);
                // optional: per-expert FFN width
                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false);
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                switch (hparams.n_layer) {
                    case 48: type = LLM_TYPE_30B_A3B; break;
                    case 94: type = LLM_TYPE_235B_A22B; break;
                    default: type = LLM_TYPE_UNKNOWN;
                }
            } break;
1173        case LLM_ARCH_PHI2:
1174            {
1175                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
1176
1177                switch (hparams.n_layer) {
1178                    case 24: type = LLM_TYPE_1B; break;
1179                    case 32: type = LLM_TYPE_3B; break;
1180                    default: type = LLM_TYPE_UNKNOWN;
1181                }
1182            } break;
        case LLM_ARCH_PHI3:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);

                switch (hparams.n_layer) {
                    case 24: type = LLM_TYPE_1B; break;
                    case 32: type = LLM_TYPE_3B; break;
                    case 40: type = LLM_TYPE_14B; break;
                    default: type = LLM_TYPE_UNKNOWN;
                }

                // sliding-window key is optional in the metadata
                const bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);

                if (found_swa && hparams.n_swa > 0) {
                    LLAMA_LOG_WARN("%s: Phi SWA is currently disabled - results might be suboptimal for some models (see %s)\n",
                            __func__, "https://github.com/ggml-org/llama.cpp/pull/13676");

                    // TODO: fix conversion scripts to correctly populate `n_swa` and `n_swa_pattern`
                    // SWA is deliberately forced off here even when the model declares a window
                    // (see the PR linked in the warning above)
                    hparams.swa_type = LLAMA_SWA_TYPE_NONE;

                    hparams.n_swa         = 0;
                    hparams.set_swa_pattern(1);
                }
            } break;
        case LLM_ARCH_PHIMOE:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);

                // 32 layers -> 16x3.8B MoE; anything else is unrecognized
                switch (hparams.n_layer) {
                    case 32: type = LLM_TYPE_16x3_8B; break;
                    default: type = LLM_TYPE_UNKNOWN;
                }
            } break;
1216        case LLM_ARCH_PLAMO:
1217            {
1218                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1219
1220                switch (hparams.n_layer) {
1221                    case 40: type = LLM_TYPE_13B; break;
1222                    default: type = LLM_TYPE_UNKNOWN;
1223               }
1224            } break;
        case LLM_ARCH_PLAMO2:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);

                // Load Mamba SSM parameters
                ml.get_key(LLM_KV_SSM_CONV_KERNEL,    hparams.ssm_d_conv);
                ml.get_key(LLM_KV_SSM_INNER_SIZE,     hparams.ssm_d_inner);
                ml.get_key(LLM_KV_SSM_STATE_SIZE,     hparams.ssm_d_state);
                ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank);
                ml.get_key(LLM_KV_SSM_GROUP_COUNT,    hparams.ssm_n_group);

                // hybrid model: layers with zero KV heads are treated as the recurrent (SSM) layers
                for (uint32_t i = 0; i < hparams.n_layer; ++i) {
                    hparams.recurrent_layer_arr[i] = hparams.n_head_kv(i) == 0;
                }

                // 32 layers is ambiguous: disambiguate 2B vs 8B on embedding width
                // NOTE(review): a 32-layer model with any other n_embd leaves `type` unchanged — confirm intended
                switch (hparams.n_layer) {
                    case 16: type = LLM_TYPE_1B; break;
                    case 32:
                        if (hparams.n_embd == 2048) {
                            type = LLM_TYPE_2B;
                        } else if (hparams.n_embd == 4096) {
                            type = LLM_TYPE_8B;
                        }
                        break;
                    default: type = LLM_TYPE_UNKNOWN;
                }

                // Load attention parameters (optional; existing defaults kept when keys are absent)
                ml.get_key(LLM_KV_ATTENTION_KEY_LENGTH,   hparams.n_embd_head_k, false);
                ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH, hparams.n_embd_head_v, false);
            } break;
        case LLM_ARCH_PLAMO3:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                // SWA is enabled only when the (optional) window key is present and non-zero
                const bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
                if (found_swa && hparams.n_swa > 0) {
                    uint32_t swa_period = 8; // default pattern period when the key is absent
                    hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
                    // SWA layers use their own RoPE base frequency (required once SWA is on)
                    ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa);
                    ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, swa_period, false);
                    hparams.set_swa_pattern(swa_period);
                } else {
                    hparams.swa_type = LLAMA_SWA_TYPE_NONE;
                }

                switch (hparams.n_layer) {
                    case 24: type = LLM_TYPE_2B; break;
                    default: type = LLM_TYPE_UNKNOWN;
                }
            } break;
1275        case LLM_ARCH_GPT2:
1276            {
1277                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
1278                switch (hparams.n_layer) {
1279                    case 12: type = LLM_TYPE_SMALL; break;
1280                    case 24: type = LLM_TYPE_MEDIUM; break;
1281                    case 36: type = LLM_TYPE_LARGE; break;
1282                    case 48: type = LLM_TYPE_XL; break;
1283                    default: type = LLM_TYPE_UNKNOWN;
1284                }
1285            } break;
        case LLM_ARCH_CODESHELL:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
                // only the 7B (42-layer) variant is recognized
                switch (hparams.n_layer) {
                    case 42: type = LLM_TYPE_7B; break;
                    default: type = LLM_TYPE_UNKNOWN;
                }
            } break;
        case LLM_ARCH_ORION:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);

                // only the 14B (40-layer) variant is recognized
                switch (hparams.n_layer) {
                    case 40: type = LLM_TYPE_14B; break;
                    default: type = LLM_TYPE_UNKNOWN;
                }
            } break;
        case LLM_ARCH_INTERNLM2:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                // size keyed on depth: 32 -> 7B, 48 -> 20B
                switch (hparams.n_layer) {
                    case 32: type = LLM_TYPE_7B; break;
                    case 48: type = LLM_TYPE_20B; break;
                    default: type = LLM_TYPE_UNKNOWN;
                }
            } break;
1312        case LLM_ARCH_GEMMA:
1313            {
1314                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1315
1316                switch (hparams.n_layer) {
1317                    case 18: type = LLM_TYPE_2B; break;
1318                    case 28: type = LLM_TYPE_7B; break;
1319                    default: type = LLM_TYPE_UNKNOWN;
1320               }
1321            } break;
        case LLM_ARCH_GEMMA2:
            {
                // Gemma 2 interleaves sliding-window and full-attention layers with period 2
                hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
                hparams.n_swa = 4096; // default value of gemma 2
                hparams.set_swa_pattern(2);
                hparams.attn_soft_cap = true;
                // default the SWA RoPE parameters to the regular ones; the optional
                // key below may override the base frequency only
                hparams.rope_freq_base_train_swa  = hparams.rope_freq_base_train;
                hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;

                ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA,          hparams.rope_freq_base_train_swa, false);
                ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW,    hparams.n_swa, false);
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                ml.get_key(LLM_KV_ATTN_LOGIT_SOFTCAPPING,      hparams.f_attn_logit_softcapping, false);
                ml.get_key(LLM_KV_FINAL_LOGIT_SOFTCAPPING,     hparams.f_final_logit_softcapping, false);

                switch (hparams.n_layer) {
                    case 26: type = LLM_TYPE_2B; break;
                    case 42: type = LLM_TYPE_9B; break;
                    case 46: type = LLM_TYPE_27B; break;
                    default: type = LLM_TYPE_UNKNOWN;
               }

                // ref: https://github.com/google/gemma_pytorch/blob/014acb7ac4563a5f77c76d7ff98f31b568c16508/gemma/config.py#L173
                // the 27B model scales attention by 1/sqrt(n_embd/n_head); others by 1/sqrt(n_embd_head_k)
                hparams.f_attention_scale = type == LLM_TYPE_27B
                    ? 1.0f / std::sqrt(float(hparams.n_embd / hparams.n_head(0)))
                    : 1.0f / std::sqrt(float(hparams.n_embd_head_k));
            } break;
        case LLM_ARCH_GEMMA3:
            {
                // SWA is enabled only when the (optional) window key is present and non-zero
                const bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
                if (found_swa && hparams.n_swa > 0) {
                    // interleaved SWA/full-attention layers with period 6
                    hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
                    hparams.set_swa_pattern(6);

                    ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
                } else {
                    hparams.swa_type = LLAMA_SWA_TYPE_NONE;
                }

                // final-logit softcapping defaults to off (0.0) unless the model sets it
                hparams.f_final_logit_softcapping = 0.0f;
                ml.get_key(LLM_KV_FINAL_LOGIT_SOFTCAPPING, hparams.f_final_logit_softcapping, false);
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);

                switch (hparams.n_layer) {
                    case 18: type = LLM_TYPE_270M; break;
                    case 26: type = LLM_TYPE_1B; break;
                    case 32: type = LLM_TYPE_8B; break; // Rnj-1
                    case 34: type = LLM_TYPE_4B; break;
                    case 48: type = LLM_TYPE_12B; break;
                    case 62: type = LLM_TYPE_27B; break;
                    default: type = LLM_TYPE_UNKNOWN;
                }

                // ref: https://github.com/google/gemma_pytorch/blob/014acb7ac4563a5f77c76d7ff98f31b568c16508/gemma/config.py#L289
                // the 27B model scales attention by 1/sqrt(n_embd/n_head); others by 1/sqrt(n_embd_head_k)
                hparams.f_attention_scale = type == LLM_TYPE_27B
                    ? 1.0f / std::sqrt(float(hparams.n_embd / hparams.n_head(0)))
                    : 1.0f / std::sqrt(float(hparams.n_embd_head_k));
            } break;
        case LLM_ARCH_GEMMA3N:
            {
                // interleaved SWA/full-attention layers with period 5
                hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
                hparams.set_swa_pattern(5);

                // fixed hyperparameters for Gemma 3n (not read from metadata):
                // KV cache is allocated only for the first 20 layers; unit attention scale
                hparams.n_layer_kv_from_start     = 20;
                hparams.f_attention_scale         = 1.0f;

                // sliding window size is required here (unlike GEMMA3, where it is optional)
                ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA,          hparams.rope_freq_base_train_swa, false);
                ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW,    hparams.n_swa);
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);

                switch (hparams.n_layer) {
                    case 30: type = LLM_TYPE_E2B; break;
                    case 35: type = LLM_TYPE_E4B; break;
                    default: type = LLM_TYPE_UNKNOWN;
                }
            } break;
        case LLM_ARCH_GEMMA_EMBEDDING:
            {
                // symmetric (bidirectional) sliding window, pattern period 6
                hparams.swa_type = LLAMA_SWA_TYPE_SYMMETRIC;
                hparams.set_swa_pattern(6);

                hparams.causal_attn = false; // embeddings do not use causal attention

                // sliding window and pooling type are required for this arch
                ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
                ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);

                //applied only if model converted with --sentence-transformers-dense-modules
                ml.get_key(LLM_KV_DENSE_2_FEAT_IN, hparams.dense_2_feat_in, false);
                ml.get_key(LLM_KV_DENSE_2_FEAT_OUT, hparams.dense_2_feat_out, false);
                ml.get_key(LLM_KV_DENSE_3_FEAT_IN, hparams.dense_3_feat_in, false);
                ml.get_key(LLM_KV_DENSE_3_FEAT_OUT, hparams.dense_3_feat_out, false);

                // when the dense adapters are present (non-zero), their boundary
                // dimensions must match the model embedding size
                GGML_ASSERT((hparams.dense_2_feat_in == 0 || hparams.dense_2_feat_in == hparams.n_embd) && "dense_2_feat_in must be equal to n_embd");
                GGML_ASSERT((hparams.dense_3_feat_out == 0 || hparams.dense_3_feat_out == hparams.n_embd) && "dense_3_feat_out must be equal to n_embd");

                switch (hparams.n_layer) {
                    case 24: type = LLM_TYPE_0_3B; break;
                    default: type = LLM_TYPE_UNKNOWN;
                }
                hparams.f_attention_scale = 1.0f / std::sqrt(float(hparams.n_embd_head_k));

            } break;
1426        case LLM_ARCH_STARCODER2:
1427            {
1428                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
1429                switch (hparams.n_layer) {
1430                    case 30: type = LLM_TYPE_3B; break;
1431                    case 32: type = LLM_TYPE_7B; break;
1432                    case 40: type = LLM_TYPE_15B; break;
1433                    case 52: type = LLM_TYPE_20B; break; // granite
1434                    case 88: type = LLM_TYPE_34B; break; // granite
1435                    default: type = LLM_TYPE_UNKNOWN;
1436                }
1437            } break;
        case LLM_ARCH_MAMBA:
            {
                // SSM parameters (conv kernel, inner/state sizes, dt rank are required)
                ml.get_key(LLM_KV_SSM_CONV_KERNEL,    hparams.ssm_d_conv);
                ml.get_key(LLM_KV_SSM_INNER_SIZE,     hparams.ssm_d_inner);
                ml.get_key(LLM_KV_SSM_STATE_SIZE,     hparams.ssm_d_state);
                ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank);
                ml.get_key(LLM_KV_SSM_DT_B_C_RMS,     hparams.ssm_dt_b_c_rms, false);

                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);

                // size table keyed on n_layer, disambiguated by embedding width
                switch (hparams.n_layer) {
                    case 24:
                        switch (hparams.n_embd) {
                            case 768: type = LLM_TYPE_SMALL; break;
                            default: type = LLM_TYPE_UNKNOWN;
                        } break;
                    case 48:
                        switch (hparams.n_embd) {
                            case 1024: type = LLM_TYPE_MEDIUM; break;
                            case 1536: type = LLM_TYPE_LARGE; break;
                            case 2048: type = LLM_TYPE_XL; break;
                            default:   type = LLM_TYPE_UNKNOWN;
                        } break;
                    case 64:
                        switch (hparams.n_embd) {
                            case 2560: type = LLM_TYPE_3B; break;
                            default: type = LLM_TYPE_UNKNOWN;
                        } break;
                    default: type = LLM_TYPE_UNKNOWN;
                }
            } break;
        case LLM_ARCH_MAMBA2:
            {
                // SSM parameters; Mamba-2 additionally has a group count
                ml.get_key(LLM_KV_SSM_CONV_KERNEL,    hparams.ssm_d_conv);
                ml.get_key(LLM_KV_SSM_INNER_SIZE,     hparams.ssm_d_inner);
                ml.get_key(LLM_KV_SSM_STATE_SIZE,     hparams.ssm_d_state);
                ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank);
                ml.get_key(LLM_KV_SSM_GROUP_COUNT,    hparams.ssm_n_group);

                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);

                // size table keyed on n_layer, disambiguated by embedding width
                switch (hparams.n_layer) {
                    case 24:
                        switch (hparams.n_embd) {
                            case 768: type = LLM_TYPE_SMALL; break;
                            default: type = LLM_TYPE_UNKNOWN;
                        } break;
                    case 48:
                        switch (hparams.n_embd) {
                            case 1024: type = LLM_TYPE_MEDIUM; break;
                            case 1536: type = LLM_TYPE_LARGE; break;
                            case 2048: type = LLM_TYPE_XL; break;
                            default: type = LLM_TYPE_UNKNOWN;
                        } break;
                    case 64:
                        switch (hparams.n_embd) {
                            case 2560: type = LLM_TYPE_3B; break;
                            case 4096: type = LLM_TYPE_7B; break;
                            default: type = LLM_TYPE_UNKNOWN;
                        } break;
                    default: type = LLM_TYPE_UNKNOWN;
                }
            } break;
        case LLM_ARCH_JAMBA:
            {
                // SSM parameters for the Mamba layers of the hybrid stack
                ml.get_key(LLM_KV_SSM_CONV_KERNEL,    hparams.ssm_d_conv);
                ml.get_key(LLM_KV_SSM_INNER_SIZE,     hparams.ssm_d_inner);
                ml.get_key(LLM_KV_SSM_STATE_SIZE,     hparams.ssm_d_state);
                ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank);

                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);

                // hybrid model: layers with zero KV heads are treated as the recurrent (SSM) layers
                for (uint32_t i = 0; i < hparams.n_layer; ++i) {
                    hparams.recurrent_layer_arr[i] = hparams.n_head_kv(i) == 0;
                }

                switch (hparams.n_layer) {
                    // TODO: Jamba layers are a bit heterogenous, so naming this is hard.
                    // intentional fallthrough: both known depths currently map to UNKNOWN
                    case 12: // 900M  8x???M
                    case 32: // 51B  16x?B
                    default: type = LLM_TYPE_UNKNOWN;
                }
            } break;
1521        case LLM_ARCH_XVERSE:
1522            {
1523                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1524                switch (hparams.n_layer) {
1525                    case 32: type = LLM_TYPE_7B; break;
1526                    case 40: type = LLM_TYPE_13B; break;
1527                    case 80: type = LLM_TYPE_65B; break;
1528                    default: type = LLM_TYPE_UNKNOWN;
1529                }
1530            } break;
        case LLM_ARCH_COMMAND_R:
            {
                // logit scale is required for Command-R
                ml.get_key(LLM_KV_LOGIT_SCALE,             hparams.f_logit_scale);
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
                switch (hparams.n_layer) {
                    case 40: type = LLM_TYPE_35B; break;
                    default: type = LLM_TYPE_UNKNOWN;
                }
            } break;
        case LLM_ARCH_COHERE2:
            {
                // interleaved SWA/full-attention layers with period 4
                hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
                hparams.set_swa_pattern(4);
                // default the SWA RoPE parameters to the regular ones; the optional
                // key below may override the base frequency only
                hparams.rope_freq_base_train_swa  = hparams.rope_freq_base_train;
                hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;

                // sliding window and logit scale are required for this arch
                ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA,       hparams.rope_freq_base_train_swa, false);
                ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
                ml.get_key(LLM_KV_LOGIT_SCALE,              hparams.f_logit_scale);
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS,  hparams.f_norm_eps);
                switch (hparams.n_layer) {
                    case 32: type = LLM_TYPE_8B; break;
                    default: type = LLM_TYPE_UNKNOWN;
                }
            } break;
1556        case LLM_ARCH_DBRX:
1557        {
1558            ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
1559            ml.get_key(LLM_KV_ATTENTION_CLAMP_KQV,     hparams.f_clamp_kqv);
1560
1561            switch (hparams.n_layer) {
1562                case 40: type = LLM_TYPE_16x12B; break;
1563                default: type = LLM_TYPE_UNKNOWN;
1564            }
1565        } break;
        case LLM_ARCH_OLMO:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
                // KQV clamp is optional for OLMo
                ml.get_key(LLM_KV_ATTENTION_CLAMP_KQV,     hparams.f_clamp_kqv, false);

                switch (hparams.n_layer) {
                    case 22: type = LLM_TYPE_1B; break;
                    case 32: type = LLM_TYPE_7B; break;
                    case 80: type = LLM_TYPE_70B; break;
                    default: type = LLM_TYPE_UNKNOWN;
                }
            } break;
        case LLM_ARCH_OLMO2:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);

                // SWA is enabled only when the (optional) window key is present and non-zero
                const bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
                if (found_swa && hparams.n_swa > 0) {
                    // interleaved SWA/full-attention layers with period 4
                    hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
                    hparams.set_swa_pattern(4);

                    hparams.rope_freq_base_train_swa  = hparams.rope_freq_base_train;
                    hparams.rope_freq_scale_train_swa = 1.0; // See olmo2.cpp
                    ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
                } else {
                    hparams.swa_type = LLAMA_SWA_TYPE_NONE;
                }

                switch (hparams.n_layer) {
                    case 16: type = LLM_TYPE_1B; break;
                    case 32: type = LLM_TYPE_7B; break;
                    case 40: type = LLM_TYPE_13B; break;
                    case 64: type = LLM_TYPE_32B; break;
                    default: type = LLM_TYPE_UNKNOWN;
                }
            } break;
1602        case LLM_ARCH_SEED_OSS:
1603            {
1604                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1605                switch (hparams.n_layer) {
1606                    case 64: type = LLM_TYPE_36B; break;
1607                    default: type = LLM_TYPE_UNKNOWN;
1608                }
1609            } break;
        case LLM_ARCH_OLMOE:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                // only the A1.7B (16-layer) variant is recognized
                switch (hparams.n_layer) {
                    case 16: type = LLM_TYPE_A1_7B; break;
                    default: type = LLM_TYPE_UNKNOWN;
                }
            } break;
1618        case LLM_ARCH_OPENELM:
1619            {
1620                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1621
1622                switch (hparams.n_layer) {
1623                case 16: type = LLM_TYPE_270M; break;
1624                case 20: type = LLM_TYPE_450M; break;
1625                case 28: type = LLM_TYPE_1B; break;
1626                case 36: type = LLM_TYPE_3B; break;
1627                default: type = LLM_TYPE_UNKNOWN;
1628                }
1629            } break;
        case LLM_ARCH_GPTNEOX:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
                ml.get_key(LLM_KV_USE_PARALLEL_RESIDUAL,   hparams.use_par_res);
                // size is determined by the (n_layer, n_ff) pair, since several
                // variants share the same depth but differ in FFN width
                switch (hparams.n_layer) {
                    case 6:
                        switch (hparams.n_ff()) {
                            case 512:  type = LLM_TYPE_14M; break;
                            case 2048: type = LLM_TYPE_70M; break;
                            default:   type = LLM_TYPE_UNKNOWN;
                        } break;
                    case 12:
                        switch (hparams.n_ff()) {
                            case 3072: type = LLM_TYPE_160M; break;
                            default: type = LLM_TYPE_UNKNOWN;
                        } break;
                    case 16:
                        switch (hparams.n_ff()) {
                            case 8192: type = LLM_TYPE_1B; break;
                            default: type = LLM_TYPE_UNKNOWN;
                        } break;
                    case 24:
                        switch (hparams.n_ff()) {
                            case 4096: type = LLM_TYPE_410M; break;
                            case 8192: type = LLM_TYPE_1_4B; break;
                            default: type = LLM_TYPE_UNKNOWN;
                        } break;
                    case 32:
                        switch (hparams.n_ff()) {
                            case 10240: type = LLM_TYPE_2_8B; break;
                            case 16384: type = LLM_TYPE_6_9B; break;
                            default: type = LLM_TYPE_UNKNOWN;
                        } break;
                    case 36:
                        switch (hparams.n_ff()) {
                            case 20480: type = LLM_TYPE_12B; break;
                            default: type = LLM_TYPE_UNKNOWN;
                        } break;
                    case 44:
                        switch (hparams.n_ff()) {
                            case 24576: type = LLM_TYPE_20B; break;
                            default: type = LLM_TYPE_UNKNOWN;
                        } break;
                    default: type = LLM_TYPE_UNKNOWN;
                }
            } break;
1676        case LLM_ARCH_ARCTIC:
1677            {
1678                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1679
1680                if (hparams.n_expert == 128) {
1681                    switch (hparams.n_layer) {
1682                        case 35: type = LLM_TYPE_10B_128x3_66B; break;
1683                        default: type = LLM_TYPE_UNKNOWN;
1684                    }
1685                } else {
1686                    type = LLM_TYPE_UNKNOWN;
1687                }
1688            } break;
        case LLM_ARCH_DEEPSEEK:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                // MoE layout: count of leading dense blocks, per-expert FFN width,
                // shared-expert count and routing weight scale are all required
                ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT,   hparams.n_layer_dense_lead);
                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH,  hparams.n_ff_exp);
                ml.get_key(LLM_KV_EXPERT_SHARED_COUNT,         hparams.n_expert_shared);
                ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE,        hparams.expert_weights_scale);

                // size is keyed on per-expert FFN width, not layer count
                switch (hparams.n_ff_exp) {
                    case 1408: type = LLM_TYPE_16B; break;
                    case 1792: type = LLM_TYPE_20B; break;
                    default: type = LLM_TYPE_UNKNOWN;
                }
            } break;
        case LLM_ARCH_DEEPSEEK2:
            {
                // lite variants include DeepSeek-V2-Lite, GigaChat3-10B-A1.8B
                const bool is_lite = (hparams.n_layer == 27 || hparams.n_layer == 26);

                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT,   hparams.n_layer_dense_lead);
                // lite variants have no Q LoRA projection, so the rank key is absent for them
                if (!is_lite) {
                    ml.get_key(LLM_KV_ATTENTION_Q_LORA_RANK, hparams.n_lora_q);
                }
                ml.get_key(LLM_KV_ATTENTION_KV_LORA_RANK,     hparams.n_lora_kv);
                // MLA head sizes are optional (only newer conversions write them)
                ml.get_key(LLM_KV_ATTENTION_KEY_LENGTH_MLA,   hparams.n_embd_head_k_mla_impl, false);
                ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH_MLA, hparams.n_embd_head_v_mla_impl, false);
                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
                ml.get_key(LLM_KV_EXPERT_SHARED_COUNT,        hparams.n_expert_shared);
                ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE,       hparams.expert_weights_scale, false);
                ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM,        hparams.expert_weights_norm, false);
                ml.get_key(LLM_KV_EXPERT_GATING_FUNC,         hparams.expert_gating_func, false);
                if (hparams.expert_gating_func == LLAMA_EXPERT_GATING_FUNC_TYPE_NONE) {
                    // for compatibility with existing DeepSeek V2 and V2.5 GGUFs
                    // that have no expert_gating_func model parameter set
                    if ((hparams.n_layer == 47 || hparams.n_layer == 48) && n_vocab == 154880) {
                        // GLM 4.7 Lite
                        hparams.expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID;
                    } else {
                        hparams.expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX;
                    }
                }

                if (ml.get_key(LLM_KV_ROPE_SCALING_YARN_LOG_MUL, hparams.rope_yarn_log_mul, 0.0f)) {
                    // [TAG_DEEPSEEK2_YARN_LOG_MUL_FIX]
                    // cancel the factor from the convert script
                    hparams.rope_yarn_log_mul /= 0.1f;
                }

                // (optional) temperature tuning - used by mistral-large
                ml.get_key(LLM_KV_ATTENTION_TEMPERATURE_SCALE,  hparams.f_attn_temp_scale,       false);
                ml.get_key(LLM_KV_ATTENTION_TEMPERATURE_LENGTH, hparams.n_attn_temp_floor_scale, false);

                hparams.f_attn_temp_offset = 0.0f;

                switch (hparams.n_layer) {
                    case 27: type = LLM_TYPE_16B; break;
                    case 47: type = LLM_TYPE_30B_A3B; break;
                    case 60: type = LLM_TYPE_236B; break;
                    case 61: type = LLM_TYPE_671B; break;
                    default: type = LLM_TYPE_UNKNOWN;
                }
            } break;
1752        case LLM_ARCH_PLM:
1753            {
1754                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1755                ml.get_key(LLM_KV_ATTENTION_KV_LORA_RANK, hparams.n_lora_kv);
1756                switch (hparams.n_layer) {
1757                    case 32: type = LLM_TYPE_1_8B; break;
1758                    default: type = LLM_TYPE_UNKNOWN;
1759                }
1760            } break;
        case LLM_ARCH_CHATGLM:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                // same-depth variants are disambiguated by the head count of layer 0
                switch (hparams.n_layer) {
                    case 28: {
                        if (hparams.n_head(0) == 16) {
                            type = LLM_TYPE_1_5B;
                        } else {
                            type = LLM_TYPE_6B;
                        }
                    } break;
                    case 40: {
                        if (hparams.n_head(0) == 24) {
                            type = LLM_TYPE_4B;
                        } else {
                            type = LLM_TYPE_9B;
                        }
                    } break;
                    default: type = LLM_TYPE_UNKNOWN;
                }
            } break;
        case LLM_ARCH_GLM4:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS,    hparams.f_norm_rms_eps);
                // optional rope dimension sections (up to 4 entries)
                ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS, hparams.rope_sections, 4, false);
                switch (hparams.n_layer) {
                    case 40: type = LLM_TYPE_9B; break;
                    case 61: type = LLM_TYPE_32B; break;
                    default: type = LLM_TYPE_UNKNOWN;
                }
            } break;
        case LLM_ARCH_GLM4_MOE:
            {
                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH,     hparams.n_ff_exp);
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS,    hparams.f_norm_rms_eps);
                ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS, hparams.rope_sections, 4, false);

                // MoE parameters
                ml.get_key(LLM_KV_EXPERT_COUNT,                hparams.n_expert);
                ml.get_key(LLM_KV_EXPERT_USED_COUNT,           hparams.n_expert_used);
                ml.get_key(LLM_KV_EXPERT_SHARED_COUNT,         hparams.n_expert_shared);
                ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT,   hparams.n_layer_dense_lead, false);
                ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE,        hparams.expert_weights_scale);
                ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM,         hparams.expert_weights_norm, false);

                // Expert gating function (GLM-4.5 uses sigmoid)
                ml.get_key(LLM_KV_EXPERT_GATING_FUNC,          hparams.expert_gating_func, false);
                if (hparams.expert_gating_func == LLAMA_EXPERT_GATING_FUNC_TYPE_NONE) {
                    hparams.expert_gating_func =  LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID;
                }

                // NextN/MTP parameters
                ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS,        hparams.nextn_predict_layers, false);

                // TODO: when MTP is implemented, this should probably be updated if needed
                hparams.n_layer_kv_from_start = hparams.n_layer - hparams.nextn_predict_layers;

                switch (hparams.n_layer) {
                    case 47: type = LLM_TYPE_106B_A12B; break; // GLM-4.5-Air (46 layers + 1 NextN layer)
                    case 48: type = LLM_TYPE_102B_A12B; break; // Solar Open
                    case 93: type = LLM_TYPE_355B_A32B; break; // GLM-4.5 (92 layers + 1 NextN layer)
                    default: type = LLM_TYPE_UNKNOWN;
                }
            } break;
        case LLM_ARCH_BITNET:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);

                switch (hparams.n_layer) {
                    case 26: type = LLM_TYPE_3B; break;
                    default: type = LLM_TYPE_UNKNOWN;
                }
            } break;
        case LLM_ARCH_T5:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS,      hparams.f_norm_rms_eps);
                ml.get_key(LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, hparams.n_rel_attn_bkts);

                // decoder start token is optional; hparams keeps its default otherwise
                uint32_t dec_start_token_id;
                if (ml.get_key(LLM_KV_DECODER_START_TOKEN_ID, dec_start_token_id, false)) {
                    hparams.dec_start_token_id = dec_start_token_id;
                }

                // decoder depth defaults to the encoder depth unless overridden
                hparams.dec_n_layer = hparams.n_layer;
                ml.get_key(LLM_KV_DECODER_BLOCK_COUNT, hparams.dec_n_layer, false);

                // same layer counts are shared by t5 and flan-t5; use n_ff to tell them apart
                switch (hparams.n_layer) {
                    case 6:  type = LLM_TYPE_60M;  break; // t5-small
                    case 8:  type = LLM_TYPE_80M;  break; // flan-t5-small
                    case 12:
                        switch (hparams.n_ff()) {
                            case 3072: type = LLM_TYPE_220M; break; // t5-base
                            case 2048: type = LLM_TYPE_250M; break; // flan-t5-base
                            default: type = LLM_TYPE_UNKNOWN;
                        } break;
                    case 24:
                        switch (hparams.n_ff()) {
                            case 4096:  type = LLM_TYPE_770M; break; // t5-large
                            case 2816:  type = LLM_TYPE_780M; break; // flan-t5-large
                            case 16384: type = LLM_TYPE_3B;   break; // t5-3b
                            case 5120:  type = LLM_TYPE_3B;   break; // flan-t5-xl
                            case 65536: type = LLM_TYPE_11B;  break; // t5-11b
                            case 10240: type = LLM_TYPE_11B;  break; // flan-t5-xxl
                            default: type = LLM_TYPE_UNKNOWN;
                        } break;
                    default: type = LLM_TYPE_UNKNOWN;
               }
            } break;
        case LLM_ARCH_T5ENCODER:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                ml.get_key(LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, hparams.n_rel_attn_bkts);
                type = LLM_TYPE_UNKNOWN;
            } break;
        case LLM_ARCH_JAIS:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
                ml.get_key(LLM_KV_ATTENTION_MAX_ALIBI_BIAS, hparams.f_max_alibi_bias);

                switch (hparams.n_layer) {
                    case 24: type = LLM_TYPE_1_3B; break;
                    case 40: type = LLM_TYPE_13B; break;
                    /* TODO: add variants */
                    default: type = LLM_TYPE_UNKNOWN;
                }
            } break;
        case LLM_ARCH_NEMOTRON:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
                switch (hparams.n_layer) {
                    case 32: type = LLM_TYPE_4B; break;
                    default: type = LLM_TYPE_UNKNOWN;
                }
            } break;
        case LLM_ARCH_NEMOTRON_H:
        case LLM_ARCH_NEMOTRON_H_MOE:
            {
                // hybrid SSM/attention model: mamba2 state-space hparams
                ml.get_key(LLM_KV_SSM_CONV_KERNEL,    hparams.ssm_d_conv);
                ml.get_key(LLM_KV_SSM_INNER_SIZE,     hparams.ssm_d_inner);
                ml.get_key(LLM_KV_SSM_STATE_SIZE,     hparams.ssm_d_state);
                ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank);
                ml.get_key(LLM_KV_SSM_GROUP_COUNT,    hparams.ssm_n_group);

                // A layer is recurrent IFF the n_head_kv value is set to 0 and
                // the n_ff value is set to 0
                for (uint32_t i = 0; i < hparams.n_layer; ++i) {
                    hparams.recurrent_layer_arr[i] = (hparams.n_head_kv(i) == 0 && hparams.n_ff(i) == 0);
                }

                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);

                // MoE keys are optional: absent for the dense NEMOTRON_H variant
                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH,        hparams.n_ff_exp,        false);
                ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp,      false);
                ml.get_key(LLM_KV_EXPERT_SHARED_COUNT,               hparams.n_expert_shared, false);
                ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM,               hparams.expert_weights_norm, false);
                ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE,              hparams.expert_weights_scale, false);

                switch (hparams.n_layer) {
                    case 52: type = LLM_TYPE_31B_A3_5B; break; // Nemotron-H_MOE 31B
                    case 56: type = LLM_TYPE_9B; break;
                    default: type = LLM_TYPE_UNKNOWN;
                }
            } break;
        case LLM_ARCH_EXAONE:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);

                switch (hparams.n_layer) {
                    case 32: type = LLM_TYPE_8B; break;
                    default: type = LLM_TYPE_UNKNOWN;
                }
            } break;
        case LLM_ARCH_EXAONE4:
            {
                if (hparams.n_layer == 64) {    // 32B
                    // the 32B variant uses sliding-window attention on 3 of every 4 layers
                    hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
                    hparams.n_swa = 4096;
                    hparams.set_swa_pattern(4);

                    hparams.rope_freq_base_train_swa  = hparams.rope_freq_base_train;
                    hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
                    ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
                }

                // optional key; may override the 4096 default set above
                ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW,    hparams.n_swa, false);
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);

                switch (hparams.n_layer) {
                    case 30: type = LLM_TYPE_1_2B; break;
                    case 64: type = LLM_TYPE_32B; break;
                    default: type = LLM_TYPE_UNKNOWN;
                }
            } break;
        case LLM_ARCH_EXAONE_MOE:
            {
                // sliding-window defaults; the window size key below is required
                // and overwrites the 128 placeholder
                hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
                hparams.n_swa = 128;
                hparams.set_swa_pattern(4);
                hparams.rope_freq_base_train_swa  = hparams.rope_freq_base_train;
                hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;

                ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA,                hparams.rope_freq_base_train_swa, false);
                ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW,          hparams.n_swa);
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS,       hparams.f_norm_rms_eps);
                ml.get_key(LLM_KV_EXPERT_SHARED_COUNT,               hparams.n_expert_shared, false);
                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH,        hparams.n_ff_exp);
                ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, false);
                ml.get_key(LLM_KV_EXPERT_GATING_FUNC,                hparams.expert_gating_func);
                ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE,              hparams.expert_weights_scale, false);
                ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM,               hparams.expert_weights_norm, false);
                ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT,         hparams.n_layer_dense_lead);

                ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS,              hparams.nextn_predict_layers, false);

                switch (hparams.n_layer) {
                    case 32: type = LLM_TYPE_30B_A3B; break;
                    case 48:
                    case 49: type = LLM_TYPE_235B_A22B; break;
                    default: type = LLM_TYPE_UNKNOWN;
                }
            } break;
        case LLM_ARCH_RWKV6:
        case LLM_ARCH_RWKV6QWEN2:
            {
                // either norm eps key may be present depending on the variant
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS,     hparams.f_norm_eps, false);
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps, false);
                ml.get_key(LLM_KV_WKV_HEAD_SIZE,               hparams.wkv_head_size);
                ml.get_key(LLM_KV_TIME_MIX_EXTRA_DIM,          hparams.time_mix_extra_dim);
                ml.get_key(LLM_KV_TIME_DECAY_EXTRA_DIM,        hparams.time_decay_extra_dim);
                ml.get_key(LLM_KV_RESCALE_EVERY_N_LAYERS,      hparams.rescale_every_n_layers, false);
                ml.get_key(LLM_KV_TOKEN_SHIFT_COUNT,           hparams.token_shift_count, false);

                switch (hparams.n_layer) {
                    case 24: type = LLM_TYPE_1_6B; break;
                    case 32:
                        switch (hparams.n_embd) {
                            case 2560: type = LLM_TYPE_3B; break;
                            case 4096: type = LLM_TYPE_7B; break;
                            default: type = LLM_TYPE_UNKNOWN;
                        } break;
                    case 61: type = LLM_TYPE_14B; break;
                    case 64: type = LLM_TYPE_32B; break;
                    default: type = LLM_TYPE_UNKNOWN;
                }
            } break;
        case LLM_ARCH_RWKV7:
        case LLM_ARCH_ARWKV7:
            {
                // either norm eps key may be present depending on the variant
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS,                hparams.f_norm_eps, false);
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS,            hparams.f_norm_rms_eps, false);
                ml.get_key(LLM_KV_WKV_HEAD_SIZE,                          hparams.wkv_head_size);
                ml.get_key(LLM_KV_ATTENTION_DECAY_LORA_RANK,              hparams.n_lora_decay);
                ml.get_key(LLM_KV_ATTENTION_ICLR_LORA_RANK,               hparams.n_lora_iclr);
                ml.get_key(LLM_KV_ATTENTION_VALUE_RESIDUAL_MIX_LORA_RANK, hparams.n_lora_value_res_mix);
                ml.get_key(LLM_KV_ATTENTION_GATE_LORA_RANK,               hparams.n_lora_gate, false);
                ml.get_key(LLM_KV_TOKEN_SHIFT_COUNT,                      hparams.token_shift_count, false);

                // size label from (n_layer, n_embd) pairs
                switch (hparams.n_layer) {
                    case 12:
                        switch (hparams.n_embd) {
                            case 768: type = LLM_TYPE_190M; break;
                            default: type = LLM_TYPE_UNKNOWN;
                        } break;
                    case 24:
                        switch (hparams.n_embd) {
                            case 1024: type = LLM_TYPE_450M; break;
                            case 2048: type = LLM_TYPE_1_5B; break;
                            default: type = LLM_TYPE_UNKNOWN;
                        } break;
                    case 28:
                        switch (hparams.n_embd) {
                            case 1536: type = LLM_TYPE_1_5B; break;
                            case 3584: type = LLM_TYPE_7B; break;
                            default: type = LLM_TYPE_UNKNOWN;
                        } break;
                    case 32:
                        switch (hparams.n_embd) {
                            case 2560: type = LLM_TYPE_2_9B; break;
                            case 4096: type = LLM_TYPE_7B; break;
                            default: type = LLM_TYPE_UNKNOWN;
                        } break;
                    case 61:
                        switch (hparams.n_embd) {
                            case 4096: type = LLM_TYPE_14B; break;
                            default: type = LLM_TYPE_UNKNOWN;
                        } break;
                    default: type = LLM_TYPE_UNKNOWN;
                }
            } break;
        case LLM_ARCH_GRANITE:
        case LLM_ARCH_GRANITE_MOE:
            {
                // Granite applies explicit scaling factors to logits, residual,
                // embeddings and attention — all required keys here
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                ml.get_key(LLM_KV_LOGIT_SCALE,                 hparams.f_logit_scale);
                ml.get_key(LLM_KV_RESIDUAL_SCALE,              hparams.f_residual_scale);
                ml.get_key(LLM_KV_EMBEDDING_SCALE,             hparams.f_embedding_scale);
                ml.get_key(LLM_KV_ATTENTION_SCALE,             hparams.f_attention_scale);

                // Granite uses rope_finetuned as a switch for rope, so default to true
                bool rope_finetuned = true;
                ml.get_key(LLM_KV_ROPE_SCALING_FINETUNED, rope_finetuned, false);
                hparams.rope_finetuned = rope_finetuned;

                switch (hparams.n_layer) {
                    case 32: type = LLM_TYPE_3B; break;
                    case 40: type = LLM_TYPE_3B; break;
                    // Add additional layer/vocab/etc checks here for other model sizes
                    default: type = LLM_TYPE_UNKNOWN;
                }

                // For Granite MoE Shared
                ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, /* required */ false);
            } break;
2074        case LLM_ARCH_GRANITE_HYBRID:
2075            {
2076                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
2077                ml.get_key(LLM_KV_LOGIT_SCALE,                 hparams.f_logit_scale, /* required */ false);
2078                ml.get_key(LLM_KV_RESIDUAL_SCALE,              hparams.f_residual_scale, /* required */ false);
2079                ml.get_key(LLM_KV_EMBEDDING_SCALE,             hparams.f_embedding_scale, /* required */ false);
2080                ml.get_key(LLM_KV_ATTENTION_SCALE,             hparams.f_attention_scale, /* required */ false);
2081
2082                ml.get_key(LLM_KV_SSM_CONV_KERNEL,    hparams.ssm_d_conv);
2083                ml.get_key(LLM_KV_SSM_INNER_SIZE,     hparams.ssm_d_inner);
2084                ml.get_key(LLM_KV_SSM_STATE_SIZE,     hparams.ssm_d_state);
2085                ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank);
2086                ml.get_key(LLM_KV_SSM_GROUP_COUNT,    hparams.ssm_n_group);
2087
2088                // Granite uses rope_finetuned as a switch for rope, so default to true
2089                bool rope_finetuned = true;
2090                ml.get_key(LLM_KV_ROPE_SCALING_FINETUNED, rope_finetuned, false);
2091                hparams.rope_finetuned = rope_finetuned;
2092
2093                // A layer is recurrent IFF the n_head_kv value is set to 0
2094                for (uint32_t i = 0; i < hparams.n_layer; ++i) {
2095                    hparams.recurrent_layer_arr[i] = hparams.n_head_kv(i) == 0;
2096                }
2097
2098                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
2099
2100                switch (hparams.n_embd) {
2101                    case 768: type = LLM_TYPE_350M; break;
2102                    case 1536: type = (hparams.n_embd == 2048 ? LLM_TYPE_7B_A1B : LLM_TYPE_1B); break;
2103                    case 2048: case 2560: type = LLM_TYPE_3B; break;
2104                    case 4096: type = LLM_TYPE_32B; break;
2105                    default: type = LLM_TYPE_UNKNOWN;
2106                }
2107
2108                // For Granite MoE Shared
2109                ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, /* required */ false);
2110            } break;
        case LLM_ARCH_CHAMELEON:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                hparams.f_norm_eps = 1e-5;  // eps for qk-norm, torch default
                ml.get_key(LLM_KV_SWIN_NORM, hparams.swin_norm);

                switch (hparams.n_layer) {
                    case 32: type = LLM_TYPE_7B; break;
                    case 48: type = LLM_TYPE_34B; break;
                    default: type = LLM_TYPE_UNKNOWN;
               }
            } break;
        case LLM_ARCH_WAVTOKENIZER_DEC:
            {
                // audio decoder: no size label is assigned for this arch
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS,    hparams.f_norm_eps);
                ml.get_key(LLM_KV_ATTENTION_GROUPNORM_EPS,    hparams.f_norm_group_eps);
                ml.get_key(LLM_KV_ATTENTION_GROUPNORM_GROUPS, hparams.n_norm_groups);
                ml.get_key(LLM_KV_ATTENTION_CAUSAL,           hparams.causal_attn);
            } break;
        case LLM_ARCH_BAILINGMOE:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT,   hparams.n_layer_dense_lead);
                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH,  hparams.n_ff_exp);
                ml.get_key(LLM_KV_EXPERT_SHARED_COUNT,         hparams.n_expert_shared);
                ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE,        hparams.expert_weights_scale);
                ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM,         hparams.expert_weights_norm, false);

                switch (hparams.n_layer) {
                    case 28: type = LLM_TYPE_16B; break;
                    case 88: type = LLM_TYPE_290B; break;
                    default: type = LLM_TYPE_UNKNOWN;
                }
            } break;
        case LLM_ARCH_BAILINGMOE2:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS,       hparams.f_norm_rms_eps);
                ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT,         hparams.n_layer_dense_lead);
                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH,        hparams.n_ff_exp);
                ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp);
                ml.get_key(LLM_KV_EXPERT_SHARED_COUNT,               hparams.n_expert_shared);
                ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE,              hparams.expert_weights_scale);
                ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM,               hparams.expert_weights_norm, false);
                ml.get_key(LLM_KV_EXPERT_GATING_FUNC,                hparams.expert_gating_func);
                ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS,              hparams.nextn_predict_layers, false);

                // TODO: when MTP is implemented, this should probably be updated if needed
                hparams.n_layer_kv_from_start = hparams.n_layer - hparams.nextn_predict_layers;

                // adjacent layer counts map to the same label; NOTE(review):
                // presumably with/without the extra NextN layer — confirm
                switch (hparams.n_layer) {
                    case 20: type = LLM_TYPE_16B_A1B; break;
                    case 21: type = LLM_TYPE_16B_A1B; break;
                    case 32: type = LLM_TYPE_100B_A6B; break;
                    case 33: type = LLM_TYPE_100B_A6B; break;
                    default: type = LLM_TYPE_UNKNOWN;
                }
            } break;
        case LLM_ARCH_DOTS1:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT,   hparams.n_layer_dense_lead);
                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH,  hparams.n_ff_exp);
                ml.get_key(LLM_KV_EXPERT_SHARED_COUNT,         hparams.n_expert_shared);
                ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE,        hparams.expert_weights_scale);
                ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM,         hparams.expert_weights_norm, false);
                ml.get_key(LLM_KV_EXPERT_GATING_FUNC,          hparams.expert_gating_func, false);
                switch (hparams.n_layer) {
                    case 62: type = LLM_TYPE_142B; break;
                    default: type = LLM_TYPE_UNKNOWN;
                }
            } break;
        case LLM_ARCH_ERNIE4_5:
        case LLM_ARCH_ERNIE4_5_MOE:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                // MoE keys exist only for the MoE variant
                if (arch == LLM_ARCH_ERNIE4_5_MOE) {
                    ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH,        hparams.n_ff_exp);
                    ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, false);
                    ml.get_key(LLM_KV_INTERLEAVE_MOE_LAYER_STEP,         hparams.n_moe_layer_step);
                    ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT,         hparams.n_layer_dense_lead);
                }

                switch (hparams.n_layer) {
                    case 18: type = LLM_TYPE_0_3B; break;
                    case 28: type = LLM_TYPE_21B_A3B; break;
                    case 54: type = LLM_TYPE_300B_A47B; break;
                    default: type = LLM_TYPE_UNKNOWN;
                }
            } break;
        case LLM_ARCH_FALCON_H1:
            {
                // Common parameters
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);

                // SSM parameters
                ml.get_key(LLM_KV_SSM_CONV_KERNEL,    hparams.ssm_d_conv);
                ml.get_key(LLM_KV_SSM_INNER_SIZE,     hparams.ssm_d_inner);
                ml.get_key(LLM_KV_SSM_STATE_SIZE,     hparams.ssm_d_state);
                ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank);
                ml.get_key(LLM_KV_SSM_GROUP_COUNT,    hparams.ssm_n_group);

                // every layer carries recurrent (SSM) state
                std::fill(hparams.recurrent_layer_arr.begin(), hparams.recurrent_layer_arr.end(), true);

                switch (hparams.n_layer) {
                    case 36:
                        type = LLM_TYPE_0_5B; break;
                    case 24:
                        type = LLM_TYPE_1_5B; break;
                    case 66:
                        type = LLM_TYPE_1B; break;
                    case 32:
                        type = LLM_TYPE_3B; break;
                    case 44:
                        type = LLM_TYPE_7B; break;
                    case 72:
                        type = LLM_TYPE_34B; break;
                    default:
                        type = LLM_TYPE_UNKNOWN;
                }
            } break;
        case LLM_ARCH_HUNYUAN_MOE:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS,       hparams.f_norm_rms_eps);
                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH,        hparams.n_ff_exp);
                ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp);

                switch (hparams.n_layer) {
                    case 32: type = LLM_TYPE_A13B; break;
                    default: type = LLM_TYPE_UNKNOWN;
                }
            } break;
        case LLM_ARCH_HUNYUAN_DENSE:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);

                // size label is keyed off embedding width, not layer count
                switch (hparams.n_embd) {
                    case 1024: type = LLM_TYPE_0_5B; break;
                    case 2048: type = LLM_TYPE_1_8B; break;
                    case 3072: type = LLM_TYPE_4B; break;
                    case 4096: type = LLM_TYPE_7B; break;
                    default: type = LLM_TYPE_UNKNOWN;
                }
            } break;
        case LLM_ARCH_SMOLLM3:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                // NOTE(review): presumably rope is skipped every 4th layer — confirm
                hparams.n_no_rope_layer_step = 4;

                switch (hparams.n_layer) {
                    case 36: type = LLM_TYPE_3B; break;
                    default: type = LLM_TYPE_UNKNOWN;
                }
            } break;
        case LLM_ARCH_OPENAI_MOE:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH,  hparams.n_ff_exp);
                ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW,    hparams.n_swa);

                // sliding-window attention on alternating layers
                hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
                hparams.set_swa_pattern(2);

                hparams.rope_freq_base_train_swa  = hparams.rope_freq_base_train;
                hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
                ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);

                switch (hparams.n_layer) {
                    case 24: type = LLM_TYPE_20B; break;
                    case 36: type = LLM_TYPE_120B; break;
                    default: type = LLM_TYPE_UNKNOWN;
                }
            } break;
        case LLM_ARCH_LFM2:
            {
                ml.get_key(LLM_KV_SHORTCONV_L_CACHE,           hparams.n_shortconv_l_cache);
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                // layers with no KV heads are recurrent (short-conv) layers
                for (uint32_t il = 0; il < hparams.n_layer; ++il) {
                    hparams.recurrent_layer_arr[il] = hparams.n_head_kv(il) == 0;
                }
                // no MoE layers: treat the whole stack as dense
                hparams.n_layer_dense_lead = hparams.n_layer;
                switch (hparams.n_ff()) {
                    case  4608: type = LLM_TYPE_350M; break;
                    case  6912: type = LLM_TYPE_700M; break;
                    case  8192: type = LLM_TYPE_1_2B; break;
                    case 10752: type = LLM_TYPE_2_6B; break;
                    default:    type = LLM_TYPE_UNKNOWN;
                }
            } break;
        case LLM_ARCH_LFM2MOE:
            {
                ml.get_key(LLM_KV_SHORTCONV_L_CACHE,           hparams.n_shortconv_l_cache);
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT,   hparams.n_layer_dense_lead);
                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH,  hparams.n_ff_exp);
                ml.get_key(LLM_KV_EXPERT_GATING_FUNC,          hparams.expert_gating_func);

                // layers with no KV heads are recurrent (short-conv) layers
                for (uint32_t il = 0; il < hparams.n_layer; ++il) {
                    hparams.recurrent_layer_arr[il] = hparams.n_head_kv(il) == 0;
                }

                // single released size for this arch
                type = LLM_TYPE_8B_A1B;
            } break;
2313        case LLM_ARCH_SMALLTHINKER:
2314            {
2315                const bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
2316
2317                if (found_swa && hparams.n_swa > 0) {
2318                    hparams.swa_type      = LLAMA_SWA_TYPE_STANDARD;
2319                    hparams.n_swa         = 4096;
2320                    hparams.set_swa_pattern(4, true);
2321
2322                    hparams.rope_freq_base_train_swa  = hparams.rope_freq_base_train;
2323                    hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
2324                    ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
2325                } else {
2326                    hparams.swa_type             = LLAMA_SWA_TYPE_NONE;
2327                    hparams.n_no_rope_layer_step = hparams.n_layer;
2328                }
2329
2330                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH,  hparams.n_ff_exp, false);
2331                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
2332                ml.get_key(LLM_KV_EXPERT_GATING_FUNC,          hparams.expert_gating_func, false);
2333
2334                switch (hparams.n_layer) {
2335                    case 32: type = LLM_TYPE_4B;  break;
2336                    case 52: type = LLM_TYPE_20B; break;
2337                    default: type = LLM_TYPE_UNKNOWN;
2338                }
2339            } break;
2340        case LLM_ARCH_GROVEMOE:
2341            {
2342                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH,        hparams.n_ff_exp);
2343                ml.get_key(LLM_KV_EXPERT_CHUNK_FEED_FORWARD_LENGTH,  hparams.n_ff_chexp);
2344                ml.get_key(LLM_KV_EXPERT_GROUP_SCALE,                hparams.expert_group_scale);
2345                ml.get_key(LLM_KV_EXPERTS_PER_GROUP,                 hparams.n_group_experts);
2346                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS,       hparams.f_norm_rms_eps);
2347
2348                switch (hparams.n_layer) {
2349                    case 48: type = LLM_TYPE_30B_A3B; break;
2350                    default: type = LLM_TYPE_UNKNOWN;
2351                }
2352            } break;
2353        case LLM_ARCH_APERTUS:
2354            {
2355                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
2356                ml.get_key_or_arr(LLM_KV_XIELU_ALPHA_N,        hparams.xielu_alpha_n, hparams.n_layer);
2357                ml.get_key_or_arr(LLM_KV_XIELU_ALPHA_P,        hparams.xielu_alpha_p, hparams.n_layer);
2358                ml.get_key_or_arr(LLM_KV_XIELU_BETA,           hparams.xielu_beta,    hparams.n_layer);
2359                ml.get_key_or_arr(LLM_KV_XIELU_EPS,            hparams.xielu_eps,     hparams.n_layer);
2360
2361                switch (hparams.n_layer) {
2362                    case 32: type = LLM_TYPE_8B; break;
2363                    default: type = LLM_TYPE_UNKNOWN;
2364                }
2365            } break;
2366        case LLM_ARCH_MINIMAX_M2:
2367            {
2368                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS,  hparams.f_norm_rms_eps);
2369                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH,   hparams.n_ff_exp);
2370                ml.get_key(LLM_KV_EXPERT_GATING_FUNC,           hparams.expert_gating_func, false);
2371
2372                switch (hparams.n_layer) {
2373                    case 62: type = LLM_TYPE_230B_A10B; break;
2374                    default: type = LLM_TYPE_UNKNOWN;
2375                }
2376            } break;
2377        case LLM_ARCH_COGVLM:
2378            {
2379                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
2380                switch (hparams.n_layer) {
2381                    case 32: type = LLM_TYPE_13B; break;
2382                    default: type = LLM_TYPE_UNKNOWN;
2383                }
2384            } break;
2385        case LLM_ARCH_PANGU_EMBED:
2386            {
2387                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
2388                switch (hparams.n_layer) {
2389                    case 26: type = LLM_TYPE_1B; break; // openPangu-Embedded-1B-V1.1
2390                    case 34: type = LLM_TYPE_7B; break; // openPangu-Embedded-7B-V1.1
2391                    default: type = LLM_TYPE_UNKNOWN;
2392                }
2393            } break;
2394        case LLM_ARCH_QWEN3NEXT:
2395            {
2396                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH,        hparams.n_ff_exp, false);
2397                ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, false);
2398                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS,       hparams.f_norm_rms_eps);
2399
2400                // Load linear attention (gated delta net) parameters
2401                ml.get_key(LLM_KV_SSM_CONV_KERNEL,    hparams.ssm_d_conv);
2402                ml.get_key(LLM_KV_SSM_INNER_SIZE,     hparams.ssm_d_inner);
2403                ml.get_key(LLM_KV_SSM_STATE_SIZE,     hparams.ssm_d_state);
2404                ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank);
2405                ml.get_key(LLM_KV_SSM_GROUP_COUNT,    hparams.ssm_n_group);
2406
2407                // Mark recurrent layers (linear attention layers)
2408                {
2409                    uint32_t full_attn_interval = 4;
2410                    ml.get_key(LLM_KV_FULL_ATTENTION_INTERVAL, full_attn_interval, false);
2411                    for (uint32_t i = 0; i < hparams.n_layer; ++i) {
2412                        hparams.recurrent_layer_arr[i] = ((i + 1) % full_attn_interval != 0);
2413                    }
2414                }
2415
2416                switch (hparams.n_layer) {
2417                    case 48: type = LLM_TYPE_80B_A3B; break;
2418                    default: type = LLM_TYPE_UNKNOWN;
2419                }
2420            } break;
2421        case LLM_ARCH_QWEN35:
2422            {
2423                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS,       hparams.f_norm_rms_eps);
2424                ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS,    hparams.rope_sections, 4, true);
2425
2426                // Load linear attention (gated delta net) parameters
2427                ml.get_key(LLM_KV_SSM_CONV_KERNEL,    hparams.ssm_d_conv);
2428                ml.get_key(LLM_KV_SSM_INNER_SIZE,     hparams.ssm_d_inner);
2429                ml.get_key(LLM_KV_SSM_STATE_SIZE,     hparams.ssm_d_state);
2430                ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank);
2431                ml.get_key(LLM_KV_SSM_GROUP_COUNT,    hparams.ssm_n_group);
2432
2433                // Mark recurrent layers (linear attention layers)
2434                {
2435                    uint32_t full_attn_interval = 4;
2436                    ml.get_key(LLM_KV_FULL_ATTENTION_INTERVAL, full_attn_interval, false);
2437                    for (uint32_t i = 0; i < hparams.n_layer; ++i) {
2438                        hparams.recurrent_layer_arr[i] = ((i + 1) % full_attn_interval != 0);
2439                    }
2440                }
2441
2442                switch (hparams.n_layer) {
2443                    case 24: type = LLM_TYPE_2B; break;
2444                    default: type = LLM_TYPE_UNKNOWN;
2445                }
2446            } break;
2447        case LLM_ARCH_QWEN35MOE:
2448            {
2449                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH,        hparams.n_ff_exp, false);
2450                ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, false);
2451                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS,       hparams.f_norm_rms_eps);
2452
2453                ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS,    hparams.rope_sections, 4, true);
2454
2455                // Load linear attention (gated delta net) parameters
2456                ml.get_key(LLM_KV_SSM_CONV_KERNEL,    hparams.ssm_d_conv);
2457                ml.get_key(LLM_KV_SSM_INNER_SIZE,     hparams.ssm_d_inner);
2458                ml.get_key(LLM_KV_SSM_STATE_SIZE,     hparams.ssm_d_state);
2459                ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank);
2460                ml.get_key(LLM_KV_SSM_GROUP_COUNT,    hparams.ssm_n_group);
2461
2462                // Mark recurrent layers (linear attention layers)
2463                {
2464                    uint32_t full_attn_interval = 4;
2465                    ml.get_key(LLM_KV_FULL_ATTENTION_INTERVAL, full_attn_interval, false);
2466                    for (uint32_t i = 0; i < hparams.n_layer; ++i) {
2467                        hparams.recurrent_layer_arr[i] = ((i + 1) % full_attn_interval != 0);
2468                    }
2469                }
2470
2471                switch (hparams.n_layer) {
2472                    case 28: type = LLM_TYPE_35B_A3B; break;
2473                    case 48: type = LLM_TYPE_80B_A3B; break;
2474                    default: type = LLM_TYPE_UNKNOWN;
2475                }
2476            } break;
2477        case LLM_ARCH_MISTRAL3:
2478            {
2479                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
2480                ml.get_key(LLM_KV_ATTENTION_TEMPERATURE_SCALE, hparams.f_attn_temp_scale, false);
2481
2482                ml.get_key(LLM_KV_ROPE_SCALING_YARN_BETA_FAST, hparams.yarn_beta_fast,    false);
2483                ml.get_key(LLM_KV_ROPE_SCALING_YARN_BETA_SLOW, hparams.yarn_beta_slow,    false);
2484                ml.get_key(LLM_KV_ROPE_SCALING_YARN_LOG_MUL,   hparams.rope_yarn_log_mul, 0.0f);
2485
2486                hparams.f_attn_temp_offset = 0.0f;
2487
2488                // TODO: maybe add n_attn_temp_floor_scale as a separate KV?
2489                if (hparams.f_attn_temp_scale != 0.0f) {
2490                    hparams.n_attn_temp_floor_scale = hparams.n_ctx_orig_yarn;
2491                    if (hparams.n_attn_temp_floor_scale == 0) {
2492                        throw std::runtime_error("invalid n_ctx_orig_yarn for attention temperature scaling");
2493                    }
2494                }
2495
2496                switch (hparams.n_layer) {
2497                    case 26: type = LLM_TYPE_3B; break;
2498                    case 34: type = LLM_TYPE_8B; break;
2499                    case 40: type = LLM_TYPE_14B; break;
2500                    default: type = LLM_TYPE_UNKNOWN;
2501                }
2502            } break;
2503        case LLM_ARCH_MIMO2:
2504            {
2505                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
2506
2507                hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
2508
2509                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
2510                ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW,   hparams.n_swa);
2511                ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA,         hparams.rope_freq_base_train_swa);
2512                ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, hparams.swa_layers, hparams.n_layer);
2513
2514                switch (hparams.n_layer) {
2515                    case 48: type = LLM_TYPE_310B_A15B; break;
2516                    default: type = LLM_TYPE_UNKNOWN;
2517                }
2518            } break;
2519        case LLM_ARCH_KIMI_LINEAR:
2520            {
2521                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
2522                ml.get_key(LLM_KV_ATTENTION_KEY_LENGTH_MLA,    hparams.n_embd_head_k_mla_impl);
2523                ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH_MLA,  hparams.n_embd_head_v_mla_impl);
2524                ml.get_key(LLM_KV_ATTENTION_KV_LORA_RANK,      hparams.n_lora_kv);
2525                ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT,        hparams.n_rot);
2526                ml.get_key(LLM_KV_SSM_CONV_KERNEL,             hparams.ssm_d_conv);
2527                ml.get_key(LLM_KV_KDA_HEAD_DIM,                hparams.n_embd_head_kda);
2528
2529                // MLA qk_rope_head_dim (for reference)
2530                // qk_rope_head_dim = 64, qk_nope_head_dim = 128, qk_head_dim = 192
2531
2532                // Mark KDA layers as recurrent using n_head_kv pattern (like Jamba)
2533                // Set n_head_kv = 0 for KDA layers (recurrent), n_head_kv = n_head for MLA layers (attention)
2534                for (uint32_t i = 0; i < hparams.n_layer; ++i) {
2535                    hparams.recurrent_layer_arr[i] = hparams.n_head_kv(i) == 0;  // KDA layers are recurrent
2536                }
2537
2538                // MoE parameters - Kimi uses moe_intermediate_size = 1024
2539                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH,        hparams.n_ff_exp);
2540                ml.get_key(LLM_KV_EXPERT_SHARED_COUNT,               hparams.n_expert_shared);
2541                ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT,         hparams.n_layer_dense_lead);
2542                ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE,              hparams.expert_weights_scale);
2543                ml.get_key(LLM_KV_EXPERT_GATING_FUNC,                hparams.expert_gating_func);
2544
2545                switch (hparams.n_layer) {
2546                    case 27: type = LLM_TYPE_48B_A3B; break; // Kimi-Linear-48B-A3B
2547                    default: type = LLM_TYPE_UNKNOWN;
2548                }
2549            } break;
2550        case LLM_ARCH_STEP35:
2551            {
2552                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
2553
2554                hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
2555
2556                // MoE + SWA parameters
2557                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH,        hparams.n_ff_exp);
2558                ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, false);
2559                ml.get_key(LLM_KV_EXPERT_GATING_FUNC,                hparams.expert_gating_func, false);
2560                ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE,              hparams.expert_weights_scale, false);
2561                ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM,               hparams.expert_weights_norm, false);
2562
2563                // Step35 uses sigmoid gating by default (if not set in GGUF)
2564                if (hparams.expert_gating_func == LLAMA_EXPERT_GATING_FUNC_TYPE_NONE) {
2565                    hparams.expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID;
2566                }
2567
2568                ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW,  hparams.n_swa);
2569                ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA,        hparams.rope_freq_base_train_swa);
2570                ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, hparams.swa_layers, hparams.n_layer);
2571                ml.get_key_or_arr(LLM_KV_SWIGLU_CLAMP_EXP,   hparams.swiglu_clamp_exp,   hparams.n_layer, false);
2572                ml.get_key_or_arr(LLM_KV_SWIGLU_CLAMP_SHEXP, hparams.swiglu_clamp_shexp, hparams.n_layer, false);
2573
2574                switch (hparams.n_layer) {
2575                    case 45: type = LLM_TYPE_196B_A11B; break;
2576                    default: type = LLM_TYPE_UNKNOWN;
2577                }
2578            } break;
2579        default: throw std::runtime_error("unsupported model architecture");
2580    }
2581
2582    pimpl->n_bytes = ml.n_bytes;
2583
2584    pimpl->desc_str = arch_name() + " " + type_name() + " " + ml.ftype_name();
2585
2586    if (hparams.f_max_alibi_bias > 0.0f) {
2587        hparams.use_alibi = true;
2588    }
2589
2590    hparams.rope_type = llama_model_rope_type(this);
2591}
2592
2593void llama_model::load_vocab(llama_model_loader & ml) {
2594    const auto kv = LLM_KV(arch);
2595
2596    vocab.load(ml, kv);
2597}
2598
2599bool llama_model::load_tensors(llama_model_loader & ml) {
2600    const auto & split_mode   = params.split_mode;
2601    const auto & use_mlock    = params.use_mlock;
2602    const auto & tensor_split = params.tensor_split;
2603
2604    const int n_layer      = hparams.n_layer;
2605    const int n_gpu_layers = this->n_gpu_layers();
2606
2607    const bool use_mmap_buffer = true;
2608
2609    LLAMA_LOG_INFO("%s: loading model tensors, this can take a while... (mmap = %s, direct_io = %s)\n",
2610        __func__, ml.use_mmap ? "true" : "false", ml.use_direct_io ? "true" : "false");
2611
2612    // build a list of buffer types for the CPU and GPU devices
2613    pimpl->cpu_buft_list = make_cpu_buft_list(devices, params.use_extra_bufts, params.no_host);
2614    for (auto * dev : devices) {
2615        buft_list_t buft_list = make_gpu_buft_list(dev, split_mode, tensor_split);
2616        // add CPU buffer types as a fallback
2617        buft_list.insert(buft_list.end(), pimpl->cpu_buft_list.begin(), pimpl->cpu_buft_list.end());
2618        pimpl->gpu_buft_list.emplace(dev, std::move(buft_list));
2619    }
2620
2621    ggml_backend_dev_t cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
2622    if (cpu_dev == nullptr) {
2623        throw std::runtime_error(format("%s: no CPU backend found", __func__));
2624    }
2625
2626    // calculate the split points
2627    bool all_zero = tensor_split == nullptr || std::all_of(tensor_split, tensor_split + n_devices(), [](float x) { return x == 0.0f; });
2628    std::vector<float> splits(n_devices());
2629    if (all_zero) {
2630        // default split, by free memory
2631        for (size_t i = 0; i < n_devices(); ++i) {
2632            ggml_backend_dev_t dev = devices[i];
2633            size_t total;
2634            size_t free;
2635            ggml_backend_dev_memory(dev, &free, &total);
2636
2637            // devices can return 0 bytes for free and total memory if they do not
2638            // have any to report. in this case, we will use the host memory as a fallback
2639            // fixes: https://github.com/ggml-org/llama.cpp/issues/18577
2640            if (free == 0 && total == 0) {
2641                ggml_backend_dev_memory(cpu_dev, &free, &total);
2642            }
2643            splits[i] = free;
2644        }
2645    } else {
2646        std::copy(tensor_split, tensor_split + n_devices(), splits.begin());
2647    }
2648
2649    // sum and normalize the splits to get the split points
2650    float split_sum = 0.0f;
2651    for (size_t i = 0; i < n_devices(); ++i) {
2652        split_sum += splits[i];
2653        splits[i] = split_sum;
2654    }
2655    for (size_t i = 0; i < n_devices(); ++i) {
2656        splits[i] /= split_sum;
2657    }
2658
2659    const int i_gpu_start = std::max(int(hparams.n_layer) + 1 - n_gpu_layers, 0);
2660    const int act_gpu_layers = devices.empty() ? 0 : std::min(n_gpu_layers, int(n_layer) + 1);
    // map a layer index to the device and buffer-type list it is assigned to
    // (il == n_layer is used below to select the placement of the output layer)
    // layers outside [i_gpu_start, i_gpu_start + act_gpu_layers) stay on the CPU;
    // offloaded layers are distributed over the devices via the normalized split points
    auto get_layer_buft_list = [&](int il) -> llama_model::impl::layer_dev {
        const bool is_swa = il < int(hparams.n_layer) && hparams.is_swa(il);
        if (il < i_gpu_start || (il - i_gpu_start) >= act_gpu_layers) {
            LLAMA_LOG_DEBUG("load_tensors: layer %3d assigned to device %s, is_swa = %d\n", il, ggml_backend_dev_name(cpu_dev), is_swa);
            return {cpu_dev, &pimpl->cpu_buft_list};
        }
        // the first split point strictly greater than this layer's normalized position picks the device
        const int layer_gpu = std::upper_bound(splits.begin(), splits.begin() + n_devices(), float(il - i_gpu_start)/act_gpu_layers) - splits.begin();
        auto * dev = devices.at(layer_gpu);
        LLAMA_LOG_DEBUG("load_tensors: layer %3d assigned to device %s, is_swa = %d\n", il, ggml_backend_dev_name(dev), is_swa);
        return {dev, &pimpl->gpu_buft_list.at(dev)};
    };
2672
2673    // assign the input layer
2674    // there is very little benefit to offloading the input layer, so always keep it on the CPU
2675    pimpl->dev_input = { cpu_dev, &pimpl->cpu_buft_list };
2676
2677    // assign the repeating layers to the devices according to the splits
2678    pimpl->dev_layer.resize(n_layer);
2679    for (int il = 0; il < n_layer; ++il) {
2680        pimpl->dev_layer[il] = get_layer_buft_list(il);
2681    }
2682
2683    // assign the output layer
2684    pimpl->dev_output = get_layer_buft_list(n_layer);
2685
2686    // one ggml context per buffer type
2687    int max_n_tensors = ml.n_tensors;
2688    max_n_tensors += 1;         // duplicated output tensor
2689    max_n_tensors += n_layer*2; // duplicated rope freq tensors
2690    const size_t ctx_size = ggml_tensor_overhead()*max_n_tensors;
2691
2692    // define a comparator for the buft -> ctx map to ensure that the order is well-defined:
2693    struct ggml_backend_buft_comparator {
2694        bool operator()(const ggml_backend_buffer_type_t & lhs, const ggml_backend_buffer_type_t & rhs) const {
2695            return strcmp(ggml_backend_buft_name(lhs), ggml_backend_buft_name(rhs)) < 0;
2696        }
2697    };
2698    std::map<ggml_backend_buffer_type_t, ggml_context_ptr, ggml_backend_buft_comparator> ctx_map;
2699
    // return the ggml context associated with the given buffer type, lazily
    // creating it on first use; contexts are created with no_alloc = true
    // (tensor metadata only) and sized by the ctx_size bound computed above
    auto ctx_for_buft = [&](ggml_backend_buffer_type_t buft) -> ggml_context * {
        auto it = ctx_map.find(buft);
        if (it == ctx_map.end()) {
            ggml_init_params params = {
                /*.mem_size   =*/ ctx_size,
                /*.mem_buffer =*/ NULL,
                /*.no_alloc   =*/ true,
            };

            ggml_context * ctx = ggml_init(params);
            if (!ctx) {
                throw std::runtime_error(format("failed to create ggml context"));
            }

            // ctx_map owns the context (ggml_context_ptr); the raw pointer returned
            // here stays valid for as long as the map entry exists
            ctx_map.emplace(buft, ctx);

            return ctx;
        }
        return it->second.get();
    };
2720
2721    const auto TENSOR_DUPLICATED   = llama_model_loader::TENSOR_DUPLICATED;
2722    const auto TENSOR_NOT_REQUIRED = llama_model_loader::TENSOR_NOT_REQUIRED;
2723    const auto TENSOR_SKIP         = llama_model_loader::TENSOR_SKIP;
2724
2725    // create tensors for the weights
2726    {
2727        // note: cast to int64_t since we will use these for the tensor dimensions
2728        const int64_t n_head        = hparams.n_head();
2729        const int64_t n_head_kv     = hparams.n_head_kv();
2730        const int64_t n_embd        = hparams.n_embd;
2731        const int64_t n_embd_k_gqa  = hparams.n_embd_k_gqa();
2732        const int64_t n_embd_v_gqa  = hparams.n_embd_v_gqa();
2733        const int64_t n_embd_head_k = hparams.n_embd_head_k;
2734        const int64_t n_embd_head_v = hparams.n_embd_head_v;
2735        const int64_t n_ff          = hparams.n_ff();
2736        const int64_t n_embd_gqa    = n_embd_v_gqa;
2737        const int64_t n_vocab       = vocab.n_tokens();
2738        const int64_t n_token_types = vocab.n_token_types();
2739        const int64_t n_rot         = hparams.n_rot;
2740        const int64_t n_expert      = hparams.n_expert;
2741        const int64_t n_expert_used = hparams.n_expert_used;
2742        const int64_t n_ctx_train   = hparams.n_ctx_train;
2743
2744        if (n_expert > 0 && hparams.n_expert_used == 0) {
2745            throw std::runtime_error("model has expert layers but no expert layers are used");
2746        }
2747
2748        int n_moved_tensors = 0;
2749        ggml_tensor * first_moved_tensor = nullptr;
2750        ggml_backend_buffer_type_t first_moved_from_buft = nullptr;
2751        ggml_backend_buffer_type_t first_moved_to_buft = nullptr;
2752
        // create the meta tensor for `tn` with dimensions `ne` in the ggml context that
        // matches the buffer type selected for it
        // returns nullptr (without error) for optional tensors that are absent and for
        // tensors that are skipped or unused
        // throws on missing required tensors and on buffer-type selection failure
        auto create_tensor = [&](const LLM_TN_IMPL & tn, const std::initializer_list<int64_t> & ne, int flags) -> ggml_tensor * {
            ggml_tensor * t_meta = ml.get_tensor_meta(tn.str().c_str());

            if (!t_meta) {
                if (flags & TENSOR_NOT_REQUIRED) {
                    return nullptr;
                }
                throw std::runtime_error(format("missing tensor '%s'", tn.str().c_str()));
            }

            // some models use the token embedding tensor as the output, but since these are used in different layers and with different ops
            // the tensor is duplicated
            // to handle this, we check if the tensor is duplicated, and if so, we assume that it is being loaded as the output tensor
            llm_tensor tn_tensor = tn.tensor;
            if (tn.tensor == LLM_TENSOR_TOKEN_EMBD && flags & TENSOR_DUPLICATED) {
                tn_tensor = LLM_TENSOR_OUTPUT;
            }

            llm_tensor_info info;
            try {
                info = llm_tensor_info_for(tn_tensor);
            } catch (const std::out_of_range & e) {
                throw std::runtime_error(format("missing tensor info mapping for %s", tn.str().c_str()));
            }

            // skip unused tensors
            if (info.op == GGML_OP_NONE || flags & TENSOR_SKIP) {
                const size_t nbytes = ggml_nbytes(t_meta);
                LLAMA_LOG_WARN("model has unused tensor %s (size = %zu bytes) -- ignoring\n", tn.str().c_str(), nbytes);

                // account for the skipped data so loading bookkeeping stays consistent
                ml.size_data -= nbytes;
                ml.n_created++;

                return nullptr;
            }

            // tensors with "bias" suffix are always used with GGML_OP_ADD or GGML_OP_ADD_ID
            ggml_op op;
            bool bias = tn.suffix != nullptr && strcmp(tn.suffix, "bias") == 0;
            if (bias) {
                if (info.op == GGML_OP_MUL_MAT_ID) {
                    op = GGML_OP_ADD_ID;
                } else {
                    op = GGML_OP_ADD;
                }
            } else {
                op = info.op;
            }

            // sanity checks
            if (info.layer == LLM_TENSOR_LAYER_INPUT || info.layer == LLM_TENSOR_LAYER_OUTPUT) {
                if (tn.bid != -1) {
                    GGML_ABORT("input/output layer tensor %s used with a layer number", tn.str().c_str());
                }
            } else {
                if (tn.bid == -1) {
                    GGML_ABORT("repeating layer tensor %s used without a layer number", tn.str().c_str());
                }
            }

            // select the buffer type for this tensor
            buft_list_t * buft_list;
            switch (info.layer) {
                case LLM_TENSOR_LAYER_INPUT:
                    buft_list = pimpl->dev_input.buft_list;
                    break;
                case LLM_TENSOR_LAYER_OUTPUT:
                    buft_list = pimpl->dev_output.buft_list;
                    break;
                case LLM_TENSOR_LAYER_REPEATING:
                    buft_list = pimpl->dev_layer.at(tn.bid).buft_list;
                    break;
                default:
                    GGML_ABORT("invalid layer %d for tensor %s", info.layer, tn.str().c_str());
            }

            ggml_backend_buffer_type_t buft = nullptr;

            // check overrides
            if (ml.tensor_buft_overrides) {
                std::string tensor_name = tn.str();
                for (const auto * overrides = ml.tensor_buft_overrides; overrides->pattern != nullptr; ++overrides) {
                    std::regex pattern(overrides->pattern);
                    if (std::regex_search(tensor_name, pattern)) {
                        if (overrides->buft == ggml_backend_cpu_buffer_type()) {
                            // when overriding to a CPU buffer, consider the extra buffer types
                            buft = select_weight_buft(hparams, t_meta, op, pimpl->cpu_buft_list);
                        } else {
                            buft = overrides->buft;
                        }

                        LLAMA_LOG_DEBUG("tensor %s (%zu MiB %s) buffer type overridden to %s\n",
                                tensor_name.c_str(),
                                ggml_nbytes(t_meta) / 1024 / 1024, ggml_type_name(t_meta->type),
                                ggml_backend_buft_name(buft));
                        break;
                    }
                }
            }

            // no override matched - pick a compatible buffer type from the list assigned to this layer
            if (!buft) {
                buft = select_weight_buft(hparams, t_meta, op, *buft_list);
                if (!buft) {
                    throw std::runtime_error(format("failed to find a compatible buffer type for tensor %s", tn.str().c_str()));
                }
            }

            // avoid using a host buffer when using mmap
            auto * buft_dev = ggml_backend_buft_get_device(buft);
            if (ml.use_mmap && buft_dev && buft == ggml_backend_dev_host_buffer_type(buft_dev)) {
                auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
                if (!cpu_dev) {
                    throw std::runtime_error("no CPU backend found");
                }
                buft = ggml_backend_dev_buffer_type(cpu_dev);
            }

            // keep track of tensors that did not end up in their layer's default (first) buffer type
            if (buft != buft_list->front().second) {
                n_moved_tensors++;
                if (!first_moved_tensor) {
                    first_moved_tensor = t_meta;
                    first_moved_from_buft = buft_list->front().second;
                    first_moved_to_buft   = buft;
                }
            }

            ggml_context * ctx = ctx_for_buft(buft);

            // if duplicated, check if the original tensor was allocated in the same buffer type context and avoid creating a new one
            if (flags & TENSOR_DUPLICATED) {
                ggml_tensor * t = ggml_get_tensor(ctx, tn.str().c_str());
                if (t) {
                    return t;
                }
            }
            return ml.create_tensor(ctx, tn, ne, flags);
        };
2890
2891        layers.resize(n_layer);
2892
2893        // TODO: move to a separate function
2894        const auto tn = LLM_TN(arch);
2895        switch (arch) {
            case LLM_ARCH_LLAMA:
            case LLM_ARCH_REFACT:
            case LLM_ARCH_MINICPM:
            case LLM_ARCH_GRANITE:
            case LLM_ARCH_GRANITE_MOE:
            case LLM_ARCH_MISTRAL3:
            case LLM_ARCH_LLAMA_EMBED:
                {
                    // LLaMA-style weight layout: token embedding, final norm, output head,
                    // then per-layer attention + FFN tensors (dense or MoE variant).
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);

                    // output
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);

                    // if output is NULL, init from the input tok embed (tied output head)
                    if (output == NULL) {
                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
                    }

                    for (int i = 0; i < n_layer; ++i) {
                        auto & layer = layers[i];

                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);

                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_k_gqa}, 0);
                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_v_gqa}, 0);
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);

                        // optional bias tensors
                        layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "bias", i), {n_embd},     TENSOR_NOT_REQUIRED);
                        layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
                        layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
                        layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd},     TENSOR_NOT_REQUIRED);

                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);

                        // rope factors are optional; layers > 0 pass TENSOR_DUPLICATED so
                        // create_tensor can reuse an already-created tensor of the same name
                        if (hparams.rope_scaling_type_train == LLAMA_ROPE_SCALING_TYPE_LONGROPE) {
                            layer.rope_long  = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG,  "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
                            layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
                        }
                        else {
                            layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
                        }

                        if (n_expert == 0) {
                            // dense FFN
                            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
                            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
                            layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);

                            // optional MLP bias
                            layer.ffn_gate_b = create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
                            layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
                            layer.ffn_up_b   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
                        } else {
                            // MoE FFN: per-expert weights stacked along a trailing expert dimension
                            layer.ffn_gate_inp  = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP,  "weight", i), {n_embd, n_expert}, 0);
                            layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd,   n_ff, n_expert}, TENSOR_NOT_REQUIRED);
                            layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {  n_ff, n_embd, n_expert}, 0);
                            layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {n_embd,   n_ff, n_expert}, 0);

                            // For Granite MoE Shared
                            if (hparams.n_ff_shexp > 0) {
                                layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, hparams.n_ff_shexp}, 0);
                                layer.ffn_up_shexp   = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP,   "weight", i), {n_embd, hparams.n_ff_shexp}, 0);
                                layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {hparams.n_ff_shexp, n_embd}, 0);
                            }
                        }
                    }
                } break;
            case LLM_ARCH_LLADA:
                {
                    // LLaDA (diffusion LM): LLaMA-like layout with separate Q/K/V and
                    // bias-free projections, matching LLaDALlamaBlock.
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);

                    // output
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
                    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED);

                    // if output is NULL, init from the input tok embed
                    if (output == NULL) {
                        output =
                            create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, TENSOR_DUPLICATED);
                    }

                    for (int i = 0; i < n_layer; ++i) {
                        auto & layer = layers[i];

                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0);

                        // Use separate Q, K, V projections without bias, matching LLaDALlamaBlock
                        layer.wq =
                            create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), { n_embd, n_embd_head_k * n_head }, 0);
                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), { n_embd, n_embd_k_gqa }, 0);
                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), { n_embd, n_embd_v_gqa }, 0);
                        // No bias for QKV projections as per config: include_bias=false, include_qkv_bias=false
                        layer.wo =
                            create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_k * n_head, n_embd }, 0);
                        layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), { n_embd }, TENSOR_NOT_REQUIRED);

                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), { n_embd }, 0);

                        // optional rope frequencies; layers > 0 may reuse the duplicated tensor
                        layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), { n_rot / 2 },
                                                         TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));

                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), { n_embd, n_ff }, 0);
                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd }, 0);
                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), { n_embd, n_ff }, 0);

                        // optional MLP bias
                        layer.ffn_gate_b =
                            create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), { n_ff }, TENSOR_NOT_REQUIRED);
                        layer.ffn_down_b =
                            create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), { n_embd }, TENSOR_NOT_REQUIRED);
                        layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), { n_ff }, TENSOR_NOT_REQUIRED);
                    }
                }
                break;
            case LLM_ARCH_LLADA_MOE:
                {
                    // LLaDA-MoE: MoE-only FFN (no dense fallback), with per-head Q/K norms.
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);

                    // output (not tied: the output head is required here)
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);

                    GGML_ASSERT(n_expert > 0 && "n_expert must be > 0 for llada-moe");
                    GGML_ASSERT(n_expert_used > 0 && "n_expert_used must be > 0 for llada-moe");

                    for (int i = 0; i < n_layer; ++i) {
                        auto & layer = layers[i];

                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);

                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd}, 0);
                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa}, 0);
                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa}, 0);
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
                        // per-head norms applied to Q and K (size n_embd_head_k)
                        layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
                        layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);

                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);

                        layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);

                        // expert FFN width: explicit hparam if present, otherwise derived from n_ff
                        const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used;

                        layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {  n_embd, n_ff_exp, n_expert}, 0);
                        layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp,   n_embd, n_expert}, 0);
                        layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {  n_embd, n_ff_exp, n_expert}, 0);
                    }
                } break;
            case LLM_ARCH_LLAMA4:
                {
                    // Llama 4: interleaves dense and MoE layers; a layer is MoE when
                    // (i + 1) is a multiple of n_moe_layer_step.
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);

                    // output
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);

                    // if output is NULL, init from the input tok embed
                    if (output == NULL) {
                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
                    }

                    for (int i = 0; i < n_layer; ++i) {
                        bool is_moe_layer = hparams.n_moe_layer_step > 0 && (i + 1) % hparams.n_moe_layer_step == 0;

                        auto & layer = layers[i];

                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);

                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_k_gqa}, 0);
                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_v_gqa}, 0);
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);

                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);

                        layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));

                        if (is_moe_layer) {
                            int n_ff_exp = hparams.n_ff_exp;

                            layer.ffn_gate_inp  = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP,  "weight", i), {n_embd, n_expert}, 0);
                            layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd,   n_ff_exp, n_expert}, 0);
                            layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {  n_ff_exp, n_embd, n_expert}, 0);
                            layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {n_embd,   n_ff_exp, n_expert}, 0);

                            // Shared expert (same width as the routed experts)
                            const int64_t n_ff_shexp = n_ff_exp;
                            layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {    n_embd, n_ff_shexp}, 0);
                            layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {n_ff_shexp, n_embd    }, 0);
                            layer.ffn_up_shexp   = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP,   "weight", i), {    n_embd, n_ff_shexp}, 0);
                        } else {
                            // dense FFN layer
                            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
                            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
                            layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
                        }
                    }
                } break;
            case LLM_ARCH_DECI:
                {
                    // DeciLM: heterogeneous layers — per-layer head counts and FFN sizes,
                    // including attention-free (linear) layers and FFN-free layers.
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);

                    // output
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);

                    // if output is NULL, init from the input tok embed
                    if (output == NULL) {
                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
                    }

                    for (int i = 0; i < n_layer; ++i) {
                        auto & layer = layers[i];
                        // per-layer values intentionally shadow the function-level ones,
                        // since DeciLM varies these across layers
                        const int64_t n_embd_k_gqa  = hparams.n_embd_k_gqa(i);
                        const int64_t n_embd_v_gqa  = hparams.n_embd_v_gqa(i);
                        const int64_t n_embd_gqa    = hparams.n_embd_v_gqa(i);
                        const int64_t n_ff          = hparams.n_ff(i);
                        const int64_t n_head        = hparams.n_head(i);
                        const int64_t n_head_kv     = hparams.n_head_kv(i);

                        if (n_head_kv == 0 && n_head > 0) {
                            // linear attention for DeciLMCausalModel
                            layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
                            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
                        }
                        else if (n_head_kv > 0) {
                            // regular (grouped-query) attention
                            layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);

                            layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
                            layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_k_gqa}, 0);
                            layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_v_gqa}, 0);
                            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
                        }

                        // optional bias tensors
                        layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "bias", i), {n_embd},     TENSOR_NOT_REQUIRED);
                        layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
                        layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
                        layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd},     TENSOR_NOT_REQUIRED);

                        if (n_ff > 0) {
                            layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
                        }

                        if (hparams.rope_scaling_type_train == LLAMA_ROPE_SCALING_TYPE_LONGROPE) {
                            layer.rope_long  = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG,  "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
                            layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
                        }
                        else {
                            layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
                        }

                        // n_ff == 0 marks an FFN-free layer
                        if (n_ff > 0) {
                            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
                            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
                            layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
                        }

                        // optional MLP bias
                        layer.ffn_gate_b = create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
                        layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
                        layer.ffn_up_b   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
                    }
                } break;
            case LLM_ARCH_MINICPM3:
                {
                    // MiniCPM3: MLA-style attention with low-rank Q and KV projections
                    // (q_lora_rank / kv_lora_rank) and split rope/nope head dimensions.
                    const int64_t n_embd_head_qk_rope = hparams.n_rot;
                    const int64_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot;

                    const int64_t q_lora_rank  = hparams.n_lora_q;
                    const int64_t kv_lora_rank = hparams.n_lora_kv;
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);

                    // output
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);

                    // if output is NULL, init from the input tok embed
                    if (output == NULL) {
                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
                    }

                    for (int i = 0; i < n_layer; ++i) {
                        auto & layer = layers[i];

                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
                        layer.attn_q_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_A_NORM, "weight", i), {q_lora_rank}, 0);

                        layer.attn_kv_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_NORM, "weight", i), {kv_lora_rank}, 0);

                        // Q: down-projection (a) followed by up-projection (b)
                        layer.wq_a = create_tensor(tn(LLM_TENSOR_ATTN_Q_A, "weight", i), {n_embd, q_lora_rank}, 0);
                        layer.wq_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_B, "weight", i), {q_lora_rank, n_head * n_embd_head_k}, 0);

                        // KV: compressed projection carries the rope part alongside the latent
                        layer.wkv_a_mqa = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_MQA, "weight", i), {n_embd, kv_lora_rank + (n_embd_head_qk_rope)}, 0);
                        layer.wkv_b     = create_tensor(tn(LLM_TENSOR_ATTN_KV_B,     "weight", i), {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)}, 0);
                        layer.wo        = create_tensor(tn(LLM_TENSOR_ATTN_OUT,      "weight", i), {              n_head * (                      n_embd_head_v), n_embd}, 0);

                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);

                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);

                        // optional long/short rope factors sized by the rope head dimension
                        layer.rope_long  = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG,  "weight", i), { n_embd_head_qk_rope/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
                        layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), { n_embd_head_qk_rope/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
                    }
                } break;
            case LLM_ARCH_GROK:
                {
                    // Grok: MoE-only architecture; rejects files that declare no experts.
                    if (n_expert == 0) {
                        throw std::runtime_error("Grok model cannot have zero experts");
                    }

                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);

                    // output
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);

                    // if output is NULL, init from the input tok embed
                    if (output == NULL) {
                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
                    }

                    const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff/* / n_expert_used*/; // grok-1 n_ff_exp == n_ff
                    for (int i = 0; i < n_layer; ++i) {
                        auto & layer = layers[i];

                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);

                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd}, 0);
                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa}, 0);
                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa}, 0);
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);

                        layer.attn_out_norm   = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}, 0);

                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);

                        // optional dense FFN alongside the experts
                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, TENSOR_NOT_REQUIRED);
                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff,   n_embd}, TENSOR_NOT_REQUIRED);
                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff}, TENSOR_NOT_REQUIRED);

                        layer.ffn_gate_inp  = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP,  "weight", i), {n_embd, n_expert}, 0);
                        layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd,   n_ff_exp, n_expert}, TENSOR_NOT_REQUIRED);
                        layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd,   n_expert}, 0);
                        layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {n_embd,   n_ff_exp, n_expert}, 0);

                        // post-FFN norm may be stored under either of two tensor names;
                        // try LAYER_OUT_NORM first, then require FFN_POST_NORM
                        layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
                        if (!layer.ffn_post_norm) {
                            layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
                        }
                    }
                } break;
            case LLM_ARCH_DBRX:
                {
                    // DBRX: fused QKV attention + MoE FFN; experts are mandatory.
                    if (n_expert == 0) {
                        throw std::runtime_error("DBRX model cannot have zero experts");
                    }

                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);

                    // output (untied: output head is required)
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);

                    for (int i = 0; i < n_layer; ++i) {
                        auto & layer = layers[i];

                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);

                        // fused Q+K+V projection: output is n_embd (Q) + 2*n_embd_gqa (K and V)
                        layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
                        layer.wo   = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);

                        layer.attn_out_norm = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}, 0);

                        layer.ffn_gate_inp  = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP,  "weight", i), {n_embd, n_expert}, 0);
                        layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff,   n_expert}, 0);
                        layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff,   n_embd, n_expert}, 0);
                        layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {n_embd, n_ff,   n_expert}, 0);
                    }
                } break;
            case LLM_ARCH_BAICHUAN:
                {
                    // Baichuan: LLaMA-like layout with a required (untied) output head
                    // and no bias tensors.
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
                    {
                        output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
                        output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);
                    }

                    for (int i = 0; i < n_layer; ++i) {
                        auto & layer = layers[i];

                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);

                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd}, 0);
                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa}, 0);
                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa}, 0);
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);

                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);

                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
                    }
                } break;
            case LLM_ARCH_FALCON:
                {
                    // Falcon: LayerNorm with bias, fused QKV, no FFN gate (non-gated MLP).
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);

                    // output
                    {
                        output_norm   = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
                        output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd}, 0);

                        output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
                        if (!output) {
                            output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); // needs to be on GPU
                        }
                    }

                    for (int i = 0; i < n_layer; ++i) {
                        auto & layer = layers[i];

                        layer.attn_norm   = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
                        layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i),   {n_embd}, 0);

                        // second norm pair is optional (present in some Falcon variants)
                        layer.attn_norm_2   = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
                        layer.attn_norm_2_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "bias", i),   {n_embd}, TENSOR_NOT_REQUIRED);

                        // fused Q+K+V projection: output is n_embd (Q) + 2*n_embd_gqa (K and V)
                        layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
                        layer.wo   = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);

                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
                    }
                } break;
            case LLM_ARCH_STARCODER:
                {
                    // StarCoder: learned absolute position embeddings, fused QKV, and
                    // bias tensors on all projections and norms.
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
                    pos_embd = create_tensor(tn(LLM_TENSOR_POS_EMBD,   "weight"), {n_embd, n_ctx_train}, 0);

                    // output
                    {
                        output_norm   = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
                        output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd}, 0);
                        output        = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
                        if (!output) {
                            // needs to be on GPU
                            output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
                        }

                    }

                    for (int i = 0; i < n_layer; ++i) {
                        auto & layer = layers[i];

                        layer.attn_norm   = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
                        layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i),   {n_embd}, 0);

                        // fused Q+K+V projection with bias
                        layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
                        layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i),   {n_embd + 2*n_embd_gqa}, 0);

                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
                        layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i),   {n_embd}, 0);

                        layer.ffn_norm   = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
                        layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i),   {n_embd}, 0);

                        layer.ffn_down   = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
                        layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i),   {n_embd}, 0);

                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i),   {n_embd, n_ff}, 0);
                        layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i),     {n_ff}, 0);
                    }
                } break;
3374            case LLM_ARCH_BERT:
3375            case LLM_ARCH_NOMIC_BERT:
3376            case LLM_ARCH_NOMIC_BERT_MOE:
3377            case LLM_ARCH_JINA_BERT_V3:
3378                {
                    // BERT-family encoders. Token-type embeddings are optional
                    // (not all checkpoints ship them).
3379                    tok_embd     = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD,  "weight"), {n_embd, n_vocab}, 0);
3380                    type_embd    = create_tensor(tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_token_types}, TENSOR_NOT_REQUIRED);
3381
3382                    if (arch == LLM_ARCH_BERT) {
                        // classic BERT only: learned absolute positions plus optional
                        // pooler (CLS) and classification-head tensors
3383                        pos_embd = create_tensor(tn(LLM_TENSOR_POS_EMBD,    "weight"), {n_embd, n_ctx_train}, 0);
3384
3385                        cls   = create_tensor(tn(LLM_TENSOR_CLS, "weight"), {n_embd, n_embd}, TENSOR_NOT_REQUIRED);
3386                        cls_b = create_tensor(tn(LLM_TENSOR_CLS, "bias"),   {n_embd},         TENSOR_NOT_REQUIRED);
3387
3388                        cls_out   = create_tensor(tn(LLM_TENSOR_CLS_OUT, "weight"), {n_embd, hparams.n_cls_out}, TENSOR_NOT_REQUIRED);
3389                        cls_out_b = create_tensor(tn(LLM_TENSOR_CLS_OUT, "bias"),   {hparams.n_cls_out},         TENSOR_NOT_REQUIRED);
3390                    }
3391
                    // embedding LayerNorm applied right after the embedding lookup
3392                    tok_norm   = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, 0);
3393                    tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"),   {n_embd}, 0);
3394
3395                    for (int i = 0; i < n_layer; ++i) {
3396                        auto & layer = layers[i];
3397
                        // attention can be stored either as a fused QKV tensor or as
                        // split Q/K/V tensors, depending on the checkpoint; try fused first
3398                        layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
3399                        layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
3400
3401                        if (!layer.wqkv) {
                            // split layout: Q/K/V weights and biases are all required
3402                            layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd}, 0);
3403                            layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "bias", i),   {n_embd}, 0);
3404
3405                            layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa}, 0);
3406                            layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "bias", i),   {n_embd_gqa}, 0);
3407
3408                            layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa}, 0);
3409                            layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "bias", i),   {n_embd_gqa}, 0);
3410                        }
3411
3412                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT,      "weight", i), {n_embd, n_embd}, 0);
3413                        layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT,      "bias", i),   {n_embd}, TENSOR_NOT_REQUIRED);
3414
                        // post-attention LayerNorm (encoder-style, applied after the residual)
3415                        layer.attn_out_norm   = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}, 0);
3416                        layer.attn_out_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "bias", i),   {n_embd}, 0);
3417
                        // MoE FFN on layers where i % moe_every_n_layers == 1
                        // (e.g. every odd layer when the stride is 2); dense FFN otherwise
3418                        if (hparams.moe_every_n_layers > 0 && i % hparams.moe_every_n_layers == 1) {
3419                            layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {  n_embd, n_ff,   n_expert}, 0);
3420                            layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {  n_ff,   n_embd, n_expert}, 0);
3421                            layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP,   "weight", i), {n_embd, n_expert}, 0);
3422                        } else {
3423                            layer.ffn_up     = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff}, 0);
3424                            layer.ffn_up_b   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "bias", i),   {n_ff}, TENSOR_NOT_REQUIRED);
3425                            layer.ffn_down   = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
3426                            layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i),   {n_embd}, TENSOR_NOT_REQUIRED);
3427
                            // NOMIC_BERT uses a gated FFN, so it carries an extra gate tensor
3428                            if (arch == LLM_ARCH_NOMIC_BERT) {
3429                                layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
3430                            }
3431                        }
3432
                        // LayerNorm closing the FFN sub-block
3433                        layer.layer_out_norm   = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd}, 0);
3434                        layer.layer_out_norm_b = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "bias", i),   {n_embd}, 0);
3435                    }
3436                } break;
3437            case LLM_ARCH_MODERN_BERT:
3438                {
                    // ModernBERT: bias-free norms, fused QKV, double-width up projection
                    // (presumably packed gate+up for a GLU-style FFN — confirm in the graph build).
3439                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
3440                    tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, 0);
3441
3442                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
3443
3444                    for(int i = 0; i < n_layer; ++i) {
3445                        auto& layer = layers[i];
3446
3447                        if ( i != 0 ) {
3448                            layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
3449                        } else{
3450                            // layer 0 uses identity
                            // (so its attn_norm tensor may legitimately be absent)
3451                            layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
3452                        }
3453
3454
                        // fused QKV with equal-width Q, K and V (no GQA here: 3 * n_embd)
3455                        layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, 3 * n_embd }, 0);
3456                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT,   "weight", i), {n_embd, n_embd}, 0);
3457
3458                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, 2 * n_ff}, 0);
3459                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
3460                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
3461                    }
3462
                    // optional pooling / classification heads
3463                    cls       = create_tensor(tn(LLM_TENSOR_CLS,     "weight"), {n_embd, n_embd}, TENSOR_NOT_REQUIRED);
3464                    cls_out   = create_tensor(tn(LLM_TENSOR_CLS_OUT, "weight"), {n_embd, hparams.n_cls_out}, TENSOR_NOT_REQUIRED);
3465                    cls_out_b = create_tensor(tn(LLM_TENSOR_CLS_OUT, "bias"),   {hparams.n_cls_out},         TENSOR_NOT_REQUIRED);
3466
3467                } break;
3468            case LLM_ARCH_NEO_BERT:
3469                {
                    // NeoBERT: bias-free norms, fused QKV, and a double-width up
                    // projection (n_ff*2 — presumably packed gate+up for a gated FFN;
                    // confirm against the graph build).
3470                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD,  "weight"), {n_embd, n_vocab}, 0);
3471
                    // optional pooler and classification-head tensors
3472                    cls   = create_tensor(tn(LLM_TENSOR_CLS, "weight"), {n_embd, n_embd}, TENSOR_NOT_REQUIRED);
3473                    cls_b = create_tensor(tn(LLM_TENSOR_CLS, "bias"),   {n_embd},         TENSOR_NOT_REQUIRED);
3474
3475                    cls_out   = create_tensor(tn(LLM_TENSOR_CLS_OUT, "weight"), {n_embd, hparams.n_cls_out}, TENSOR_NOT_REQUIRED);
3476                    cls_out_b = create_tensor(tn(LLM_TENSOR_CLS_OUT, "bias"),   {hparams.n_cls_out},         TENSOR_NOT_REQUIRED);
3477
3478                    output_norm_enc = create_tensor(tn(LLM_TENSOR_ENC_OUTPUT_NORM, "weight"), {n_embd}, 0);
3479
3480                    for (int i = 0; i < n_layer; ++i) {
3481                        auto & layer = layers[i];
3482
3483                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
3484
                        // fused QKV: n_embd columns for Q plus n_embd_gqa each for K and V
3485                        layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
3486                        layer.wo   = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
3487
3488                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
3489
3490                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff*2}, 0);
3491                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
3492                    }
3493                } break;
3494            case LLM_ARCH_JINA_BERT_V2:
3495                {
                    // Jina BERT v2: split Q/K/V with biases, optional per-head Q/K norms,
                    // and an FFN whose up projection may be stored fused (gate+up) — see
                    // the metadata probe below.
3496                    tok_embd  = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD,  "weight"), {n_embd, n_vocab}, 0); // word_embeddings
3497                    type_embd = create_tensor(tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_token_types}, 0); // token_type_embeddings
3498
3499                    tok_norm   = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, 0); // LayerNorm
3500                    tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"),   {n_embd}, 0); //LayerNorm bias
3501
                    // optional single-output classification head
3502                    cls   = create_tensor(tn(LLM_TENSOR_CLS, "weight"), {n_embd, 1}, TENSOR_NOT_REQUIRED);
3503                    cls_b = create_tensor(tn(LLM_TENSOR_CLS, "bias"),   {1},         TENSOR_NOT_REQUIRED);
3504                    for (int i = 0; i < n_layer; ++i) {
3505                        auto & layer = layers[i]; // JinaBertLayer
3506
3507                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
3508                        layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i),   {n_embd}, 0);
3509
3510                        layer.attn_q_norm   = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
3511                        layer.attn_q_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "bias",   i), {n_embd}, TENSOR_NOT_REQUIRED);
3512
3513                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
3514                        layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias",   i), {n_embd_gqa}, 0);
3515
3516                        layer.attn_k_norm   = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
3517                        layer.attn_k_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "bias",   i), {n_embd}, TENSOR_NOT_REQUIRED);
3518
3519                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
3520                        layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias",   i), {n_embd_gqa}, 0);
3521
3522                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0); //output_dens
3523                        layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias",   i), {n_embd}, 0); //output_dens
3524
3525                        layer.attn_out_norm   = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}, 0); //output_norm
3526                        layer.attn_out_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "bias",   i), {n_embd}, 0);
3527
3528                        layer.attn_norm_2   = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
3529                        layer.attn_norm_2_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "bias",   i), {n_embd}, TENSOR_NOT_REQUIRED);
3530
3531                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, TENSOR_NOT_REQUIRED);
3532
                        // probe the tensor metadata to see whether ffn_up is plain (n_ff)
                        // or fused gate+up (n_ff*2); fall back to n_ff when missing so the
                        // required create_tensor below reports the error
3533                        const auto tn_ffn_up_weight = tn(LLM_TENSOR_FFN_UP, "weight", i);
3534                        ggml_tensor * t_ffn_up = ml.get_tensor_meta(tn_ffn_up_weight.str().c_str());
3535                        const int64_t n_ffn_up = t_ffn_up ? t_ffn_up->ne[1] : n_ff;
3536
3537                        GGML_ASSERT(n_ffn_up == n_ff || n_ffn_up == n_ff * 2);
3538                        layer.ffn_up   = create_tensor(tn_ffn_up_weight, {n_embd, n_ffn_up}, 0);
3539                        layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ffn_up}, TENSOR_NOT_REQUIRED);
3540
3541                        layer.ffn_down   = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
3542                        layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias",   i), {n_embd}, 0);
3543
3544                        layer.layer_out_norm   = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd}, 0);
3545                        layer.layer_out_norm_b = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "bias",   i), {n_embd}, 0);
3546                    }
3547                } break;
3548            case LLM_ARCH_BLOOM:
3549                {
                    // BLOOM: embedding LayerNorm after the token lookup, LayerNorm with
                    // bias everywhere, fused QKV; all per-layer tensors are required.
3550                    tok_embd   = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD,      "weight"), {n_embd, n_vocab}, 0);
3551                    tok_norm   = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, 0);
3552                    tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"),   {n_embd}, 0);
3553
3554                    // output
3555                    output_norm   = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
3556                    output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd}, 0);
3557                    output        = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
3558
3559                    // if output is NULL, init from the input tok embed
3560                    if (output == NULL) {
3561                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
3562                    }
3563
3564                    for (int i = 0; i < n_layer; ++i) {
3565                        auto & layer = layers[i];
3566
3567                        layer.attn_norm   = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
3568                        layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias",   i), {n_embd}, 0);
3569
                        // fused QKV: n_embd columns for Q plus n_embd_gqa each for K and V
3570                        layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
3571                        layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias",   i), {n_embd + 2*n_embd_gqa}, 0);
3572
3573                        layer.wo   = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
3574                        layer.bo   = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias",   i), {n_embd}, 0);
3575
3576                        layer.ffn_norm   = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
3577                        layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias",   i), {n_embd}, 0);
3578
3579                        layer.ffn_down   = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
3580                        layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias",   i), {n_embd}, 0);
3581
3582                        layer.ffn_up     = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
3583                        layer.ffn_up_b   = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias",   i), {n_ff}, 0);
3584                    }
3585                } break;
3586            case LLM_ARCH_MPT:
3587                {
                    // MPT: most biases and norms are optional because different MPT
                    // variants ship with or without them (no_bias config, alibi vs.
                    // learned positions, optional Q/K norms, AWQ activation scales).
3588                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
                    // position table only present for checkpoints with learned positions
3589                    pos_embd = create_tensor(tn(LLM_TENSOR_POS_EMBD,   "weight"), {n_embd, n_ctx_train}, TENSOR_NOT_REQUIRED);
3590
3591                    // output
3592                    output_norm   = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
3593                    output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd}, TENSOR_NOT_REQUIRED);
3594
3595                    output        = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
3596                    if (!output) {
3597                        output    = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); // needs to be on GPU
3598                    }
3599
3600                    for (int i = 0; i < n_layer; ++i) {
3601                        auto & layer = layers[i];
3602
3603                        layer.attn_norm   = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
3604                        layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i),   {n_embd}, TENSOR_NOT_REQUIRED);
3605
                        // fused QKV: n_embd columns for Q plus n_embd_gqa each for K and V
3606                        layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
3607                        layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i),   {n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
3608
3609                        layer.wo   = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
3610                        layer.bo   = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i),   {n_embd}, TENSOR_NOT_REQUIRED);
3611
3612                        layer.ffn_norm   = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
3613                        layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i),   {n_embd}, TENSOR_NOT_REQUIRED);
3614
3615                        layer.ffn_down   = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
3616                        layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i),   {n_embd}, TENSOR_NOT_REQUIRED);
3617
3618                        layer.ffn_up     = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
3619                        layer.ffn_up_b   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "bias", i),   {n_ff}, TENSOR_NOT_REQUIRED);
3620
                        // optional Q/K LayerNorms (present only in some MPT variants)
3621                        layer.attn_q_norm   = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
3622                        layer.attn_q_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "bias",   i), {n_embd}, TENSOR_NOT_REQUIRED);
3623
3624                        layer.attn_k_norm   = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
3625                        layer.attn_k_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "bias",   i), {n_embd}, TENSOR_NOT_REQUIRED);
3626
3627                        // AWQ ScaleActivation layer
3628                        layer.ffn_act = create_tensor(tn(LLM_TENSOR_FFN_ACT, "scales", i), {n_ff}, TENSOR_NOT_REQUIRED);
3629                    }
3630                } break;
3631            case LLM_ARCH_STABLELM:
3632                {
                    // StableLM: split Q/K/V, LayerNorm with bias on attention and output;
                    // several tensors are optional to cover the 1.6B / 12B variants.
3633                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
3634
3635                    // output
3636                    output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd}, 0);
3637                    output_norm   = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
3638                    output        = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);
3639
3640                    for (int i = 0; i < n_layer; ++i) {
3641                        auto & layer = layers[i];
3642
3643                        layer.attn_norm =   create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
3644                        layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
3645
3646                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd}, 0);
3647                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa}, 0);
3648                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa}, 0);
3649                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
3650
3651                        // optional bias tensors, present in Stable LM 2 1.6B
3652                        layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "bias", i), {n_embd},     TENSOR_NOT_REQUIRED);
3653                        layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
3654                        layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
3655
3656                        // optional q and k layernorms, present in StableLM 2 12B
                        // (2D: one norm vector per attention head)
3657                        layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k, n_head},    TENSOR_NOT_REQUIRED);
3658                        layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k, n_head_kv}, TENSOR_NOT_REQUIRED);
3659
3660                        // optional FFN norm, not present in StableLM 2 12B which uses parallel residual
3661                        layer.ffn_norm   = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
3662                        layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i),   {n_embd}, TENSOR_NOT_REQUIRED);
3663
                        // gated FFN (SwiGLU-style layout: gate/up project to n_ff, down back to n_embd)
3664                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
3665                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
3666                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
3667                    }
3668                } break;
3669            case LLM_ARCH_QWEN:
3670                {
                    // Qwen (v1): fused QKV with bias, gated FFN.
3671                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
3672
3673                    // output
3674                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
3675                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);
3676
3677                    for (int i = 0; i < n_layer; ++i) {
3678                        auto & layer = layers[i];
3679
3680                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
3681
                        // fused QKV with equal-width Q, K and V (3 * n_embd)
3682                        layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd*3}, 0);
3683                        layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i),   {n_embd*3}, 0);
3684                        layer.wo   = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
3685
3686                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
3687
                        // FFN tensors use n_ff/2 — presumably this arch's GGUF stores n_ff
                        // as twice the actual intermediate size; TODO confirm against the
                        // Qwen conversion script
3688                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff/2}, 0);
3689                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff/2, n_embd}, 0);
3690                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff/2}, 0);
3691                    }
3692                } break;
3693            case LLM_ARCH_QWEN2:
3694            case LLM_ARCH_QWEN2VL:
3695            case LLM_ARCH_DREAM:
3696                {
                    // Qwen2 family: split Q/K/V with optional biases, gated FFN,
                    // optional output head (falls back to tied token embeddings).
3697                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
3698
3699                    // output
3700                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
3701                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
3702                    output_b    = create_tensor(tn(LLM_TENSOR_OUTPUT,      "bias"),   {n_vocab}, TENSOR_NOT_REQUIRED);
3703                    // if output is NULL, init from the input tok embed
3704                    if (output == NULL) {
3705                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
3706                    }
3707
3708                    for (int i = 0; i < n_layer; ++i) {
3709                        auto & layer = layers[i];
3710
3711                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
3712
3713                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd}, 0);
3714                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa}, 0);
3715                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa}, 0);
3716                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
3717
3718                        // optional bias tensors
3719                        layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
3720                        layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
3721                        layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
3722
3723                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
3724
3725                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
3726                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
3727                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
3728                    }
3729                } break;
3730            case LLM_ARCH_QWEN2MOE:
3731                {
                    // Qwen2-MoE: routed experts (3D stacked tensors indexed by expert)
                    // plus one always-on shared expert gated per token.
3732                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
3733
3734                    // output
3735                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
3736                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);
3737
3738                    for (int i = 0; i < n_layer; ++i) {
3739                        auto & layer = layers[i];
3740
3741                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
3742
3743                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd}, 0);
3744                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa}, 0);
3745                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa}, 0);
3746                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
3747
3748                        // optional bias tensors
3749                        layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
3750                        layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
3751                        layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
3752
3753                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
3754
                        // expert router: projects hidden state to one logit per expert
3755                        layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
3756
                        // guard before dividing by n_expert_used below
3757                        if (n_expert == 0) {
3758                            throw std::runtime_error("n_expert must be > 0 for QWEN2MOE");
3759                        }
3760                        if (n_expert_used == 0) {
3761                            throw std::runtime_error("n_expert_used must be > 0 for QWEN2MOE");
3762                        }
3763
3764                        // MoE branch
                        // per-expert FFN width: explicit hparam, or n_ff split across the active experts
3765                        const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used;
3766
3767                        layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {  n_embd, n_ff_exp, n_expert}, 0);
3768                        layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp,   n_embd, n_expert}, 0);
3769                        layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {  n_embd, n_ff_exp, n_expert}, 0);
3770
3771                        // Shared expert branch
3772                        const int64_t n_ff_shexp = hparams.n_ff_shexp ? hparams.n_ff_shexp : n_ff;
3773
                        // shared-expert gate is a single vector (scalar gate per token)
3774                        layer.ffn_gate_inp_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP_SHEXP, "weight", i), {n_embd}, 0);
3775                        layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {    n_embd, n_ff_shexp}, 0);
3776                        layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {n_ff_shexp,     n_embd}, 0);
3777                        layer.ffn_up_shexp   = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP,   "weight", i), {    n_embd, n_ff_shexp}, 0);
3778                    }
3779                } break;
3780            case LLM_ARCH_QWEN3:
3781            case LLM_ARCH_QWEN3VL:
3782                {
                    // Qwen3 family: bias-free split Q/K/V with per-head-dim Q/K norms,
                    // gated FFN, optional output head and optional rerank head.
3783                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
3784
3785                    // output
3786                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
3787                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
3788                    // if output is NULL, init from the input tok embed
3789                    if (output == NULL) {
3790                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
3791                    }
3792
3793                    // output rerank head
3794                    cls_out = create_tensor(tn(LLM_TENSOR_CLS_OUT, "weight"), {n_embd, hparams.n_cls_out}, TENSOR_NOT_REQUIRED);
3795
3796                    for (int i = 0; i < n_layer; ++i) {
3797                        auto & layer = layers[i];
3798
3799                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
3800
                        // Q/output use n_embd_head_k * n_head, which may differ from n_embd
3801                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
3802                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa}, 0);
3803                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa}, 0);
3804                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
3805
                        // Q/K norms sized to a single head dimension (shared across heads)
3806                        layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
3807                        layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
3808
3809                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
3810                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
3811                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
3812                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
3813                    }
3814                } break;
3815            case LLM_ARCH_QWEN3MOE:
3816            case LLM_ARCH_QWEN3VLMOE:
3817            case LLM_ARCH_RND1:
3818                {
                        // MoE Qwen3 variants: same attention layout as dense Qwen3
                        // (per-head Q/K norms), but the FFN is replaced by a router
                        // (ffn_gate_inp) plus n_expert stacked expert FFNs.
3819                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
3820
3821                    // output
3822                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
3823                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
3824                    // if output is NULL, init from the input tok embed
3825                    if (output == NULL) {
3826                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
3827                    }
3828
3829                    for (int i = 0; i < n_layer; ++i) {
3830                        auto & layer = layers[i];
3831
3832                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
3833
3834                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
3835                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa}, 0);
3836                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa}, 0);
3837                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
3838
3839                        layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
3840                        layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
3841
3842                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
3843
                            // expert router: projects hidden state to one logit per expert
3844                        layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
3845
                            // NOTE(review): these checks are loop-invariant and also guard the
                            // QWEN3VLMOE/RND1 arches, although the messages name only QWEN3MOE
3846                        if (n_expert == 0) {
3847                            throw std::runtime_error("n_expert must be > 0 for QWEN3MOE");
3848                        }
3849                        if (n_expert_used == 0) {
3850                            throw std::runtime_error("n_expert_used must be > 0 for QWEN3MOE");
3851                        }
3852
3853                        // MoE branch
                            // per-expert FFN width: explicit n_ff_exp if set, otherwise derived
                            // by splitting n_ff evenly across the active experts
3854                        const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used;
3855
                            // 3D tensors: all experts' weights stacked along the last dimension
3856                        layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {  n_embd, n_ff_exp, n_expert}, 0);
3857                        layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp,   n_embd, n_expert}, 0);
3858                        layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {  n_embd, n_ff_exp, n_expert}, 0);
3859                    }
3860                } break;
3861            case LLM_ARCH_PHI2:
3862                {
                        // Phi-2: LayerNorm-style arch — norms, attention and FFN all carry
                        // bias tensors, and the LM head has a required bias as well.
3863                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
3864
3865                    // output
3866                    output_norm   = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
3867                    output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd}, 0);
3868                    output        = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);
3869                    output_b      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "bias"),   {n_vocab}, 0);
3870
3871                    for (int i = 0; i < n_layer; ++i) {
3872                        auto & layer = layers[i];
3873
3874                        layer.attn_norm   = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
3875                        layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i),   {n_embd}, 0);
3876
                            // prefer a fused QKV projection; optional because some conversions
                            // ship split Q/K/V tensors instead (handled below)
3877                        layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
3878                        layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i),   {n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
3879
                            // fallback: split projections are mandatory when the fused tensor is absent
3880                        if (layer.wqkv == nullptr) {
3881                            layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
3882                            layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i),   {n_embd}, 0);
3883
3884                            layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
3885                            layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i),   {n_embd_gqa}, 0);
3886
3887                            layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
3888                            layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i),   {n_embd_gqa}, 0);
3889                        }
3890
3891                        layer.wo   = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
3892                        layer.bo   = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i),   {n_embd}, 0);
3893
                            // two-matrix FFN (no ffn_gate tensor for this arch)
3894                        layer.ffn_down   = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
3895                        layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i),   {n_embd}, 0);
3896
3897                        layer.ffn_up     = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff}, 0);
3898                        layer.ffn_up_b   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "bias", i),   {n_ff}, 0);
3899                    }
3900                } break;
3901            case LLM_ARCH_PHI3:
3902                {
                        // Phi-3: fused QKV (optional), and a fused gate+up FFN projection
                        // (ffn_up has width 2*n_ff — split into gate/up at graph build time).
3903                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);
3904
3905                    // output
3906                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
3907                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
3908
3909                    // if output is NULL, init from the input tok embed
3910                    if (output == NULL) {
3911                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
3912                    }
3913
3914                    for (int i = 0; i < n_layer; ++i) {
3915                        auto & layer = layers[i];
3916
3917                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0);
3918
3919                        layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), { n_embd, n_embd + 2 * n_embd_gqa }, TENSOR_NOT_REQUIRED);
3920                        layer.wo   = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd, n_embd }, 0);
3921
3922                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), { n_embd }, 0);
3923
3924                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd }, 0);
3925                        layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), { n_embd, 2 * n_ff }, 0);
3926
                            // LongRoPE scaling factors: optional; the single set of factors is
                            // stored once and marked duplicated for every layer after the first
3927                        layer.rope_long  = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG,  "weight", i), { n_rot/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
3928                        layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), { n_rot/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
3929                    }
3930                } break;
3931            case LLM_ARCH_PHIMOE:
3932                {
                        // Phi-MoE: biased LayerNorms like Phi-2, fused-or-split QKV like Phi-3,
                        // MoE FFN (router + stacked experts), and per-layer LongRoPE factors.
3933                    const int64_t n_embd_head = n_embd / n_head;
3934
3935                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);
3936
3937                    // output
3938                    output_norm   = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
3939                    output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   { n_embd }, 0);
3940                    output        = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), { n_embd, n_vocab }, 0);
3941                    output_b      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "bias"),   { n_vocab }, 0);
3942
3943                    for (int i = 0; i < n_layer; ++i) {
3944                        auto & layer = layers[i];
3945
3946                        layer.attn_norm   = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0);
3947                        layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias",   i), { n_embd }, 0);
3948
                            // prefer a fused QKV projection; fall back to required split Q/K/V
                            // (with biases) when the fused tensor is absent from the file
3949                        layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), { n_embd, n_embd + 2 * n_embd_gqa }, TENSOR_NOT_REQUIRED);
3950                        if (layer.wqkv == nullptr) {
3951                            layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
3952                            layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias",   i), {n_embd}, 0);
3953
3954                            layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
3955                            layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias",   i), {n_embd_gqa}, 0);
3956
3957                            layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
3958                            layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias",   i), {n_embd_gqa}, 0);
3959                        }
3960                        layer.wo   = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd, n_embd }, 0);
3961                        layer.bo   = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias",   i), { n_embd }, 0);
3962
3963                        layer.ffn_norm   = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), { n_embd }, 0);
3964                        layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias",   i), { n_embd }, 0);
3965
                            // MoE FFN: router logits ({n_embd, n_expert}) plus all experts'
                            // gate/down/up weights stacked along the last dimension
3966                        layer.ffn_gate_inp  = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP,  "weight", i), {n_embd, n_expert},         0);
3967                        layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff,   n_expert}, 0);
3968                        layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff,   n_embd, n_expert}, 0);
3969                        layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {n_embd, n_ff,   n_expert}, 0);
3970
                            // LongRoPE factors: optional, stored once and marked duplicated for layers > 0
3971                        layer.rope_long  = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG,  "weight", i), { n_embd_head/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
3972                        layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), { n_embd_head/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
3973                    }
3974                } break;
3975            case LLM_ARCH_PLAMO:
3976                {
                        // PLaMo: minimal LLaMA-like layout — no biases, no FFN norm tensor,
                        // and a required (untied) LM head.
3977                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
3978
3979                    // output
3980                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
3981                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);
3982
3983                    for (int i = 0; i < n_layer; ++i) {
3984                        auto & layer = layers[i];
3985
3986                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
3987
3988                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd}, 0);
3989                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa}, 0);
3990                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa}, 0);
3991                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
3992
3993                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
3994                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
3995                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
3996                    }
3997                } break;
3998            case LLM_ARCH_PLAMO2:
3999                {
                        // PLaMo-2: hybrid model — each layer is either a Mamba-style SSM block
                        // or an attention block, chosen per layer via hparams.is_recurrent(i).
4000                    // mamba parameters
4001                    const uint32_t d_conv             = hparams.ssm_d_conv;
4002                    const uint32_t d_state            = hparams.ssm_d_state;
4003                    const uint32_t num_heads          = hparams.ssm_dt_rank;
4004                    const uint32_t intermediate_size  = hparams.ssm_d_inner;
                        // dt projection width: n_embd/16, clamped to at least 64
4005                    const int64_t dt_dim              = std::max(64, int(hparams.n_embd / 16));
4006
4007                    // attention parameters
4008                    const uint32_t qk_dim = hparams.n_embd_head_k;
4009                    const uint32_t v_dim  = hparams.n_embd_head_v;
4010
4011                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
4012
4013                    // output
4014                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
4015                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
4016                    // if output is NULL, init from the input tok embed
4017                    if (output == NULL) {
4018                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
4019                    }
4020
4021                    for (int i = 0; i < n_layer; ++i) {
4022                        auto & layer = layers[i];
4023                        bool is_mamba_layer = hparams.is_recurrent(i);
4024
4025                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
4026
4027                        if (is_mamba_layer) {
                                // SSM branch: in-projection (2x for the gate path), depthwise conv,
                                // then x -> (dt, B, C) projection and dt/A/D parameters
4028                            layer.ssm_in       = create_tensor(tn(LLM_TENSOR_SSM_IN,     "weight", i), {n_embd, 2 * intermediate_size}, 0);
4029                            layer.ssm_conv1d   = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), {d_conv, intermediate_size}, 0);
4030
4031                            layer.ssm_x    = create_tensor(tn(LLM_TENSOR_SSM_X,  "weight", i), {intermediate_size, dt_dim + 2*d_state}, 0);
4032                            layer.ssm_dt   = create_tensor(tn(LLM_TENSOR_SSM_DT, "weight", i), {dt_dim, num_heads}, 0);
4033                            layer.ssm_dt_b = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), {num_heads}, 0);
4034
4035                            layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), {num_heads}, 0);
4036                            layer.ssm_d = create_tensor(tn(LLM_TENSOR_SSM_D, i), {num_heads}, 0);
4037
4038                            layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), {intermediate_size, n_embd}, 0);
4039
                                // RMS norms applied to the dt, B and C streams
4040                            layer.ssm_dt_norm = create_tensor(tn(LLM_TENSOR_SSM_DT_NORM, i), {dt_dim}, 0);
4041                            layer.ssm_b_norm = create_tensor(tn(LLM_TENSOR_SSM_B_NORM, i), {d_state}, 0);
4042                            layer.ssm_c_norm = create_tensor(tn(LLM_TENSOR_SSM_C_NORM, i), {d_state}, 0);
4043                        } else {
                                // attention branch: head counts may vary per layer, so the fused
                                // QKV width is derived from this layer's head configuration
4044                            const int64_t num_attention_heads = hparams.n_head(i);
4045                            const int64_t q_num_heads         = num_attention_heads;
4046                            const int64_t num_key_value_heads = hparams.n_head_kv(i);
4047                            const int64_t k_num_heads         = num_key_value_heads;
4048                            const int64_t v_num_heads         = num_key_value_heads;
4049                            const int64_t q_proj_dim          = q_num_heads * qk_dim;
4050                            const int64_t k_proj_dim          = k_num_heads * qk_dim;
4051                            const int64_t v_proj_dim          = v_num_heads * v_dim;
4052
4053                            layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, q_proj_dim + k_proj_dim + v_proj_dim}, 0);
4054                            layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {qk_dim, num_attention_heads}, 0);
4055                            layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {qk_dim, k_num_heads}, 0);
4056                            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {q_num_heads * v_dim, n_embd}, 0);
4057                        }
4058
4059                        // All layers have post-attention norm, FFN norm, and FFN tensors
                            // ffn_up is 2*n_ff wide: fused gate+up projection, split later
4060                        layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, i), {n_embd}, 0);
4061                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
4062                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
4063                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff * 2}, 0);
4064                        layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, i), {n_embd}, 0);
4065                    }
4066                } break;
4067            case LLM_ARCH_PLAMO3:
4068                {
                        // PLaMo-3: attention-only layout with per-layer head counts and FFN
                        // widths; fused QKV, per-head Q/K norms, pre- and post-norms everywhere.
4069                    const int64_t head_dim_q = hparams.n_embd_head_k;
4070                    const int64_t head_dim_v = hparams.n_embd_head_v;
4071
4072                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
4073
4074                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
4075                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
4076                    if (output == NULL) {
4077                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
4078                    }
4079
4080                    for (int i = 0; i < n_layer; ++i) {
4081                        auto & layer = layers[i];
4082
                            // per-layer dimensions: head counts and FFN width can differ by layer
4083                        const int64_t num_attention_heads = hparams.n_head(i);
4084                        const int64_t num_key_value_heads = hparams.n_head_kv(i);
4085                        const int64_t q_proj_dim = num_attention_heads * head_dim_q;
4086                        const int64_t k_proj_dim = num_key_value_heads * head_dim_q;
4087                        const int64_t v_proj_dim = num_key_value_heads * head_dim_v;
4088                        const int64_t n_ff_cur   = hparams.n_ff(i);
4089
4090                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
4091                        layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i),
4092                                {n_embd,q_proj_dim + k_proj_dim + v_proj_dim}, 0);
4093                        layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {head_dim_q}, 0);
4094                        layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {head_dim_q}, 0);
4095                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {num_attention_heads * head_dim_v, n_embd}, 0);
4096                        layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, i), {n_embd}, 0);
4097
4098                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
4099                        layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, i), {n_embd}, 0);
4100
                            // ffn_up is 2*n_ff_cur wide: fused gate+up projection
4101                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff_cur * 2}, 0);
4102                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff_cur, n_embd}, 0);
4103                    }
4104                } break;
4105            case LLM_ARCH_GPT2:
4106                {
                        // GPT-2: learned absolute position embeddings (up to n_ctx_train),
                        // fused QKV, and biases on every norm/projection tensor.
4107                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
4108                    pos_embd = create_tensor(tn(LLM_TENSOR_POS_EMBD,   "weight"), {n_embd, n_ctx_train}, 0);
4109
4110                    // output
4111                    output_norm   = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
4112                    output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd}, 0);
4113                    output        = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
4114
4115                    // if output is NULL, init from the input tok embed
4116                    if (output == NULL) {
4117                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
4118                    }
4119
4120                    for (int i = 0; i < n_layer; ++i) {
4121                        auto & layer = layers[i];
4122
4123                        layer.attn_norm   = create_tensor(tn(LLM_TENSOR_ATTN_NORM,   "weight", i), {n_embd}, 0);
4124                        layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM,   "bias", i),   {n_embd}, 0);
4125
                            // fused QKV projection (required, unlike the Phi-2 arm)
4126                        layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
4127                        layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i),   {n_embd + 2*n_embd_gqa}, 0);
4128
4129                        layer.wo   = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
4130                        layer.bo   = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i),   {n_embd}, 0);
4131
4132                        layer.ffn_norm   = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
4133                        layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i),   {n_embd}, 0);
4134
4135                        layer.ffn_down   = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
4136                        layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i),   {n_embd}, 0);
4137
4138                        layer.ffn_up     = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff}, 0);
4139                        layer.ffn_up_b   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "bias", i),   {n_ff}, 0);
4140                    }
4141                } break;
4142            case LLM_ARCH_CODESHELL:
4143                {
                        // CodeShell: GPT-2-like layout, but the tying direction is reversed —
                        // the LM head is required and tok_embd may be duplicated FROM it.
4144                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
4145
4146                    // if tok embd is NULL, init from output
4147                    if (tok_embd == NULL) {
4148                        tok_embd = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
4149                    }
4150
4151                    // output
4152                    output_norm   = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
4153                    output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd}, 0);
4154                    output        = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);
4155
4156                    for (int i = 0; i < n_layer; ++i) {
4157                        auto & layer = layers[i];
4158
4159                        layer.attn_norm   = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
4160                        layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i),   {n_embd}, 0);
4161
                            // fused QKV projection with bias (required)
4162                        layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
4163                        layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i),   {n_embd + 2*n_embd_gqa}, 0);
4164
4165                        layer.wo   = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
4166                        layer.bo   = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i),   {n_embd}, 0);
4167
4168                        layer.ffn_norm   = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
4169                        layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i),   {n_embd}, 0);
4170
4171                        layer.ffn_down   = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
4172                        layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i),   {n_embd}, 0);
4173
4174                        layer.ffn_up     = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i),   {n_embd, n_ff}, 0);
4175                        layer.ffn_up_b   = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i),     {n_ff}, 0);
4176                    }
4177                } break;
4178            case LLM_ARCH_ORION:
4179                {
                        // Orion: LLaMA-like split Q/K/V and gated FFN, but its norms carry
                        // bias tensors and the LM head is required (untied).
4180                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
4181
4182                    output_norm   = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
4183                    output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd}, 0);
4184                    output        = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);
4185
4186                    for (int i = 0; i < n_layer; ++i) {
4187                        auto & layer = layers[i];
4188
4189                        layer.attn_norm   = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
4190                        layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i),   {n_embd}, 0);
4191
4192                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd}, 0);
4193                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa}, 0);
4194                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa}, 0);
4195                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
4196
4197                        layer.ffn_norm   = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
4198                        layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i),   {n_embd}, 0);
4199
4200                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
4201                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
4202                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
4203                    }
4204                } break;
4205            case LLM_ARCH_INTERNLM2:
4206                {
                        // InternLM2: LLaMA-like layout, no biases, required (untied) LM head.
4207                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
4208
4209                    // output
4210                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
4211                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);
4212
4213                    for (int i = 0; i < n_layer; ++i) {
4214                        auto & layer = layers[i];
4215
4216                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
                            // fused QKV is not used for this arch; the split Q/K/V below is loaded instead
4217                        // layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
4218                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd}, 0);
4219                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa}, 0);
4220                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa}, 0);
4221
4222                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
4223                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
4224                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
4225                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
4226                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
4227                    }
4228                } break;
4229            case LLM_ARCH_GEMMA:
4230                {
                        // Gemma: the LM head is always tied to the token embedding — there is
                        // no separate LLM_TENSOR_OUTPUT lookup for this arch.
4231                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
4232
4233                    // output
4234                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
4235                    output      = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD,  "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); // same as tok_embd, duplicated to allow offloading
4236
4237                    for (int i = 0; i < n_layer; ++i) {
4238                        auto & layer = layers[i];
4239
4240                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
4241
                            // Q width is n_embd_head_k * n_head (head dim may differ from n_embd / n_head)
4242                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
4243                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_k_gqa}, 0);
4244                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_v_gqa}, 0);
4245                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
4246
4247                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
4248                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
4249                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
4250                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
4251                    }
4252                } break;
            case LLM_ARCH_GEMMA2:
                {
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);

                    // output
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
                    // like Gemma, the output head is always tied to the token embedding
                    output      = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD,  "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); // same as tok_embd, duplicated to allow offloading

                    for (int i = 0; i < n_layer; ++i) {
                        auto & layer = layers[i];

                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);

                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_k_gqa}, 0);
                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_v_gqa}, 0);
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
                        // Gemma 2 adds a post-attention norm on top of the Gemma layout
                        layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);

                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
                        // ... and a post-FFN norm
                        layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
                    }
                } break;
            case LLM_ARCH_GEMMA3:
            case LLM_ARCH_GEMMA_EMBEDDING:
                {
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);

                    // output
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);

                    // if output is NULL, init from the input tok embed
                    if (output == NULL) {
                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD,   "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
                    }

                    // Dense linear weights
                    // optional; presumably only present in the GEMMA_EMBEDDING variant — TODO confirm
                    dense_2_out_layers = create_tensor(tn(LLM_TENSOR_DENSE_2_OUT, "weight"), {n_embd, hparams.dense_2_feat_out}, TENSOR_NOT_REQUIRED);
                    dense_3_out_layers = create_tensor(tn(LLM_TENSOR_DENSE_3_OUT, "weight"), {hparams.dense_3_feat_in, n_embd}, TENSOR_NOT_REQUIRED);


                    for (int i = 0; i < n_layer; ++i) {
                        auto & layer = layers[i];

                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);

                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_k_gqa}, 0);
                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_v_gqa}, 0);
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);

                        // per-head Q/K norms (size n_embd_head_k, not n_embd) plus a post-attention norm
                        layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
                        layer.attn_k_norm    = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM,    "weight", i), {n_embd_head_k}, 0);
                        layer.attn_q_norm    = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM,    "weight", i), {n_embd_head_k}, 0);

                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
                        layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
                    }
                } break;
            case LLM_ARCH_GEMMA3N:
                {
                    const int64_t n_altup      = hparams.n_altup;      // number of altup streams
                    const int64_t laurel_rank  = hparams.laurel_rank;  // rank of the laurel low-rank projection
                    const int64_t n_embd_altup = hparams.n_embd_altup; // per-layer embedding width

                    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
                    // if output is NULL, init from the input tok embed
                    if (output == NULL) {
                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
                    }

                    // Gemma 3n carries an extra per-layer token embedding table (n_embd_altup slices per layer)
                    tok_embd           = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD,           "weight"), {n_embd, n_vocab}, 0);
                    tok_embd_per_layer = create_tensor(tn(LLM_TENSOR_PER_LAYER_TOKEN_EMBD, "weight"), {n_embd_altup * n_layer, n_vocab}, 0);

                    // projections for the n_altup - 1 non-primary altup streams
                    altup_proj           = create_tensor(tn(LLM_TENSOR_ALTUP_PROJ,           "weight"), {n_embd, n_embd, n_altup - 1}, 0);
                    altup_unembd_proj    = create_tensor(tn(LLM_TENSOR_ALTUP_UNEMBD_PROJ,    "weight"), {n_embd, n_embd, n_altup - 1}, 0);
                    per_layer_model_proj = create_tensor(tn(LLM_TENSOR_PER_LAYER_MODEL_PROJ, "weight"), {n_embd, n_embd_altup * n_layer}, 0);
                    per_layer_proj_norm  = create_tensor(tn(LLM_TENSOR_PER_LAYER_PROJ_NORM,  "weight"), {n_embd_altup}, 0);

                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);

                    for (int i = 0; i < n_layer; ++i) {
                        auto & layer = layers[i];

                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);

                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_k_gqa}, 0);
                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_v_gqa}, 0);
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);

                        // per-head Q/K norms and a post-attention norm, as in Gemma 3
                        layer.attn_q_norm    = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM,    "weight", i), {n_embd_head_k}, 0);
                        layer.attn_k_norm    = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM,    "weight", i), {n_embd_head_k}, 0);
                        layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);

                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
                        layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);

                        // altup & laurel
                        layer.per_layer_inp_gate   = create_tensor(tn(LLM_TENSOR_PER_LAYER_INP_GATE,  "weight", i), {n_embd, n_embd_altup}, 0);
                        layer.per_layer_proj       = create_tensor(tn(LLM_TENSOR_PER_LAYER_PROJ,      "weight", i), {n_embd_altup, n_embd}, 0);
                        layer.per_layer_post_norm  = create_tensor(tn(LLM_TENSOR_PER_LAYER_POST_NORM, "weight", i), {n_embd}, 0);
                        layer.altup_correct_coef   = create_tensor(tn(LLM_TENSOR_ALTUP_CORRECT_COEF,  "weight", i), {n_altup, n_altup}, 0);
                        layer.altup_correct_scale  = create_tensor(tn(LLM_TENSOR_ALTUP_CORRECT_SCALE, "weight", i), {n_embd}, 0);
                        layer.altup_predict_coef   = create_tensor(tn(LLM_TENSOR_ALTUP_PREDICT_COEF,  "weight", i), {n_altup, n_altup * n_altup}, 0);
                        layer.altup_router         = create_tensor(tn(LLM_TENSOR_ALTUP_ROUTER,        "weight", i), {n_embd, n_altup}, 0);
                        layer.altup_router_norm    = create_tensor(tn(LLM_TENSOR_ALTUP_ROUTER_NORM,   "weight", i), {n_embd}, 0);
                        // laurel: low-rank factorization n_embd -> laurel_rank -> n_embd, then a norm
                        layer.laurel_l             = create_tensor(tn(LLM_TENSOR_LAUREL_L,            "weight", i), {n_embd, laurel_rank}, 0);
                        layer.laurel_r             = create_tensor(tn(LLM_TENSOR_LAUREL_R,            "weight", i), {laurel_rank, n_embd}, 0);
                        layer.laurel_post_norm     = create_tensor(tn(LLM_TENSOR_LAUREL_POST_NORM,    "weight", i), {n_embd}, 0);
                    }
                } break;
            case LLM_ARCH_STARCODER2:
                {
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);

                    // output (LayerNorm with bias, not RMSNorm-style weight-only)
                    output_norm   = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
                    output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd}, 0);

                    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
                    // if output is NULL, init from the input tok embed
                    if (output == NULL) {
                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
                    }

                    for (int i = 0; i < n_layer; ++i) {
                        auto & layer = layers[i];

                        layer.attn_norm   = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
                        layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i),   {n_embd}, 0);

                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd}, 0);
                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa}, 0);
                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa}, 0);
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);

                        // attention bias tensors (required here: flag 0, not TENSOR_NOT_REQUIRED)
                        layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "bias", i), {n_embd}, 0);
                        layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "bias", i), {n_embd_gqa}, 0);
                        layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "bias", i), {n_embd_gqa}, 0);
                        layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);

                        layer.ffn_norm   = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
                        layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i),   {n_embd}, 0);

                        // non-gated FFN: only down/up, no ffn_gate
                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);

                        // FFN bias tensors (also required: flag 0)
                        layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
                        layer.ffn_up_b   = create_tensor(tn(LLM_TENSOR_FFN_UP ,  "bias", i), {  n_ff}, 0);
                    }
                } break;
            case LLM_ARCH_MAMBA:
                {
                    const int64_t d_conv  = hparams.ssm_d_conv;   // conv1d kernel width
                    const int64_t d_inner = hparams.ssm_d_inner;  // expanded inner dimension
                    const int64_t d_state = hparams.ssm_d_state;  // SSM state size
                    const int64_t dt_rank = hparams.ssm_dt_rank;  // rank of the dt projection

                    // only an expansion factor of 2 is supported for now
                    // NOTE(review): sibling SSM arches (MAMBA2/JAMBA) use GGML_ASSERT for the same check;
                    // throwing here keeps the load error catchable by the caller
                    if (2 * n_embd != d_inner) {
                        throw std::runtime_error("only an expansion factor of 2 is supported for now");
                    }

                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);

                    // output
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);

                    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
                    // if output is NULL, init from the input tok embed, duplicated to allow offloading
                    if (output == NULL) {
                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
                    }

                    for (int i = 0; i < n_layer; ++i) {
                        auto & layer = layers[i];

                        // norm
                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);

                        // in_proj: produces both the x and z branches, hence 2*d_inner
                        layer.ssm_in = create_tensor(tn(LLM_TENSOR_SSM_IN, "weight", i), {n_embd, 2*d_inner}, 0);

                        layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), {d_conv, d_inner}, 0);
                        layer.ssm_conv1d_b = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "bias", i), {d_inner}, 0);

                        // x_proj: emits dt (dt_rank) plus B and C (d_state each)
                        layer.ssm_x = create_tensor(tn(LLM_TENSOR_SSM_X, "weight", i), {d_inner, dt_rank + 2*d_state}, 0);

                        layer.ssm_dt = create_tensor(tn(LLM_TENSOR_SSM_DT, "weight", i), {dt_rank, d_inner}, 0);
                        layer.ssm_dt_b = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), {d_inner}, 0);

                        // no "weight" suffix for these
                        layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), {d_state, d_inner}, 0);
                        layer.ssm_d = create_tensor(tn(LLM_TENSOR_SSM_D, i), {d_inner}, 0);

                        // out_proj
                        layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), {d_inner, n_embd}, 0);
                    }
                } break;
            case LLM_ARCH_MAMBA2:
                {
                    const int64_t d_conv  = hparams.ssm_d_conv;
                    const int64_t d_inner = hparams.ssm_d_inner;
                    const int64_t d_state = hparams.ssm_d_state;
                    const int64_t n_head  = hparams.ssm_dt_rank;  // for Mamba-2, dt_rank is reused as the SSM head count
                    const int64_t n_group = hparams.ssm_n_group;
                    // in_proj output: x and z (2*d_inner) + B and C (2*n_group*d_state) + dt (n_head)
                    const int64_t d_in_proj = 2*d_inner + 2*n_group*d_state + n_head;

                    // only an expansion factor of 2 is supported for now
                    GGML_ASSERT(2 * n_embd == d_inner);

                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);

                    // output
                    {
                        output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);

                        output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
                        // if output is NULL, init from the input tok embed, duplicated to allow offloading
                        if (output == NULL) {
                            output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
                        }
                    }

                    for (int i = 0; i < n_layer; ++i) {
                        auto & layer = layers[i];

                        // norm
                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);

                        layer.ssm_in = create_tensor(tn(LLM_TENSOR_SSM_IN, "weight", i), {n_embd, d_in_proj}, 0);

                        // conv covers x as well as B and C
                        layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), {d_conv, d_inner + 2*n_group*d_state}, 0);
                        layer.ssm_conv1d_b = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "bias", i), {d_inner + 2*n_group*d_state}, 0);

                        layer.ssm_dt_b = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), {n_head}, 0);

                        // no "weight" suffix for these
                        // A and D are one scalar per SSM head in Mamba-2 (shape {1, n_head})
                        layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), {1, n_head}, 0);
                        layer.ssm_d = create_tensor(tn(LLM_TENSOR_SSM_D, i), {1, n_head}, 0);

                        // grouped norm over d_inner, split into n_group groups
                        layer.ssm_norm = create_tensor(tn(LLM_TENSOR_SSM_NORM, "weight", i), {d_inner / n_group, n_group}, 0);

                        // out_proj
                        layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), {d_inner, n_embd}, 0);
                    }
                } break;
            case LLM_ARCH_JAMBA:
                {
                    const int64_t d_conv  = hparams.ssm_d_conv;
                    const int64_t d_inner = hparams.ssm_d_inner;
                    const int64_t d_state = hparams.ssm_d_state;
                    const int64_t dt_rank = hparams.ssm_dt_rank;

                    // only an expansion factor of 2 is supported for now
                    GGML_ASSERT(2 * n_embd == d_inner);

                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);

                    // output
                    {
                        output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);

                        output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
                        // if output is NULL, init from the input tok embed, duplicated to allow offloading
                        if (output == NULL) {
                            output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
                        }
                    }

                    for (int i = 0; i < n_layer; ++i) {
                        // Jamba interleaves Mamba and attention layers; per-layer hparams decide which this is
                        const int64_t n_head_kv = hparams.n_head_kv(i);
                        const int64_t n_embd_gqa = hparams.n_embd_v_gqa(i);

                        auto & layer = layers[i];

                        // norm
                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);

                        if (n_head_kv == 0) {
                            // Mamba layer
                            layer.ssm_in = create_tensor(tn(LLM_TENSOR_SSM_IN, "weight", i), {n_embd, 2*d_inner}, 0);

                            layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), {d_conv, d_inner}, 0);
                            layer.ssm_conv1d_b = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "bias", i), {d_inner}, 0);

                            layer.ssm_x = create_tensor(tn(LLM_TENSOR_SSM_X, "weight", i), {d_inner, dt_rank + 2*d_state}, 0);

                            // Jamba adds norms on dt, B and C that plain Mamba does not have
                            layer.ssm_dt_norm = create_tensor(tn(LLM_TENSOR_SSM_DT_NORM, "weight", i), {dt_rank}, 0);

                            layer.ssm_dt = create_tensor(tn(LLM_TENSOR_SSM_DT, "weight", i), {dt_rank, d_inner}, 0);
                            layer.ssm_dt_b = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), {d_inner}, 0);

                            layer.ssm_b_norm = create_tensor(tn(LLM_TENSOR_SSM_B_NORM, "weight", i), {d_state}, 0);
                            layer.ssm_c_norm = create_tensor(tn(LLM_TENSOR_SSM_C_NORM, "weight", i), {d_state}, 0);

                            // no "weight" suffix for these
                            layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), {d_state, d_inner}, 0);
                            layer.ssm_d = create_tensor(tn(LLM_TENSOR_SSM_D, i), {d_inner}, 0);

                            // out_proj
                            layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), {d_inner, n_embd}, 0);
                        } else {
                            // Attention layers

                            layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd}, 0);
                            layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa}, 0);
                            layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa}, 0);
                            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
                        }

                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);

                        // presence of the router gate decides MoE vs dense FFN for this layer
                        layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, TENSOR_NOT_REQUIRED);

                        if (layer.ffn_gate_inp) {
                            // MoE
                            layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
                            layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff, n_embd, n_expert}, 0);
                            layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {n_embd, n_ff, n_expert}, 0);
                        } else {
                            // FFN (no MoE)
                            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
                            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
                            layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff}, 0);
                        }
                    }
                } break;
            case LLM_ARCH_GRANITE_HYBRID:
                {
                    // mamba2 Mixer SSM params
                    // NOTE: int64_t for tensor dimensions
                    const int64_t d_conv     = hparams.ssm_d_conv;
                    const int64_t d_inner    = hparams.ssm_d_inner;
                    const int64_t d_state    = hparams.ssm_d_state;
                    const int64_t n_ssm_head = hparams.ssm_dt_rank;
                    const int64_t n_group    = hparams.ssm_n_group;
                    // in_proj output: x and z (2*d_inner) + B and C (2*n_group*d_state) + dt (n_ssm_head)
                    const int64_t d_in_proj  = 2*d_inner + 2*n_group*d_state + n_ssm_head;

                    // only an expansion factor of 2 is supported for now
                    GGML_ASSERT(2 * n_embd == d_inner);

                    // embeddings
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);

                    // output
                    {
                        output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
                        output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
                        // if output is NULL, init from the input tok embed, duplicated to allow offloading
                        if (output == NULL) {
                            output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
                        }
                    }

                    for (int i = 0; i < n_layer; ++i) {
                        auto & layer = layers[i];

                        // norm
                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);

                        // hybrid: each layer is either a Mamba-2-style SSM layer or an attention layer
                        if (hparams.is_recurrent(i)) {
                            // ssm layers
                            layer.ssm_in = create_tensor(tn(LLM_TENSOR_SSM_IN, "weight", i), {n_embd, d_in_proj}, 0);

                            layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), {d_conv, d_inner + 2*n_group*d_state}, 0);
                            // conv bias is optional here, unlike in LLM_ARCH_MAMBA2
                            layer.ssm_conv1d_b = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "bias", i), {d_inner + 2*n_group*d_state}, TENSOR_NOT_REQUIRED);

                            layer.ssm_dt_b = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), {n_ssm_head}, 0);

                            // no "weight" suffix for these
                            layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), {1, n_ssm_head}, 0);
                            layer.ssm_d = create_tensor(tn(LLM_TENSOR_SSM_D, i), {1, n_ssm_head}, 0);

                            layer.ssm_norm = create_tensor(tn(LLM_TENSOR_SSM_NORM, "weight", i), {d_inner / n_group, n_group}, 0);

                            // out_proj
                            layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), {d_inner, n_embd}, 0);
                        } else {
                            // attention layers (with optional bias)
                            // head counts / GQA dims are resolved per layer
                            const int64_t n_head_i = hparams.n_head(i);
                            const int64_t n_embd_k_gqa_i = hparams.n_embd_k_gqa(i);
                            const int64_t n_embd_v_gqa_i = hparams.n_embd_v_gqa(i);
                            layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd_head_k * n_head_i}, 0);
                            layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_k_gqa_i}, 0);
                            layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_v_gqa_i}, 0);
                            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head_i, n_embd}, 0);
                            layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "bias", i), {n_embd},         TENSOR_NOT_REQUIRED);
                            layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "bias", i), {n_embd_k_gqa_i}, TENSOR_NOT_REQUIRED);
                            layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "bias", i), {n_embd_v_gqa_i}, TENSOR_NOT_REQUIRED);
                            layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd},         TENSOR_NOT_REQUIRED);
                        }

                        // feed forward (w/ optional biases)
                        if (n_expert > 0) {
                            // MoE FFN
                            layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
                            // rope_freqs exists only once; layers after the first mark it as a duplicate
                            layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
                            layer.ffn_gate_inp  = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP,  "weight", i), {n_embd, n_expert}, 0);
                            // expert gate is optional; presumably some Granite MoE variants omit it — TODO confirm
                            layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd,   n_ff, n_expert}, TENSOR_NOT_REQUIRED);
                            layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {  n_ff, n_embd, n_expert}, 0);
                            layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {n_embd,   n_ff, n_expert}, 0);

                            // For Granite MoE Shared
                            if (hparams.n_ff_shexp > 0) {
                                layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, hparams.n_ff_shexp}, 0);
                                layer.ffn_up_shexp   = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP,   "weight", i), {n_embd, hparams.n_ff_shexp}, 0);
                                layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {hparams.n_ff_shexp, n_embd}, 0);
                            }
                        } else {
                            // dense gated FFN with optional biases
                            layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
                            layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
                            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
                            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
                            layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
                            layer.ffn_gate_b = create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
                            layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
                            layer.ffn_up_b   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
                        }
                    }
                } break;
            case LLM_ARCH_XVERSE:
                {
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);

                    // output head and final norm are both required (no tok_embd fallback here)
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);

                    for (int i = 0; i < n_layer; ++i) {
                        auto & layer = layers[i];

                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);

                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd}, 0);
                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa}, 0);
                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa}, 0);
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);

                        // gated FFN (gate/down/up), all required
                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
                    }
                } break;
4709            case LLM_ARCH_COMMAND_R:
4710                {
                    // Command R: dense decoder. Only a pre-attention norm is loaded per layer
                    // (no separate ffn_norm tensor), and the output head is tied to the token
                    // embedding via TENSOR_DUPLICATED.
4711                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
4712
4713                    // output
4714                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
4715                    // init output from the input tok embed
4716                    output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
4717
4718                    for (int i = 0; i < n_layer; ++i) {
4719                        auto & layer = layers[i];
4720
4721                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
4722
                        // larger variants (detected here by n_layer >= 64) additionally carry
                        // per-head Q/K norms, shaped {head_dim, n_head(_kv)}
4723                        if (n_layer >= 64){
4724                            layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k, n_head}, 0);
4725                            layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k, n_head_kv}, 0);
4726                        }
4727
4728                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd}, 0);
4729                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa}, 0);
4730                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa}, 0);
4731                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
4732
4733                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
4734                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
4735                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
4736                    }
4737                } break;
4738            case LLM_ARCH_COHERE2:
4739                {
4740                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);
4741
4742                    // output
4743                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
4744                    // init output from the input tok embed
4745                    output      = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab },
4746                                                      TENSOR_DUPLICATED);
4747
4748                    for (int i = 0; i < n_layer; ++i) {
4749                        auto & layer = layers[i];
4750
4751                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0);
4752
4753                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), { n_embd, n_embd }, 0);
4754                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), { n_embd, n_embd_gqa }, 0);
4755                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), { n_embd, n_embd_gqa }, 0);
4756                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd, n_embd }, 0);
4757
4758                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), { n_embd, n_ff }, 0);
4759                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd }, 0);
4760                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), { n_embd, n_ff }, 0);
4761                    }
4762                }
4763                break;
4764            case LLM_ARCH_OLMO:  // adapted from LLM_ARCH_LLAMA with norm params removed
4765                {
                    // OLMo: no learned norm weights are loaded at all (neither attn_norm nor
                    // ffn_norm appear below); output head is optional and falls back to the
                    // tied token embedding when absent.
4766                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
4767
4768                    // output
4769                    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
4770                    // if output is NULL, init from the input tok embed
4771                    if (output == NULL) {
4772                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
4773                    }
4774
4775                    for (int i = 0; i < n_layer; ++i) {
4776                        auto & layer = layers[i];
4777
4778                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd}, 0);
4779                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa}, 0);
4780                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa}, 0);
4781                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
4782
4783                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
4784                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
4785                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
4786                    }
4787                } break;
4788            case LLM_ARCH_OLMO2:
4789                {
                    // OLMo 2: uses post-norms (attn_post_norm / ffn_post_norm) instead of the
                    // usual pre-norms — note no attn_norm/ffn_norm tensors are loaded below.
4790                    const int64_t n_embd_head = n_embd / n_head;
4791
4792                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
4793
4794                    // output
4795                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
4796                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);
4797
4798                    for (int i = 0; i < n_layer; ++i) {
4799                        auto & layer = layers[i];
4800
4801                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd}, 0);
4802                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa}, 0);
4803                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa}, 0);
4804                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
                        // Q norm spans the full n_embd; K norm spans the (smaller) total KV width
4805                        layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, 0);
4806                        layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_head_kv * n_embd_head}, 0);
4807                        layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
4808
4809                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
4810                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
4811                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
4812                        layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
4813                    }
4814                } break;
4815            case LLM_ARCH_SEED_OSS:
4816                {
                    // Seed-OSS: Q/O projections are sized by an explicit head_dim from hparams
                    // (n_head * head_dim may differ from n_embd), with optional Q/K/V biases
                    // and both pre- and post-attention norms.
4817                    const uint32_t head_dim             = hparams.n_embd_head_k;
4818                    const int64_t n_qo_dim              = n_head * head_dim;
4819                    const int64_t n_kv_dim              = n_head_kv * head_dim;
4820
4821                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
4822
4823                    // output
4824                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
4825                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
4826                    // if output is NULL, init from the input tok embed
4827                    if (output == NULL) {
4828                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
4829                    }
4830
4831                    for (int i = 0; i < n_layer; ++i) {
4832                        auto & layer = layers[i];
4833
4834                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_qo_dim}, 0);
4835                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_kv_dim}, 0);
4836                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_kv_dim}, 0);
4837                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_qo_dim, n_embd}, 0);
4838
                        // attention biases are optional (TENSOR_NOT_REQUIRED); no output bias is loaded
4839                        layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "bias", i), {n_qo_dim},   TENSOR_NOT_REQUIRED);
4840                        layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "bias", i), {n_kv_dim},   TENSOR_NOT_REQUIRED);
4841                        layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "bias", i), {n_kv_dim},   TENSOR_NOT_REQUIRED);
4842
4843                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
4844                        layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
4845
4846                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
4847                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
4848                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
4849                    }
4850                } break;
4851
4852            case LLM_ARCH_OLMOE:
4853                {
                    // OLMoE: every layer is a MoE layer (no dense-FFN path); Q/K norms span
                    // the full n_embd, and the output head is a required, untied tensor.
4854                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
4855
4856                    // output
4857                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
4858                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);
4859
4860                    for (int i = 0; i < n_layer; ++i) {
4861                        auto & layer = layers[i];
4862
4863                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
4864
4865                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd}, 0);
4866                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa}, 0);
4867                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa}, 0);
4868                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
4869                        layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, 0);
4870                        layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd}, 0);
4871
4872                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
4873
                        // expert router: scores each of the n_expert experts per token
4874                        layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
4875
                        // NOTE(review): this validation is loop-invariant and could be hoisted
                        // above the layer loop; kept here to match the surrounding arms.
4876                        if (n_expert == 0) {
4877                            throw std::runtime_error("n_expert must be > 0");
4878                        }
4879                        if (n_expert_used == 0) {
4880                            throw std::runtime_error("n_expert_used must be > 0");
4881                        }
4882
4883                        // MoE branch
4884                        layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff,   n_expert}, 0);
4885                        layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff,   n_embd, n_expert}, 0);
4886                        layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {n_embd, n_ff,   n_expert}, 0);
4887                    }
4888                } break;
4889            case LLM_ARCH_OPENELM:
4890                {
                    // OpenELM: head and FFN widths vary per layer, so the per-layer values from
                    // hparams deliberately shadow the function-level n_head/n_ff below.
4891                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
4892
4893                    // output
4894                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
4895                    // init output from the input tok embed
4896                    output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
4897
4898                    for (int i = 0; i < n_layer; ++i) {
                        // per-layer counts (shadow the outer locals on purpose)
4899                        const int64_t n_head      =   hparams.n_head(i);
4900                        const int64_t n_head_qkv  = 2*hparams.n_head_kv(i) + n_head;
4901                        const int64_t n_ff        =   hparams.n_ff(i);
4902
4903                        auto & layer = layers[i];
4904
4905                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
4906
                        // fused QKV projection: n_head Q heads + n_head_kv K heads + n_head_kv V heads
4907                        layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_head_qkv*n_embd_head_k}, 0);
                        // Q/K norms are per-head-dim (a single {n_embd_head_k} vector each)
4908                        layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
4909                        layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
4910                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_head*n_embd_head_k, n_embd}, 0);
4911
4912                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
4913                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
4914                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
4915                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff}, 0);
4916                    }
4917                } break;
4918            case LLM_ARCH_GPTNEOX:
4919                {
                    // GPT-NeoX: LayerNorm-style norms with biases throughout, a fused QKV
                    // projection with bias, and a non-gated (up/down only) FFN with biases.
4920                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
4921
4922                    // output
4923                    output_norm   = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
4924                    output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd}, 0);
4925                    output        = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);
4926
4927                    for (int i = 0; i < n_layer; ++i) {
4928                        auto & layer = layers[i];
4929
4930                        layer.attn_norm   = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
4931                        layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i),   {n_embd}, 0);
4932
                        // fused QKV: n_embd for Q plus 2*n_embd_gqa for K and V
4933                        layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
4934                        layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i),   {n_embd + 2*n_embd_gqa}, 0);
4935
4936                        layer.wo   = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
4937                        layer.bo   = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i),   {n_embd}, 0);
4938
4939                        layer.ffn_norm   = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
4940                        layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i),   {n_embd}, 0);
4941
4942                        layer.ffn_down   = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
4943                        layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i),   {n_embd}, 0);
4944
4945                        layer.ffn_up     = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff}, 0);
4946                        layer.ffn_up_b   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "bias", i),   {n_ff}, 0);
4947                    }
4948                } break;
4949            case LLM_ARCH_ARCTIC:
4950                {
                    // Arctic: each layer has BOTH a dense FFN (with square n_embd x n_embd
                    // projections) and a separately-normed MoE branch; the output head falls
                    // back to the tied token embedding when absent.
4951                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
4952
4953                    // output
4954                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
4955                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
4956
4957                    // if output is NULL, init from the input tok embed
4958                    if (output == NULL) {
4959                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
4960                    }
4961
4962                    for (int i = 0; i < n_layer; ++i) {
4963                        auto & layer = layers[i];
4964
4965                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
4966
4967                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd}, 0);
4968                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa}, 0);
4969                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa}, 0);
4970                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
4971
4972                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
4973
                        // note: the dense FFN projections here really are square (n_embd x n_embd)
4974                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_embd}, 0);
4975                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_embd, n_embd}, 0);
4976                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_embd}, 0);
4977
4978                        layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
4979                        layer.ffn_norm_exps = create_tensor(tn(LLM_TENSOR_FFN_NORM_EXPS, "weight", i), {n_embd}, 0);
                        // fix: the last argument is a flags bitmask — was the boolean literal
                        // `false` (== 0); use 0 like every other required tensor. No behavior change.
4980                        layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd,   n_ff, n_expert}, 0);
4981                        layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {  n_ff, n_embd, n_expert}, 0);
4982                        layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {n_embd,   n_ff, n_expert}, 0);
4983                    }
4984                } break;
4985            case LLM_ARCH_DEEPSEEK:
4986                {
                    // DeepSeek (V1-style MoE): the first n_layer_dense_lead layers use a plain
                    // dense FFN; the rest use routed experts (width n_ff_exp) plus a shared-expert
                    // branch whose width is n_ff_exp * n_expert_shared.
4987
4988                    const int64_t n_ff_exp        = hparams.n_ff_exp;
4989                    const int64_t n_expert_shared = hparams.n_expert_shared;
4990
4991                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
4992
4993                    // output
4994                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
4995                    // try to load output.weight, if not found, use token_embd (tied embeddings)
4996                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
4997                    if (!output) {
4998                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
4999                    }
5000
5001                    for (int i = 0; i < n_layer; ++i) {
5002                        auto & layer = layers[i];
5003
5004                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
5005
5006                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd}, 0);
5007                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa}, 0);
5008                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa}, 0);
5009                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
5010                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
5011
                        // leading dense layers use the regular FFN width n_ff
5012                        if (i < (int) hparams.n_layer_dense_lead) {
5013                            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
5014                            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
5015                            layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
5016                        } else {
5017                            layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
5018
5019                            if (n_expert == 0) {
5020                                throw std::runtime_error("n_expert must be > 0");
5021                            }
5022                            if (n_expert_used == 0) {
5023                                throw std::runtime_error("n_expert_used must be > 0");
5024                            }
5025
5026                            // MoE branch
5027                            layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {  n_embd, n_ff_exp, n_expert}, 0);
5028                            layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp,   n_embd, n_expert}, 0);
5029                            layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {  n_embd, n_ff_exp, n_expert}, 0);
5030
5031                            // Shared expert branch
5032                            layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
5033                            layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {        n_ff_exp * n_expert_shared, n_embd}, 0);
5034                            layer.ffn_up_shexp   = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP,   "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
5035                        }
5036                    }
5037                } break;
5038            case LLM_ARCH_DEEPSEEK2:
5039                {
                    // DEEPSEEK2: attention uses low-rank compressed Q/KV projections
                    // (q_lora_rank / kv_lora_rank); head dims split into a RoPE part (n_rot)
                    // and a non-RoPE ("nope") part. FFN layout mirrors LLM_ARCH_DEEPSEEK:
                    // dense lead layers, then MoE with shared experts.
5040                    const bool is_mla = hparams.is_mla();
5041
5042                    // note: these are the actual head sizes you get when treating as MHA or after "decompression" using wv_b for MLA
5043                    const int64_t n_embd_head_k_mla = hparams.n_embd_head_k_mla();
5044                    const int64_t n_embd_head_v_mla = hparams.n_embd_head_v_mla();
5045
5046                    const int64_t n_embd_head_qk_rope = hparams.n_rot;
5047                    const int64_t n_embd_head_qk_nope = n_embd_head_k_mla - n_embd_head_qk_rope;
5048
5049                    const int64_t q_lora_rank  = hparams.n_lora_q;
5050                    const int64_t kv_lora_rank = hparams.n_lora_kv;
5051
5052                    const int64_t n_ff_exp        = hparams.n_ff_exp;
5053                    const int64_t n_expert_shared = hparams.n_expert_shared;
5054
5055                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
5056
5057                    // output
5058                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
5059                    // try to load output.weight, if not found, use token_embd (tied embeddings)
5060                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
5061                    if (!output) {
5062                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
5063                    }
5064
5065                    for (int i = 0; i < n_layer; ++i) {
5066                        auto & layer = layers[i];
5067
5068                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
                        // the Q-compression norm only exists when Q is low-rank factored
5069                        if (q_lora_rank > 0) {
5070                            layer.attn_q_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_A_NORM, "weight", i), {q_lora_rank}, 0);
5071                        }
5072
5073                        layer.attn_kv_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_NORM, "weight", i), {kv_lora_rank}, 0);
5074
                        // Q projection: either factored (wq_a: n_embd -> q_lora_rank, then wq_b)
                        // or a single full-rank wq
5075                        if (q_lora_rank > 0) {
5076                            layer.wq_a = create_tensor(tn(LLM_TENSOR_ATTN_Q_A, "weight", i), {n_embd, q_lora_rank}, 0);
5077                            layer.wq_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_B, "weight", i), {q_lora_rank, n_head * n_embd_head_k_mla}, 0);
5078                        } else {
5079                            layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_head * n_embd_head_k_mla}, 0);
5080                        }
5081
                        // KV down-projection also carries the shared RoPE part (+ n_embd_head_qk_rope)
5082                        layer.wkv_a_mqa = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_MQA, "weight", i), {n_embd, kv_lora_rank + n_embd_head_qk_rope}, 0);
5083
5084                        // note: only old legacy GGUF files will have the unsplit wkv_b tensor in
5085                        if (is_mla) {
5086                            layer.wk_b = create_tensor(tn(LLM_TENSOR_ATTN_K_B, "weight", i), {n_embd_head_qk_nope, kv_lora_rank, n_head}, 0);
5087                            layer.wv_b = create_tensor(tn(LLM_TENSOR_ATTN_V_B, "weight", i), {kv_lora_rank, n_embd_head_v_mla, n_head}, 0);
5088                        } else {
5089                            layer.wkv_b = create_tensor(tn(LLM_TENSOR_ATTN_KV_B, "weight", i), {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v_mla)}, 0);
5090                        }
5091
5092                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_head * n_embd_head_v_mla, n_embd}, 0);
5093
5094                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
5095
5096                        if (i < (int) hparams.n_layer_dense_lead) {
5097                            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
5098                            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
5099                            layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
5100                        } else {
5101                            layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
                            // optional per-expert routing bias (absent in older files)
5102                            layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, TENSOR_NOT_REQUIRED);
5103
5104                            if (n_expert == 0) {
5105                                throw std::runtime_error("n_expert must be > 0");
5106                            }
5107                            if (n_expert_used == 0) {
5108                                throw std::runtime_error("n_expert_used must be > 0");
5109                            }
5110
5111                            // MoE branch
5112                            layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {  n_embd, n_ff_exp, n_expert}, 0);
5113                            layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp,   n_embd, n_expert}, 0);
5114                            layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {  n_embd, n_ff_exp, n_expert}, 0);
5115
5116                            // Shared expert branch
5117                            layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
5118                            layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {        n_ff_exp * n_expert_shared, n_embd}, 0);
5119                            layer.ffn_up_shexp   = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP,   "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
5120                        }
5121                    }
5122                } break;
5123            case LLM_ARCH_PLM:
5124                {
                    // PLM: full-rank Q with low-rank compressed KV (kv_lora_rank) and a fused
                    // legacy-style wkv_b; the FFN has no gate (up/down only).
5125                    const int64_t n_embd_head_qk_rope = hparams.n_rot;
5126                    const int64_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot;
5127                    const int64_t kv_lora_rank = hparams.n_lora_kv;
5128
5129                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
5130
5131                    // output
5132                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
5133                    // no separate output.weight: the head is always tied to the token embedding
5134                    output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
5135
5136                    for (int i = 0; i < n_layer; ++i) {
5137                        auto & layer = layers[i];
5138
5139                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
5140
                        // KV down-projection carries the shared RoPE part (+ n_embd_head_qk_rope)
5141                        layer.wq        = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
5142                        layer.wkv_a_mqa = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_MQA, "weight", i), {n_embd, kv_lora_rank + (n_embd_head_qk_rope)}, 0);
5143                        layer.attn_kv_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_NORM, "weight", i), {kv_lora_rank}, 0);
5144                        layer.wkv_b     = create_tensor(tn(LLM_TENSOR_ATTN_KV_B,     "weight", i), {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)}, 0);
5145                        layer.wo        = create_tensor(tn(LLM_TENSOR_ATTN_OUT,      "weight", i), {              n_head * (                      n_embd_head_v), n_embd}, 0);
5146
                        // gate-less FFN: only up and down projections are loaded
5147                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
5148                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
5149                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
5150                    }
5151                } break;
            case LLM_ARCH_BITNET:
                {
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);

                    // output
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);

                    for (int i = 0; i < n_layer; ++i) {
                        auto & layer = layers[i];

                        // BitNet adds a sub-norm after attention and after the FFN activation,
                        // plus an optional scalar "scale" tensor ({1}) next to every projection
                        // (used to rescale the ternary/quantized weights; absent in some exports)
                        layer.attn_norm     = create_tensor(tn(LLM_TENSOR_ATTN_NORM,     "weight", i), {n_embd}, 0);
                        layer.attn_sub_norm = create_tensor(tn(LLM_TENSOR_ATTN_SUB_NORM, "weight", i), {n_embd}, 0);

                        layer.wq       = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd}, 0);
                        layer.wq_scale = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "scale",  i), {1}, TENSOR_NOT_REQUIRED);
                        layer.wk       = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa}, 0);
                        layer.wk_scale = create_tensor(tn(LLM_TENSOR_ATTN_K,   "scale",  i), {1}, TENSOR_NOT_REQUIRED);
                        layer.wv       = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa}, 0);
                        layer.wv_scale = create_tensor(tn(LLM_TENSOR_ATTN_V,   "scale",  i), {1}, TENSOR_NOT_REQUIRED);
                        layer.wo       = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
                        layer.wo_scale = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "scale",  i), {1}, TENSOR_NOT_REQUIRED);

                        layer.ffn_norm     = create_tensor(tn(LLM_TENSOR_FFN_NORM,     "weight", i), {n_embd}, 0);
                        layer.ffn_sub_norm = create_tensor(tn(LLM_TENSOR_FFN_SUB_NORM, "weight", i), {n_ff}, 0);

                        layer.ffn_gate       = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
                        layer.ffn_gate_scale = create_tensor(tn(LLM_TENSOR_FFN_GATE, "scale",  i), {1}, TENSOR_NOT_REQUIRED);
                        layer.ffn_down       = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
                        layer.ffn_down_scale = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "scale",  i), {1}, TENSOR_NOT_REQUIRED);
                        layer.ffn_up         = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff}, 0);
                        layer.ffn_up_scale   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "scale",  i), {1}, TENSOR_NOT_REQUIRED);
                    }
                } break;
            case LLM_ARCH_T5:
                {
                    // number of buckets for T5's learned relative position bias
                    const auto n_rel_attn_bkts = hparams.n_rel_attn_bkts;

                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);

                    // output (separate final norms for encoder and decoder stacks)
                    output_norm_enc = create_tensor(tn(LLM_TENSOR_ENC_OUTPUT_NORM, "weight"), {n_embd}, 0);
                    output_norm     = create_tensor(tn(LLM_TENSOR_DEC_OUTPUT_NORM, "weight"), {n_embd}, 0);

                    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
                    // if output is NULL, init from the input tok embed
                    if (output == NULL) {
                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
                    }

                    // n_layer:     number of encoder_layers
                    // dec_n_layer: number of decoder_layers
                    // encoder and decoder share the layers[] vector: entry i holds the
                    // *_enc fields of encoder layer i and the plain fields of decoder layer i,
                    // so the vector must be large enough for the deeper of the two stacks
                    const int dec_n_layer = hparams.dec_n_layer;
                    if (dec_n_layer > n_layer) {
                        layers.resize(dec_n_layer);
                    }

                    // load encoder layers
                    for (int i = 0; i < n_layer; ++i) {
                        auto & layer = layers[i];

                        layer.attn_norm_enc  = create_tensor(tn(LLM_TENSOR_ENC_ATTN_NORM,  "weight", i), {n_embd}, 0);
                        // relative position bias is optional per layer (typically only present where the checkpoint provides it)
                        layer.attn_rel_b_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_REL_B, "weight", i), {n_head, n_rel_attn_bkts}, TENSOR_NOT_REQUIRED);

                        layer.wq_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_Q,   "weight", i), {n_embd, n_embd_k_gqa}, 0);
                        layer.wk_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_K,   "weight", i), {n_embd, n_embd_k_gqa}, 0);
                        layer.wv_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_V,   "weight", i), {n_embd, n_embd_v_gqa}, 0);
                        layer.wo_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_OUT, "weight", i), {n_embd_v_gqa, n_embd}, 0);

                        // gate is optional: plain T5 uses ReLU (no gate), T5 v1.1 ("gated-gelu") has one
                        layer.ffn_norm_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_NORM, "weight", i), {n_embd}, 0);
                        layer.ffn_gate_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_GATE, "weight", i), {n_embd,   n_ff}, TENSOR_NOT_REQUIRED);
                        layer.ffn_down_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
                        layer.ffn_up_enc   = create_tensor(tn(LLM_TENSOR_ENC_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
                    }

                    // load decoder layers
                    for (int i = 0; i < dec_n_layer; ++i) {
                        auto & layer = layers[i];

                        layer.attn_norm  = create_tensor(tn(LLM_TENSOR_DEC_ATTN_NORM,  "weight", i), {n_embd}, 0);
                        layer.attn_rel_b = create_tensor(tn(LLM_TENSOR_DEC_ATTN_REL_B, "weight", i), {n_head, n_rel_attn_bkts}, TENSOR_NOT_REQUIRED);

                        // decoder self-attention
                        layer.wq = create_tensor(tn(LLM_TENSOR_DEC_ATTN_Q,   "weight", i), {n_embd, n_embd_k_gqa}, 0);
                        layer.wk = create_tensor(tn(LLM_TENSOR_DEC_ATTN_K,   "weight", i), {n_embd, n_embd_k_gqa}, 0);
                        layer.wv = create_tensor(tn(LLM_TENSOR_DEC_ATTN_V,   "weight", i), {n_embd, n_embd_v_gqa}, 0);
                        layer.wo = create_tensor(tn(LLM_TENSOR_DEC_ATTN_OUT, "weight", i), {n_embd_v_gqa, n_embd}, 0);

                        // decoder cross-attention over the encoder output
                        layer.attn_norm_cross  = create_tensor(tn(LLM_TENSOR_DEC_CROSS_ATTN_NORM,  "weight", i), {n_embd}, 0);
                        // this tensor seems to be unused in HF transformers implementation
                        layer.attn_rel_b_cross = create_tensor(tn(LLM_TENSOR_DEC_CROSS_ATTN_REL_B, "weight", i), {n_head, n_rel_attn_bkts}, TENSOR_NOT_REQUIRED);

                        layer.wq_cross = create_tensor(tn(LLM_TENSOR_DEC_CROSS_ATTN_Q,   "weight", i), {n_embd, n_embd_k_gqa}, 0);
                        layer.wk_cross = create_tensor(tn(LLM_TENSOR_DEC_CROSS_ATTN_K,   "weight", i), {n_embd, n_embd_k_gqa}, 0);
                        layer.wv_cross = create_tensor(tn(LLM_TENSOR_DEC_CROSS_ATTN_V,   "weight", i), {n_embd, n_embd_v_gqa}, 0);
                        layer.wo_cross = create_tensor(tn(LLM_TENSOR_DEC_CROSS_ATTN_OUT, "weight", i), {n_embd_v_gqa, n_embd}, 0);

                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_DEC_FFN_NORM, "weight", i), {n_embd}, 0);
                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_DEC_FFN_GATE, "weight", i), {n_embd,   n_ff}, TENSOR_NOT_REQUIRED);
                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_DEC_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_DEC_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
                    }
                } break;
            case LLM_ARCH_T5ENCODER:
                {
                    // encoder-only T5: same layout as the T5 encoder stack, no decoder tensors
                    const auto n_rel_attn_bkts = hparams.n_rel_attn_bkts;

                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);

                    // output
                    output_norm_enc = create_tensor(tn(LLM_TENSOR_ENC_OUTPUT_NORM, "weight"), {n_embd}, 0);
                    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
                    // if output is NULL, init from the input tok embed
                    if (output == NULL) {
                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
                    }

                    for (int i = 0; i < n_layer; ++i) {
                        auto & layer = layers[i];

                        layer.attn_norm_enc  = create_tensor(tn(LLM_TENSOR_ENC_ATTN_NORM,  "weight", i), {n_embd}, 0);
                        // relative position bias is optional per layer
                        layer.attn_rel_b_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_REL_B, "weight", i), {n_head, n_rel_attn_bkts}, TENSOR_NOT_REQUIRED);

                        layer.wq_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_Q,   "weight", i), {n_embd, n_embd_k_gqa}, 0);
                        layer.wk_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_K,   "weight", i), {n_embd, n_embd_k_gqa}, 0);
                        layer.wv_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_V,   "weight", i), {n_embd, n_embd_v_gqa}, 0);
                        layer.wo_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_OUT, "weight", i), {n_embd_v_gqa, n_embd}, 0);

                        // gate is optional (present only for gated-activation variants)
                        layer.ffn_norm_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_NORM, "weight", i), {n_embd}, 0);
                        layer.ffn_gate_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_GATE, "weight", i), {n_embd,   n_ff}, TENSOR_NOT_REQUIRED);
                        layer.ffn_down_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
                        layer.ffn_up_enc   = create_tensor(tn(LLM_TENSOR_ENC_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
                    }
                } break;
            case LLM_ARCH_JAIS:
                {
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);

                    // output (all norms and projections carry biases in this arch)
                    output_norm   = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
                    output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd}, 0);
                    output        = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);

                    for (int i = 0; i < n_layer; ++i) {
                        auto & layer = layers[i];

                        layer.attn_norm   = create_tensor(tn(LLM_TENSOR_ATTN_NORM,   "weight", i), {n_embd}, 0);
                        layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM,   "bias", i),   {n_embd}, 0);

                        // fused QKV projection: Q (n_embd) followed by K and V (n_embd_gqa each)
                        layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
                        layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i),   {n_embd + 2*n_embd_gqa}, 0);

                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
                        layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i),   {n_embd}, 0);

                        layer.ffn_norm   = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
                        layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i),   {n_embd}, 0);

                        // gated FFN with biases on all three projections
                        layer.ffn_down   = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
                        layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i),   {n_embd}, 0);

                        layer.ffn_gate   = create_tensor(tn(LLM_TENSOR_FFN_GATE,   "weight", i), {n_embd, n_ff}, 0);
                        layer.ffn_gate_b = create_tensor(tn(LLM_TENSOR_FFN_GATE,   "bias", i),   {n_ff}, 0);

                        layer.ffn_up     = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff}, 0);
                        layer.ffn_up_b   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "bias", i),   {n_ff}, 0);
                    }
                } break;
            case LLM_ARCH_CHATGLM:
                {
                    tok_embd   = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD,      "weight"), {n_embd, n_vocab}, 0);

                    // output
                    output_norm   = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
                    output        = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
                    // if output is NULL, init from the input tok embed
                    if (output == NULL) {
                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
                    }

                    for (int i = 0; i < n_layer; ++i) {
                        auto & layer = layers[i];

                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
                        // newer exports may ship separate Q/K/V instead of the fused QKV tensor,
                        // so the fused form is optional and we fall back below when it is absent
                        layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
                        layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i),   {n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);

                        if (layer.wqkv == nullptr) {
                            // split Q/K/V path; biases are optional here
                            layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
                            layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_k_gqa}, 0);
                            layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_v_gqa}, 0);
                            layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
                            layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
                            layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
                        }

                        layer.wo   = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);

                        layer.ffn_norm   = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);

                        // gate and up projections are fused into a single tensor, hence n_ff * 2
                        layer.ffn_up     = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff * 2}, 0);

                        layer.ffn_down   = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
                    }
                } break;
            case LLM_ARCH_GLM4:
                {
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);

                    // output
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
                    // if output is NULL, init from the input tok embed
                    if (output == NULL) {
                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
                    }

                    for (int i = 0; i < n_layer; ++i) {
                        auto & layer = layers[i];

                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
                        // fused QKV is optional; fall back to split Q/K/V when absent (same as CHATGLM)
                        layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
                        layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i),   {n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);

                        if (layer.wqkv == nullptr) {
                            layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
                            layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_k_gqa}, 0);
                            layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_v_gqa}, 0);
                            layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
                            layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
                            layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
                        }

                        layer.wo   = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);

                        // GLM4 adds post-norms after both the attention and FFN sub-blocks
                        layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);

                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
                        // fused gate+up projection, hence n_ff * 2
                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff * 2}, 0);

                        layer.ffn_post_norm  = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
                    }
                } break;
            case LLM_ARCH_GLM4_MOE:
                {
                    const int64_t n_expert        = hparams.n_expert;
                    const int64_t n_expert_used   = hparams.n_expert_used;
                    const int64_t n_expert_shared = hparams.n_expert_shared;

                    GGML_ASSERT(hparams.n_expert > 0 && "n_expert must be > 0 for GLM4_MOE MoE layers");
                    GGML_ASSERT(hparams.n_expert_used > 0 && "n_expert_used must be > 0 for GLM4_MOE MoE layers");

                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);

                    // output
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
                    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED);
                    // if output is NULL, init from the input tok embed
                    if (output == NULL) {
                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, TENSOR_DUPLICATED);
                    }

                    // Load ALL tensors including NextN layer to satisfy total tensor count
                    // but only PROCESS up to last layer (skipping final NextN layer) in forward pass
                    for (int i = 0; i < n_layer; ++i) {
                        // the trailing nextn_predict_layers are NextN/MTP layers: their tensors
                        // are registered (so the tensor count matches the file) but marked
                        // TENSOR_SKIP so they are not loaded for inference
                        int flags = 0;
                        if (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers) {
                            // skip all tensors in the NextN layers
                            flags |= TENSOR_SKIP;
                        }

                        auto & layer = layers[i];

                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, flags);

                        // GLM-style attention with bias terms
                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), { n_embd, n_embd_head_k * n_head }, flags);
                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), { n_embd, n_embd_k_gqa }, flags);
                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), { n_embd, n_embd_v_gqa }, flags);
                        layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), { n_embd_head_k * n_head }, TENSOR_NOT_REQUIRED | flags);
                        layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), { n_embd_k_gqa }, TENSOR_NOT_REQUIRED | flags);
                        layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), { n_embd_v_gqa }, TENSOR_NOT_REQUIRED | flags);

                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_k * n_head, n_embd }, flags);

                        // K/Q norm tensors (optional for GLM-4.5 355B variant)
                        layer.attn_q_norm = create_tensor(
                            tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), { n_embd_head_k }, TENSOR_NOT_REQUIRED | flags);
                        layer.attn_k_norm = create_tensor(
                            tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), { n_embd_head_k }, TENSOR_NOT_REQUIRED | flags);

                        layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), { n_embd }, flags);

                        // Check if this layer uses MoE or dense FFN based on n_layer_dense_lead
                        // GLM 4.5 uses hybrid architecture: layer 0 is dense, layers 1+ are MoE
                        const bool use_moe = (static_cast<uint32_t>(i) >= hparams.n_layer_dense_lead);

                        if (use_moe) {
                            // MoE layers: router gate plus per-expert bias added to routing probs
                            layer.ffn_gate_inp =
                                create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), { n_embd, n_expert }, flags);
                            layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), { n_expert }, flags);

                            // MoE branch; per-expert FFN width falls back to n_ff / n_expert_used when not given
                            const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used;

                            layer.ffn_gate_exps = create_tensor(
                                tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert }, flags);
                            layer.ffn_down_exps = create_tensor(
                                tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff_exp, n_embd, n_expert }, flags);
                            layer.ffn_up_exps = create_tensor(
                                tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert }, flags);

                            // Shared expert (always active alongside the routed experts)
                            if (n_expert_shared > 0) {
                                const int64_t n_ff_shexp = n_ff_exp * n_expert_shared;
                                layer.ffn_gate_shexp = create_tensor(
                                    tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), { n_embd, n_ff_shexp }, flags);
                                layer.ffn_down_shexp = create_tensor(
                                    tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { n_ff_shexp, n_embd }, flags);
                                layer.ffn_up_shexp = create_tensor(
                                    tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), { n_embd, n_ff_shexp }, flags);
                            }
                        } else {
                            // Dense layers (first k layers) - GLM uses separate gate/up projections
                            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), { n_embd, n_ff }, flags);
                            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd }, flags);
                            layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), { n_embd, n_ff }, flags);
                        }

                        // NextN/MTP tensors (preserved but unused) - conditionally load for last nextn_predict_layers
                        if (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers) {
                            layer.nextn.eh_proj          = create_tensor(tn(LLM_TENSOR_NEXTN_EH_PROJ, "weight", i), { 2 * n_embd, n_embd }, flags);
                            layer.nextn.enorm            = create_tensor(tn(LLM_TENSOR_NEXTN_ENORM, "weight", i), { n_embd }, flags);
                            layer.nextn.hnorm            = create_tensor(tn(LLM_TENSOR_NEXTN_HNORM, "weight", i), { n_embd }, flags);

                            // Optional tensors
                            layer.nextn.embed_tokens     = create_tensor(tn(LLM_TENSOR_NEXTN_EMBED_TOKENS, "weight", i), { n_embd, n_vocab }, flags | TENSOR_NOT_REQUIRED);
                            layer.nextn.shared_head_head = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, "weight", i), { n_embd, n_vocab }, flags | TENSOR_NOT_REQUIRED);
                            layer.nextn.shared_head_norm = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, "weight", i), { n_embd }, flags | TENSOR_NOT_REQUIRED);
                        }
                    }
                }
                break;
            case LLM_ARCH_NEMOTRON:
                {
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);

                    // output (norms carry biases in this arch; LM head is always a separate tensor)
                    output_norm   = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
                    output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
                    output        = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);

                    for (int i = 0; i < n_layer; ++i) {
                        auto & layer = layers[i];

                        layer.attn_norm   = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
                        layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);

                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd}, 0);
                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa}, 0);
                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa}, 0);
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);

                        // optional bias tensors
                        layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "bias", i), {n_embd},     TENSOR_NOT_REQUIRED);
                        layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
                        layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
                        layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd},     TENSOR_NOT_REQUIRED);

                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
                        layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);

                        // FFN has no gate projection (up/down only)
                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);

                        // optional MLP bias
                        layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
                        layer.ffn_up_b   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
                    }
                } break;
5532            case LLM_ARCH_NEMOTRON_H:
5533            case LLM_ARCH_NEMOTRON_H_MOE:
5534                {
5535                    // mamba2 Mixer SSM params
5536                    // NOTE: int64_t for tensor dimensions
5537                    const int64_t d_conv     = hparams.ssm_d_conv;
5538                    const int64_t d_inner    = hparams.ssm_d_inner;
5539                    const int64_t d_state    = hparams.ssm_d_state;
5540                    const int64_t n_ssm_head = hparams.ssm_dt_rank;
5541                    const int64_t n_group    = hparams.ssm_n_group;
5542                    const int64_t d_in_proj  = 2*d_inner + 2*n_group*d_state + n_ssm_head;
5543
5544                    // embeddings
5545                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
5546
5547                    // output
5548                    {
5549                        output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
5550                        output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
5551                        // if output is NULL, init from the input tok embed, duplicated to allow offloading
5552                        if (output == NULL) {
5553                            output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
5554                        }
5555                    }
5556
5557                    for (int i = 0; i < n_layer; ++i) {
5558                        auto & layer = layers[i];
5559
5560                        // all blocks use the attn norm
5561                        layer.attn_norm  = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
5562
5563                        if (hparams.is_recurrent(i)) {
5564                            // ssm layers
5565                            layer.ssm_in = create_tensor(tn(LLM_TENSOR_SSM_IN, "weight", i), {n_embd, d_in_proj}, 0);
5566
5567                            layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), {d_conv, d_inner + 2*n_group*d_state}, 0);
5568                            layer.ssm_conv1d_b = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "bias", i), {d_inner + 2*n_group*d_state}, TENSOR_NOT_REQUIRED);
5569
5570                            layer.ssm_dt_b = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), {n_ssm_head}, 0);
5571
5572                            // no "weight" suffix for these
5573                            layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), {1, n_ssm_head}, 0);
5574                            layer.ssm_d = create_tensor(tn(LLM_TENSOR_SSM_D, i), {1, n_ssm_head}, 0);
5575
5576                            layer.ssm_norm = create_tensor(tn(LLM_TENSOR_SSM_NORM, "weight", i), {d_inner / n_group, n_group}, 0);
5577
5578                            // out_proj
5579                            layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), {d_inner, n_embd}, 0);
5580                        } else if (hparams.n_ff(i) == 0) {
5581                            // attention layers (with optional bias)
5582                            const int64_t n_head_i = hparams.n_head(i);
5583                            const int64_t n_embd_k_gqa_i = hparams.n_embd_k_gqa(i);
5584                            const int64_t n_embd_v_gqa_i = hparams.n_embd_v_gqa(i);
5585                            layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd_head_k * n_head_i}, 0);
5586                            layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_k_gqa_i}, 0);
5587                            layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_v_gqa_i}, 0);
5588                            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head_i, n_embd}, 0);
5589                            layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "bias",   i), {n_embd},         TENSOR_NOT_REQUIRED);
5590                            layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "bias",   i), {n_embd_k_gqa_i}, TENSOR_NOT_REQUIRED);
5591                            layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "bias",   i), {n_embd_v_gqa_i}, TENSOR_NOT_REQUIRED);
5592                            layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias",   i), {n_embd},         TENSOR_NOT_REQUIRED);
5593                        }  else {
5594                            if (n_expert != 0) {
5595                                const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used;
5596                                const int64_t n_ff_shexp = hparams.n_ff_shexp;
5597
5598                                layer.ffn_gate_inp    = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP,  "weight", i), { n_embd, n_expert}, 0);
5599                                layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert         }, 0);
5600
5601                                // MoE branch
5602                                layer.ffn_down_exps   = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp,   n_embd, n_expert}, 0);
5603                                layer.ffn_up_exps     = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {  n_embd, n_ff_exp, n_expert}, 0);
5604
5605                                // Shared expert branch
5606                                layer.ffn_down_shexp  = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {n_ff_shexp, n_embd}, 0);
5607                                layer.ffn_up_shexp    = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP,   "weight", i), {n_embd, n_ff_shexp}, 0);
5608
5609                            } else {
5610                                // mlp layers
5611                                layer.ffn_down   = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  hparams.n_ff(i), n_embd}, 0);
5612                                layer.ffn_up     = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   hparams.n_ff(i)}, 0);
5613                                layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias",   i), {n_embd}, TENSOR_NOT_REQUIRED);
5614                                layer.ffn_up_b   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "bias",   i), {hparams.n_ff(i)}, TENSOR_NOT_REQUIRED);
5615                            }
5616                        }
5617                    }
5618                } break;
5619            case LLM_ARCH_EXAONE:
5620                {
5621                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
5622
5623                    // output
5624                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
5625                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
5626
5627                    // if output is NULL, init from the input tok embed
5628                    if (output == NULL) {
5629                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
5630                    }
5631
5632                    for (int i = 0; i < n_layer; ++i) {
5633                        auto & layer = layers[i];
5634
5635                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
5636
5637                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
5638                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_k_gqa}, 0);
5639                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_v_gqa}, 0);
5640                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
5641
5642                        layer.ffn_norm   = create_tensor(tn(LLM_TENSOR_FFN_NORM,   "weight", i), {n_embd}, 0);
5643                        layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
5644                        layer.ffn_gate   = create_tensor(tn(LLM_TENSOR_FFN_GATE,   "weight", i), {n_embd,   n_ff}, 0);
5645                        layer.ffn_down   = create_tensor(tn(LLM_TENSOR_FFN_DOWN,   "weight", i), {  n_ff, n_embd}, 0);
5646                        layer.ffn_up     = create_tensor(tn(LLM_TENSOR_FFN_UP,     "weight", i), {n_embd,   n_ff}, 0);
5647                    }
5648                } break;
5649            case LLM_ARCH_EXAONE4:
5650                {
5651                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
5652
5653                    // output
5654                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
5655                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
5656
5657                    // if output is NULL, init from the input tok embed
5658                    if (output == NULL) {
5659                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
5660                    }
5661
5662                    for (int i = 0; i < n_layer; ++i) {
5663                        auto & layer = layers[i];
5664
5665                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
5666                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_k_gqa}, 0);
5667                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_v_gqa}, 0);
5668                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
5669
5670                        layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
5671
5672                        layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
5673                        layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
5674                        layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
5675
5676                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
5677                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
5678                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
5679                        layer.ffn_post_norm  = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
5680                    }
5681                } break;
5682            case LLM_ARCH_EXAONE_MOE:
5683                {
5684                    const int64_t n_ff_exp       = hparams.n_ff_exp;
5685                    const int64_t n_expert       = hparams.n_expert;
5686                    const int64_t n_expert_used  = hparams.n_expert_used;
5687                    const int64_t n_ff_shexp     = hparams.n_ff_shexp;
5688                    const int64_t head_dim       = hparams.n_embd_head_k;
5689                    const int64_t n_qo_dim       = n_head * head_dim;
5690                    const int64_t n_kv_dim       = n_head_kv * head_dim;
5691
5692                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
5693
5694                    // output
5695                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
5696                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);
5697
5698                    if (output == NULL) {
5699                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
5700                    }
5701
5702                    for (int i = 0; i < n_layer; ++i) {
5703                        int flags = 0;
5704                        if (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers) {
5705                            // skip all tensors in the NextN layers
5706                            flags |= TENSOR_SKIP;
5707                        }
5708
5709                        auto & layer = layers[i];
5710                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_qo_dim}, flags);
5711                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_kv_dim}, flags);
5712                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_kv_dim}, flags);
5713                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_qo_dim, n_embd}, flags);
5714
5715                        layer.rope_freqs   = create_tensor(tn(LLM_TENSOR_ROPE_FREQS,  "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0) | flags);
5716
5717                        layer.attn_norm    = create_tensor(tn(LLM_TENSOR_ATTN_NORM,   "weight", i), {n_embd}, flags);
5718                        layer.attn_q_norm  = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, flags);
5719                        layer.attn_k_norm  = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, flags);
5720
5721                        layer.ffn_norm     = create_tensor(tn(LLM_TENSOR_FFN_NORM,    "weight", i), {n_embd}, flags);
5722
5723                        // dense layers for first n_layer_dense_lead layers or nextn_predict_layers layers at the end
5724                        if (i < (int) hparams.n_layer_dense_lead || (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers)) {
5725                            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, flags);
5726                            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, flags);
5727                            layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff}, flags);
5728                        } else {
5729                            layer.ffn_gate_inp    = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP,  "weight", i), {n_embd, n_expert}, flags);
5730                            layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, TENSOR_NOT_REQUIRED | flags);
5731
5732                            if (n_expert == 0) {
5733                                throw std::runtime_error("n_expert must be > 0");
5734                            }
5735                            if (n_expert_used == 0) {
5736                                throw std::runtime_error("n_expert_used must be > 0");
5737                            }
5738
5739                            layer.ffn_gate_exps  = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS,  "weight", i), {n_embd, n_ff_exp, n_expert}, flags);
5740                            layer.ffn_down_exps  = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS,  "weight", i), {n_ff_exp, n_embd, n_expert}, flags);
5741                            layer.ffn_up_exps    = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,    "weight", i), {n_embd, n_ff_exp, n_expert}, flags);
5742
5743                            layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_shexp}, flags);
5744                            layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {n_ff_shexp, n_embd}, flags);
5745                            layer.ffn_up_shexp   = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP,   "weight", i), {n_embd, n_ff_shexp}, flags);
5746                        }
5747
5748                        // NextN/MTP tensors (preserved but unused) - conditionally load for last nextn_predict_layers
5749                        if (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers) {
5750                            layer.nextn.eh_proj          = create_tensor(tn(LLM_TENSOR_NEXTN_EH_PROJ, "weight", i), {2 * n_embd, n_embd}, flags);
5751                            layer.nextn.enorm            = create_tensor(tn(LLM_TENSOR_NEXTN_ENORM,   "weight", i), {n_embd}, flags);
5752                            layer.nextn.hnorm            = create_tensor(tn(LLM_TENSOR_NEXTN_HNORM,   "weight", i), {n_embd}, flags);
5753
5754                            layer.nextn.shared_head_norm = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, "weight", i), {n_embd}, flags | TENSOR_NOT_REQUIRED);
5755                            layer.nextn.embed_tokens     = create_tensor(tn(LLM_TENSOR_NEXTN_EMBED_TOKENS,     "weight", i), {n_embd, n_vocab}, flags | TENSOR_NOT_REQUIRED);
5756                            layer.nextn.shared_head_head = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, "weight", i), {n_embd, n_vocab}, flags | TENSOR_NOT_REQUIRED);
5757                        }
5758                    }
5759                } break;
            case LLM_ARCH_RWKV6:
                {
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);

                    // Block 0, LN0
                    tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, 0);
                    tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd}, 0);

                    // output (lm_head is required for this architecture - no tok_embd fallback)
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
                    output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
                    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);

                    // bottleneck dims for the time-mix / time-decay low-rank projections
                    const int time_mix_extra_dim = hparams.time_mix_extra_dim;
                    const int time_decay_extra_dim = hparams.time_decay_extra_dim;
                    const int head_size = hparams.wkv_head_size;
                    const int attn_hidden_size = n_embd;
                    // NOTE(review): FFN size is taken from layer 0 and reused for all layers
                    const int ffn_size = hparams.n_ff_arr[0];

                    for (int i = 0; i < n_layer; ++i) {
                        auto & layer = layers[i];

                        // per-layer norms: attn_norm before time-mix, attn_norm_2 before channel-mix
                        layer.attn_norm   = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
                        layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i),   {n_embd}, 0);

                        layer.attn_norm_2   = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, 0);
                        layer.attn_norm_2_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "bias", i),   {n_embd}, 0);

                        // low-rank pair producing 5 stacked projections (matching the 5 lerp tensors below)
                        layer.time_mix_w1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W1, "weight", i), {n_embd, time_mix_extra_dim * 5}, 0);
                        layer.time_mix_w2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W2, "weight", i), {time_mix_extra_dim, n_embd, 5}, 0);

                        // token-shift interpolation weights: either the fused 4-D tensor or the
                        // five individual w/k/v/r/g tensors must be present (asserted below)
                        layer.time_mix_lerp_x = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_X, "weight", i), {n_embd, 1, 1}, 0);
                        layer.time_mix_lerp_w = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_W, "weight", i), {n_embd, 1, 1}, TENSOR_NOT_REQUIRED);
                        layer.time_mix_lerp_k = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_K, "weight", i), {n_embd, 1, 1}, TENSOR_NOT_REQUIRED);
                        layer.time_mix_lerp_v = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_V, "weight", i), {n_embd, 1, 1}, TENSOR_NOT_REQUIRED);
                        layer.time_mix_lerp_r = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_R, "weight", i), {n_embd, 1, 1}, TENSOR_NOT_REQUIRED);
                        layer.time_mix_lerp_g = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_G, "weight", i), {n_embd, 1, 1}, TENSOR_NOT_REQUIRED);
                        layer.time_mix_lerp_fused = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_FUSED, "weight", i), {n_embd, 1, 1, 5}, TENSOR_NOT_REQUIRED);
                        GGML_ASSERT(!(layer.time_mix_lerp_fused == NULL && layer.time_mix_lerp_w == NULL));

                        layer.time_mix_first = create_tensor(tn(LLM_TENSOR_TIME_MIX_FIRST, "weight", i), {head_size, n_embd / head_size}, 0);
                        layer.time_mix_decay = create_tensor(tn(LLM_TENSOR_TIME_MIX_DECAY, "weight", i), {n_embd}, 0);
                        // low-rank pair for the data-dependent decay
                        layer.time_mix_decay_w1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_DECAY_W1, "weight", i), {n_embd, time_decay_extra_dim}, 0);
                        layer.time_mix_decay_w2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_DECAY_W2, "weight", i), {time_decay_extra_dim, attn_hidden_size}, 0);
                        // K/V/R/G projections of the time-mix block
                        layer.time_mix_key = create_tensor(tn(LLM_TENSOR_TIME_MIX_KEY, "weight", i), {attn_hidden_size, n_embd}, 0);
                        layer.time_mix_value = create_tensor(tn(LLM_TENSOR_TIME_MIX_VALUE, "weight", i), {attn_hidden_size, n_embd}, 0);
                        layer.time_mix_receptance = create_tensor(tn(LLM_TENSOR_TIME_MIX_RECEPTANCE, "weight", i), {attn_hidden_size, n_embd}, 0);
                        layer.time_mix_gate = create_tensor(tn(LLM_TENSOR_TIME_MIX_GATE, "weight", i), {attn_hidden_size, n_embd}, 0);

                        // group/layer norm applied to the wkv output, then the output projection
                        layer.time_mix_ln = create_tensor(tn(LLM_TENSOR_TIME_MIX_LN, "weight", i), {n_embd}, 0);
                        layer.time_mix_ln_b = create_tensor(tn(LLM_TENSOR_TIME_MIX_LN, "bias", i), {n_embd}, 0);
                        layer.time_mix_output = create_tensor(tn(LLM_TENSOR_TIME_MIX_OUTPUT, "weight", i), {n_embd, attn_hidden_size}, 0);

                        // channel-mix (FFN-like) block: token-shift lerps + K/V/R projections
                        layer.channel_mix_lerp_k = create_tensor(tn(LLM_TENSOR_CHANNEL_MIX_LERP_K, "weight", i), {n_embd, 1, 1}, 0);
                        layer.channel_mix_lerp_r = create_tensor(tn(LLM_TENSOR_CHANNEL_MIX_LERP_R, "weight", i), {n_embd, 1, 1}, 0);

                        layer.channel_mix_key = create_tensor(tn(LLM_TENSOR_CHANNEL_MIX_KEY, "weight", i), {n_embd, ffn_size}, 0);
                        layer.channel_mix_value = create_tensor(tn(LLM_TENSOR_CHANNEL_MIX_VALUE, "weight", i), {ffn_size, n_embd}, 0);
                        layer.channel_mix_receptance = create_tensor(tn(LLM_TENSOR_CHANNEL_MIX_RECEPTANCE, "weight", i), {n_embd, n_embd}, 0);
                    }

                } break;
5822            case LLM_ARCH_RWKV6QWEN2:
5823                {
5824                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
5825
5826                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
5827                    output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, TENSOR_NOT_REQUIRED);
5828                    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
5829
5830                    const int time_mix_extra_dim = hparams.time_mix_extra_dim;
5831                    const int time_decay_extra_dim = hparams.time_decay_extra_dim;
5832                    const int head_size = hparams.wkv_head_size;
5833                    const int attn_hidden_size = n_embd;
5834                    const int n_head_kv = hparams.n_head_kv();
5835                    int attn_key_value_size;
5836                    if (n_head_kv == 0 || attn_hidden_size / head_size == n_head_kv) {
5837                        attn_key_value_size = attn_hidden_size;
5838                    } else {
5839                        attn_key_value_size = n_head_kv * head_size;
5840                    }
5841
5842                    for (int i = 0; i < n_layer; ++i) {
5843                        auto & layer = layers[i];
5844
5845                        layer.attn_norm   = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
5846
5847                        layer.time_mix_w1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W1, "weight", i), {n_embd, time_mix_extra_dim * 5}, 0);
5848                        layer.time_mix_w2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W2, "weight", i), {time_mix_extra_dim, n_embd, 5}, 0);
5849
5850                        layer.time_mix_lerp_x = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_X, "weight", i), {n_embd, 1, 1}, 0);
5851                        layer.time_mix_lerp_fused = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_FUSED, "weight", i), {n_embd, 1, 1, 5}, 0);
5852
5853                        layer.time_mix_first = create_tensor(tn(LLM_TENSOR_TIME_MIX_FIRST, "weight", i), {head_size, n_embd / head_size}, TENSOR_NOT_REQUIRED);
5854                        layer.time_mix_decay = create_tensor(tn(LLM_TENSOR_TIME_MIX_DECAY, "weight", i), {n_embd}, 0);
5855                        layer.time_mix_decay_w1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_DECAY_W1, "weight", i), {n_embd, time_decay_extra_dim}, 0);
5856                        layer.time_mix_decay_w2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_DECAY_W2, "weight", i), {time_decay_extra_dim, attn_hidden_size}, 0);
5857                        layer.time_mix_key = create_tensor(tn(LLM_TENSOR_TIME_MIX_KEY, "weight", i), {n_embd, attn_key_value_size}, 0);
5858                        layer.time_mix_value = create_tensor(tn(LLM_TENSOR_TIME_MIX_VALUE, "weight", i), {n_embd, attn_key_value_size}, 0);
5859                        layer.time_mix_receptance = create_tensor(tn(LLM_TENSOR_TIME_MIX_RECEPTANCE, "weight", i), {attn_hidden_size, n_embd}, 0);
5860                        layer.time_mix_gate = create_tensor(tn(LLM_TENSOR_TIME_MIX_GATE, "weight", i), {attn_hidden_size, n_embd}, 0);
5861                        // optional bias tensors
5862                        layer.time_mix_key_b = create_tensor(tn(LLM_TENSOR_TIME_MIX_KEY, "bias", i), {attn_key_value_size}, TENSOR_NOT_REQUIRED);
5863                        layer.time_mix_value_b = create_tensor(tn(LLM_TENSOR_TIME_MIX_VALUE, "bias", i), {attn_key_value_size}, TENSOR_NOT_REQUIRED);
5864                        layer.time_mix_receptance_b = create_tensor(tn(LLM_TENSOR_TIME_MIX_RECEPTANCE, "bias", i), {attn_hidden_size}, TENSOR_NOT_REQUIRED);
5865
5866                        layer.time_mix_output = create_tensor(tn(LLM_TENSOR_TIME_MIX_OUTPUT, "weight", i), {n_embd, attn_hidden_size}, 0);
5867
5868                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
5869
5870                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
5871                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
5872                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
5873                    }
5874                } break;
            case LLM_ARCH_RWKV7:
                {
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);

                    // Block 0, LN0
                    tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, 0);
                    tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd}, 0);

                    // output (lm_head is required for this architecture - no tok_embd fallback)
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
                    output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
                    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);

                    // bottleneck ranks of the low-rank (w1/w2-style) projection pairs below
                    const int n_lora_decay = hparams.n_lora_decay;
                    const int n_lora_iclr = hparams.n_lora_iclr;
                    const int n_lora_value_res_mix = hparams.n_lora_value_res_mix;
                    const int n_lora_gate = hparams.n_lora_gate;
                    const int attn_hidden_size = n_embd;
                    // NOTE(review): FFN size is taken from layer 0 and reused for all layers
                    const int ffn_size = hparams.n_ff_arr[0];

                    for (int i = 0; i < n_layer; ++i) {
                        auto & layer = layers[i];

                        // per-layer norms: attn_norm before time-mix, attn_norm_2 before channel-mix
                        layer.attn_norm   = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
                        layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i),   {n_embd}, 0);

                        layer.attn_norm_2   = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, 0);
                        layer.attn_norm_2_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "bias", i),   {n_embd}, 0);

                        // decay: base vector w0 plus low-rank pair w1/w2
                        layer.time_mix_w0 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W0, "weight", i), {n_embd}, 0);
                        layer.time_mix_w1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W1, "weight", i), {n_embd, n_lora_decay}, 0);
                        layer.time_mix_w2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W2, "weight", i), {n_lora_decay, n_embd}, 0);

                        // iclr ("a"): base vector a0 plus low-rank pair a1/a2
                        layer.time_mix_a0 = create_tensor(tn(LLM_TENSOR_TIME_MIX_A0, "weight", i), {n_embd}, 0);
                        layer.time_mix_a1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_A1, "weight", i), {n_embd, n_lora_iclr}, 0);
                        layer.time_mix_a2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_A2, "weight", i), {n_lora_iclr, n_embd}, 0);

                        // value residual mix ("v"): layer 0 stores it with the iclr rank instead of
                        // n_lora_value_res_mix
                        if (i == 0) {
                            // actually not used
                            layer.time_mix_v0 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V0, "weight", i), {n_embd}, 0);
                            layer.time_mix_v1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V1, "weight", i), {n_embd, n_lora_iclr}, 0);
                            layer.time_mix_v2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V2, "weight", i), {n_lora_iclr, n_embd}, 0);
                        } else {
                            layer.time_mix_v0 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V0, "weight", i), {n_embd}, 0);
                            layer.time_mix_v1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V1, "weight", i), {n_embd, n_lora_value_res_mix}, 0);
                            layer.time_mix_v2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V2, "weight", i), {n_lora_value_res_mix, n_embd}, 0);
                        }

                        // gate: low-rank pair only (no base vector)
                        layer.time_mix_g1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_G1, "weight", i), {n_embd, n_lora_gate}, 0);
                        layer.time_mix_g2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_G2, "weight", i), {n_lora_gate, n_embd}, 0);

                        // fused token-shift interpolation: 6 slices (vs 5 in RWKV6)
                        layer.time_mix_lerp_fused = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_FUSED, "weight", i), {n_embd, 1, 1, 6}, 0);

                        layer.time_mix_k_k = create_tensor(tn(LLM_TENSOR_TIME_MIX_K_K, "weight", i), {attn_hidden_size}, 0);
                        layer.time_mix_k_a = create_tensor(tn(LLM_TENSOR_TIME_MIX_K_A, "weight", i), {attn_hidden_size}, 0);
                        layer.time_mix_r_k = create_tensor(tn(LLM_TENSOR_TIME_MIX_R_K, "weight", i), {attn_hidden_size}, 0);

                        // K/V/R projections of the time-mix block
                        layer.time_mix_key = create_tensor(tn(LLM_TENSOR_TIME_MIX_KEY, "weight", i), {attn_hidden_size, n_embd}, 0);
                        layer.time_mix_value = create_tensor(tn(LLM_TENSOR_TIME_MIX_VALUE, "weight", i), {attn_hidden_size, n_embd}, 0);
                        layer.time_mix_receptance = create_tensor(tn(LLM_TENSOR_TIME_MIX_RECEPTANCE, "weight", i), {attn_hidden_size, n_embd}, 0);

                        // norm on the wkv output, then the output projection
                        layer.time_mix_ln = create_tensor(tn(LLM_TENSOR_TIME_MIX_LN, "weight", i), {n_embd}, 0);
                        layer.time_mix_ln_b = create_tensor(tn(LLM_TENSOR_TIME_MIX_LN, "bias", i), {n_embd}, 0);
                        layer.time_mix_output = create_tensor(tn(LLM_TENSOR_TIME_MIX_OUTPUT, "weight", i), {n_embd, attn_hidden_size}, 0);

                        // channel-mix block: single token-shift lerp (no "r" lerp, unlike RWKV6)
                        // and no receptance projection
                        layer.channel_mix_lerp_k = create_tensor(tn(LLM_TENSOR_CHANNEL_MIX_LERP_K, "weight", i), {n_embd, 1, 1}, 0);

                        layer.channel_mix_key = create_tensor(tn(LLM_TENSOR_CHANNEL_MIX_KEY, "weight", i), {n_embd, ffn_size}, 0);
                        layer.channel_mix_value = create_tensor(tn(LLM_TENSOR_CHANNEL_MIX_VALUE, "weight", i), {ffn_size, n_embd}, 0);
                    }

                } break;
5947            case LLM_ARCH_ARWKV7:
5948                {
5949                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
5950
5951                    // output
5952                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
5953                    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
5954
5955                    const int n_lora_decay = hparams.n_lora_decay;
5956                    const int n_lora_iclr = hparams.n_lora_iclr;
5957                    const int n_lora_value_res_mix = hparams.n_lora_value_res_mix;
5958                    const int n_lora_gate = hparams.n_lora_gate;
5959                    const int attn_hidden_size = n_embd;
5960
5961                    for (int i = 0; i < n_layer; ++i) {
5962                        auto & layer = layers[i];
5963
5964                        layer.attn_norm   = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
5965
5966                        layer.time_mix_w0 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W0, "weight", i), {n_embd}, 0);
5967                        layer.time_mix_w1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W1, "weight", i), {n_embd, n_lora_decay}, 0);
5968                        layer.time_mix_w2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W2, "weight", i), {n_lora_decay, n_embd}, 0);
5969
5970                        layer.time_mix_a0 = create_tensor(tn(LLM_TENSOR_TIME_MIX_A0, "weight", i), {n_embd}, 0);
5971                        layer.time_mix_a1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_A1, "weight", i), {n_embd, n_lora_iclr}, 0);
5972                        layer.time_mix_a2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_A2, "weight", i), {n_lora_iclr, n_embd}, 0);
5973
5974                        if (i == 0) {
5975                            // actually not used
5976                            layer.time_mix_v0 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V0, "weight", i), {n_embd}, 0);
5977                            layer.time_mix_v1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V1, "weight", i), {n_embd, n_lora_iclr}, 0);
5978                            layer.time_mix_v2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V2, "weight", i), {n_lora_iclr, n_embd}, 0);
5979                        } else {
5980                            layer.time_mix_v0 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V0, "weight", i), {n_embd}, 0);
5981                            layer.time_mix_v1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V1, "weight", i), {n_embd, n_lora_value_res_mix}, 0);
5982                            layer.time_mix_v2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V2, "weight", i), {n_lora_value_res_mix, n_embd}, 0);
5983                        }
5984
5985                        layer.time_mix_g1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_G1, "weight", i), {n_embd, n_lora_gate}, TENSOR_NOT_REQUIRED);
5986                        layer.time_mix_g2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_G2, "weight", i), {n_lora_gate, n_embd}, TENSOR_NOT_REQUIRED);
5987
5988                        try {
5989                            layer.time_mix_lerp_fused = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_FUSED, "weight", i), {n_embd, 1, 1, 6}, 0);
5990                        } catch(std::runtime_error & e) {
5991                            // ARWKV models may not have gate tensors
5992                            layer.time_mix_lerp_fused = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_FUSED, "weight", i), {n_embd, 1, 1, 5}, 0);
5993                        }
5994
5995                        layer.time_mix_k_k = create_tensor(tn(LLM_TENSOR_TIME_MIX_K_K, "weight", i), {attn_hidden_size}, 0);
5996                        layer.time_mix_k_a = create_tensor(tn(LLM_TENSOR_TIME_MIX_K_A, "weight", i), {attn_hidden_size}, 0);
5997                        layer.time_mix_r_k = create_tensor(tn(LLM_TENSOR_TIME_MIX_R_K, "weight", i), {attn_hidden_size}, 0);
5998
5999                        layer.time_mix_key = create_tensor(tn(LLM_TENSOR_TIME_MIX_KEY, "weight", i), {attn_hidden_size, n_embd}, 0);
6000                        layer.time_mix_value = create_tensor(tn(LLM_TENSOR_TIME_MIX_VALUE, "weight", i), {attn_hidden_size, n_embd}, 0);
6001                        layer.time_mix_receptance = create_tensor(tn(LLM_TENSOR_TIME_MIX_RECEPTANCE, "weight", i), {attn_hidden_size, n_embd}, 0);
6002
6003                        layer.time_mix_ln = create_tensor(tn(LLM_TENSOR_TIME_MIX_LN, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
6004                        layer.time_mix_ln_b = create_tensor(tn(LLM_TENSOR_TIME_MIX_LN, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
6005                        layer.time_mix_output = create_tensor(tn(LLM_TENSOR_TIME_MIX_OUTPUT, "weight", i), {n_embd, attn_hidden_size}, 0);
6006
6007                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
6008
6009                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
6010                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
6011                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
6012                    }
6013
6014                } break;
6015            case LLM_ARCH_CHAMELEON:
6016                {
6017                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
6018
6019                    // output
6020                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
6021                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
6022                    // if output is NULL, init from the input tok embed
6023                    if (output == NULL) {
6024                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
6025                    }
6026
6027                    for (int i = 0; i < n_layer; ++i) {
6028                        auto & layer = layers[i];
6029
6030                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
6031                        layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k, n_head}, 0);
6032                        layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k, n_head_kv}, 0);
6033                        layer.attn_q_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i),  {n_embd_head_k, n_head}, TENSOR_NOT_REQUIRED);
6034                        layer.attn_k_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "bias", i),  {n_embd_head_k, n_head_kv}, TENSOR_NOT_REQUIRED);
6035
6036                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd}, 0);
6037                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa}, 0);
6038                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa}, 0);
6039                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
6040
6041                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
6042
6043                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
6044                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
6045                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
6046                    }
6047                } break;
            case LLM_ARCH_WAVTOKENIZER_DEC:
                {
                    // audio-codec decoder: tok_embd -> conv1d -> posnet stack -> convnext stack -> output projection
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {hparams.n_embd, n_vocab}, 0);

                    // input conv1d: kernel size 7, maps n_embd channels to posnet.n_embd channels
                    conv1d   = create_tensor(tn(LLM_TENSOR_CONV1D, "weight"), {7, hparams.n_embd, hparams.posnet.n_embd}, 0);
                    conv1d_b = create_tensor(tn(LLM_TENSOR_CONV1D, "bias"),   {1, hparams.posnet.n_embd}, 0);

                    // posnet
                    {
                        // deliberately shadows the outer n_embd: all posnet tensors
                        // use the posnet embedding width
                        const int64_t n_embd = hparams.posnet.n_embd;

                        for (uint32_t i = 0; i < hparams.posnet.n_layer; ++i) {
                            // the generic per-layer array carries a posnet sub-struct for this arch
                            auto & layer = layers[i].posnet;

                            // posnet:
                            //
                            //  - resnet
                            //  - resnet
                            //  - attn
                            //  - resnet
                            //  - resnet
                            //  - norm
                            //
                            switch (i) {
                                case 0:
                                case 1:
                                case 3:
                                case 4:
                                    {
                                        // resnet block: norm1 -> conv1 (k=3) -> norm2 -> conv2 (k=3)
                                        layer.norm1   = create_tensor(tn(LLM_TENSOR_POS_NET_NORM1, "weight", i), {1, n_embd}, 0);
                                        layer.norm1_b = create_tensor(tn(LLM_TENSOR_POS_NET_NORM1, "bias",   i), {1, n_embd}, 0);

                                        layer.conv1   = create_tensor(tn(LLM_TENSOR_POS_NET_CONV1, "weight", i), {3, n_embd, n_embd}, 0);
                                        layer.conv1_b = create_tensor(tn(LLM_TENSOR_POS_NET_CONV1, "bias",   i), {1, n_embd}, 0);

                                        layer.norm2   = create_tensor(tn(LLM_TENSOR_POS_NET_NORM2, "weight", i), {1, n_embd}, 0);
                                        layer.norm2_b = create_tensor(tn(LLM_TENSOR_POS_NET_NORM2, "bias",   i), {1, n_embd}, 0);

                                        layer.conv2   = create_tensor(tn(LLM_TENSOR_POS_NET_CONV2, "weight", i), {3, n_embd, n_embd}, 0);
                                        layer.conv2_b = create_tensor(tn(LLM_TENSOR_POS_NET_CONV2, "bias",   i), {1, n_embd}, 0);
                                    } break;
                                case 2:
                                    {
                                        // self-attention block: norm + q/k/v/out projections (1x1 convs), all with biases
                                        layer.attn_norm   = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "weight", i), {1, n_embd}, 0);
                                        layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "bias",   i), {1, n_embd}, 0);

                                        layer.attn_q      = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_Q,    "weight", i), {1, n_embd, n_embd}, 0);
                                        layer.attn_q_b    = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_Q,    "bias",   i), {1, n_embd}, 0);

                                        layer.attn_k      = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_K,    "weight", i), {1, n_embd, n_embd}, 0);
                                        layer.attn_k_b    = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_K,    "bias",   i), {1, n_embd}, 0);

                                        layer.attn_v      = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_V,    "weight", i), {1, n_embd, n_embd}, 0);
                                        layer.attn_v_b    = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_V,    "bias",   i), {1, n_embd}, 0);

                                        layer.attn_o      = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_OUT,  "weight", i), {1, n_embd, n_embd}, 0);
                                        layer.attn_o_b    = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_OUT,  "bias",   i), {1, n_embd}, 0);
                                    } break;
                                case 5:
                                    {
                                        // final posnet norm; NOTE(review): reuses the ATTN_NORM tensor
                                        // name for layer 5 — presumably matches the converter's naming; verify
                                        layer.norm   = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "weight", i), {1, n_embd}, 0);
                                        layer.norm_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "bias",   i), {1, n_embd}, 0);
                                    } break;
                                default: GGML_ABORT("unknown posnet layer");
                            };
                        }
                    }

                    // the posnet output width must match the convnext input width
                    GGML_ASSERT(hparams.posnet.n_embd == hparams.convnext.n_embd);

                    tok_norm   = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {hparams.posnet.n_embd}, 0);
                    tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"),   {hparams.posnet.n_embd}, 0);

                    // convnext
                    {
                        // shadows the outer n_embd again, now with the convnext width
                        const int64_t n_embd = hparams.convnext.n_embd;

                        for (uint32_t i = 0; i < hparams.convnext.n_layer; ++i) {
                            auto & layer = layers[i].convnext;

                            // depthwise conv (k=7) -> norm -> pointwise MLP (pw1/pw2) -> learned scale (gamma)
                            layer.dw     = create_tensor(tn(LLM_TENSOR_CONVNEXT_DW,    "weight", i), {7, 1, n_embd}, 0);
                            layer.dw_b   = create_tensor(tn(LLM_TENSOR_CONVNEXT_DW,    "bias",   i), {1, n_embd}, 0);

                            layer.norm   = create_tensor(tn(LLM_TENSOR_CONVNEXT_NORM,  "weight", i), {n_embd}, 0);
                            layer.norm_b = create_tensor(tn(LLM_TENSOR_CONVNEXT_NORM,  "bias",   i), {n_embd}, 0);

                            layer.pw1    = create_tensor(tn(LLM_TENSOR_CONVNEXT_PW1,   "weight", i), {n_embd, n_ff}, 0);
                            layer.pw1_b  = create_tensor(tn(LLM_TENSOR_CONVNEXT_PW1,   "bias",   i), {n_ff}, 0);

                            layer.pw2    = create_tensor(tn(LLM_TENSOR_CONVNEXT_PW2,   "weight", i), {n_ff, n_embd}, 0);
                            layer.pw2_b  = create_tensor(tn(LLM_TENSOR_CONVNEXT_PW2,   "bias",   i), {n_embd}, 0);

                            layer.gamma  = create_tensor(tn(LLM_TENSOR_CONVNEXT_GAMMA, "weight", i), {n_embd}, 0);
                        }

                        // output
                        output_norm   = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
                        output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd}, 0);
                    }

                    // final projection from the convnext width to the model's output width
                    output   = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {hparams.convnext.n_embd, hparams.n_embd_out()}, 0);
                    output_b = create_tensor(tn(LLM_TENSOR_OUTPUT, "bias"),   {hparams.n_embd_out()}, 0);
                } break;
6151            case LLM_ARCH_BAILINGMOE:
6152                {
6153                    const int64_t n_ff_exp            = hparams.n_ff_exp;
6154                    const int64_t n_expert_shared     = hparams.n_expert_shared;
6155
6156                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
6157
6158                    // output
6159                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
6160                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);
6161
6162                    for (int i = 0; i < n_layer; ++i) {
6163                        auto & layer = layers[i];
6164
6165                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
6166
6167                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_head * n_rot}, 0);
6168                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_head_kv * n_rot}, 0);
6169                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_head_kv * n_rot}, 0);
6170                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_head * n_rot, n_embd}, 0);
6171                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
6172
6173                        layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
6174
6175                        if (n_expert == 0) {
6176                            throw std::runtime_error("n_expert must be > 0");
6177                        }
6178                        if (n_expert_used == 0) {
6179                            throw std::runtime_error("n_expert_used must be > 0");
6180                        }
6181
6182                        layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {  n_embd, n_ff_exp, n_expert}, 0);
6183                        layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp,   n_embd, n_expert}, 0);
6184                        layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {  n_embd, n_ff_exp, n_expert}, 0);
6185
6186                        layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
6187                        layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {        n_ff_exp * n_expert_shared, n_embd}, 0);
6188                        layer.ffn_up_shexp   = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP,   "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
6189                    }
6190                } break;
6191            case LLM_ARCH_BAILINGMOE2:
6192                {
6193                    const int64_t n_ff_exp        = hparams.n_ff_exp;
6194                    const int64_t n_expert_shared = hparams.n_expert_shared;
6195
6196                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
6197
6198                    // output
6199                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
6200                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);
6201
6202                    GGML_ASSERT(n_expert > 0 && "n_expert must be > 0 for bailingmoe2");
6203                    GGML_ASSERT(n_expert_used > 0 && "n_expert_used must be > 0 for bailingmoe2");
6204
6205                    for (int i = 0; i < n_layer; ++i) {
6206                        int flags = 0;
6207                        if (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers) {
6208                            // skip all tensors in the NextN layers
6209                            flags |= TENSOR_SKIP;
6210                        }
6211
6212                        auto & layer = layers[i];
6213
6214                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, flags);
6215
6216                        layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, flags);
6217                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, flags);
6218
6219                        layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, flags);
6220                        layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, flags);
6221
6222                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, flags);
6223
6224                        if (static_cast<uint32_t>(i) >= hparams.n_layer_dense_lead) { // MoE layers
6225                            const int64_t n_ff_shexp = (hparams.n_ff_shexp ? hparams.n_ff_shexp : n_ff_exp) * n_expert_shared;
6226
6227                            layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, flags);
6228                            layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, TENSOR_NOT_REQUIRED | flags);
6229
6230                            layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {  n_embd, n_ff_exp, n_expert}, flags);
6231                            layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp,   n_embd, n_expert}, flags);
6232                            layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {  n_embd, n_ff_exp, n_expert}, flags);
6233
6234                            layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_shexp}, flags);
6235                            layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {n_ff_shexp, n_embd}, flags);
6236                            layer.ffn_up_shexp   = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP,   "weight", i), {n_embd, n_ff_shexp}, flags);
6237                        } else { // Dense layers
6238                            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, flags);
6239                            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, flags);
6240                            layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, flags);
6241                        }
6242
6243                        // NextN/MTP tensors (preserved but unused) - conditionally load for last nextn_predict_layers
6244                        if (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers) {
6245                            layer.nextn.eh_proj          = create_tensor(tn(LLM_TENSOR_NEXTN_EH_PROJ, "weight", i), { 2 * n_embd, n_embd }, flags);
6246                            layer.nextn.embed_tokens     = create_tensor(tn(LLM_TENSOR_NEXTN_EMBED_TOKENS, "weight", i), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED | flags);
6247                            layer.nextn.enorm            = create_tensor(tn(LLM_TENSOR_NEXTN_ENORM, "weight", i), { n_embd }, flags);
6248                            layer.nextn.hnorm            = create_tensor(tn(LLM_TENSOR_NEXTN_HNORM, "weight", i), { n_embd }, flags);
6249                            layer.nextn.shared_head_head = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, "weight", i), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED | flags);
6250                            layer.nextn.shared_head_norm = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, "weight", i), { n_embd }, TENSOR_NOT_REQUIRED | flags);
6251                            layer.layer_out_norm         = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd}, flags);
6252                        }
6253                    }
6254                } break;
            case LLM_ARCH_DOTS1:
                {
                    const int64_t n_ff_exp        = hparams.n_ff_exp;
                    const int64_t n_expert_shared = hparams.n_expert_shared;

                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);

                    // output head (required for this arch — no tied-embedding fallback)
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);

                    for (int i = 0; i < n_layer; ++i) {
                        auto & layer = layers[i];

                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);

                        // attention projections; NOTE(review): k/v are sized
                        // n_embd_head_k * n_head (full head count) — presumably no
                        // GQA reduction here; verify against the converter
                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);

                        // per-head q/k norms
                        layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
                        layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);

                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);

                        // the first n_layer_dense_lead layers use a dense FFN, the rest are MoE
                        if (i < (int) hparams.n_layer_dense_lead) {
                            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
                            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
                            layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
                        } else {
                            // expert router, with an optional per-expert routing bias
                            layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
                            layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, TENSOR_NOT_REQUIRED);

                            if (n_expert == 0) {
                                throw std::runtime_error("n_expert must be > 0");
                            }
                            if (n_expert_used == 0) {
                                throw std::runtime_error("n_expert_used must be > 0");
                            }

                            // MoE branch
                            layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {  n_embd, n_ff_exp, n_expert}, 0);
                            layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp,   n_embd, n_expert}, 0);
                            layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {  n_embd, n_ff_exp, n_expert}, 0);

                            // Shared expert branch (fused width: n_ff_exp * n_expert_shared)
                            layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
                            layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {        n_ff_exp * n_expert_shared, n_embd}, 0);
                            layer.ffn_up_shexp   = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP,   "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
                        }
                    }
                } break;
6307            case LLM_ARCH_ARCEE:
6308                {
6309                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
6310
6311                    // output
6312                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
6313                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
6314
6315                    // if output is NULL, init from the input tok embed
6316                    if (output == NULL) {
6317                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
6318                    }
6319
6320                    for (int i = 0; i < n_layer; ++i) {
6321                        auto & layer = layers[i];
6322
6323                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
6324
6325                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
6326                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_k_gqa}, 0);
6327                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_v_gqa}, 0);
6328                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
6329
6330                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
6331
6332                        layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
6333
6334                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
6335                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
6336                    }
6337                } break;
            case LLM_ARCH_AFMOE:
                {
                    // hybrid dense/MoE stack with dual (pre+post) norms around both
                    // the attention and FFN sub-blocks, plus a per-token attention gate
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);

                    // output
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);

                    // if output is NULL, init from the input tok embed
                    if (output == NULL) {
                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
                    }

                    const int64_t n_ff_exp = hparams.n_ff_exp;
                    const int64_t n_expert_shared = hparams.n_expert_shared;

                    for (int i = 0; i < n_layer; ++i) {
                        auto & layer = layers[i];

                        // dual attention normalization
                        layer.attn_norm      = create_tensor(tn(LLM_TENSOR_ATTN_NORM,      "weight", i), {n_embd}, 0);
                        layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);

                        // attention projections
                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_k_gqa}, 0);
                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_v_gqa}, 0);
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);

                        // Q/K normalization
                        layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
                        layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);

                        // attention gating (same width as the q projection)
                        layer.wqkv_gate = create_tensor(tn(LLM_TENSOR_ATTN_GATE, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);

                        // dual ffn normalization
                        layer.ffn_norm      = create_tensor(tn(LLM_TENSOR_FFN_NORM,      "weight", i), {n_embd}, 0);
                        layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);

                        if (static_cast<uint32_t>(i) >= hparams.n_layer_dense_lead) {
                            // MoE layers: router weight plus a required routing bias
                            // (unlike some other archs where the bias is optional)
                            layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
                            layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, 0);

                            // grouped expert weights
                            layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, 0);
                            layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
                            layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {n_embd, n_ff_exp, n_expert}, 0);

                            // shared expert (only when the model declares shared experts)
                            if (n_expert_shared > 0) {
                                const int64_t n_ff_shexp = n_ff_exp * n_expert_shared;
                                layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_shexp}, 0);
                                layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {n_ff_shexp, n_embd}, 0);
                                layer.ffn_up_shexp   = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP,   "weight", i), {n_embd, n_ff_shexp}, 0);
                            }
                        } else {
                            // Dense layers
                            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
                            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
                            layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff}, 0);
                        }
                    }
                } break;
6403            case LLM_ARCH_ERNIE4_5:
6404            case LLM_ARCH_ERNIE4_5_MOE:
6405                {
6406                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
6407
6408                    // output
6409                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
6410                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
6411                    // if output is NULL, init from the input tok embed
6412                    if (output == NULL) {
6413                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
6414                    }
6415
6416                    for (int i = 0; i < n_layer; ++i) {
6417                        auto & layer = layers[i];
6418
6419                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
6420
6421                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
6422                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa}, 0);
6423                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa}, 0);
6424                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
6425
6426                        // optional bias tensors
6427                        layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "bias", i), {n_embd},     TENSOR_NOT_REQUIRED);
6428                        layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
6429                        layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
6430                        layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd},     TENSOR_NOT_REQUIRED);
6431
6432                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
6433
6434                        if (arch == LLM_ARCH_ERNIE4_5_MOE && static_cast<uint32_t>(i) >= hparams.n_layer_dense_lead) { // MoE layers
6435                            int n_ff_exp = hparams.n_ff_exp;
6436
6437                            layer.ffn_gate_inp  = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP,  "weight", i), {n_embd, n_expert}, 0);
6438                            layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, TENSOR_NOT_REQUIRED);
6439                            layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd,   n_ff_exp, n_expert}, TENSOR_NOT_REQUIRED);
6440                            layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {  n_ff_exp, n_embd, n_expert}, 0);
6441                            layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {n_embd,   n_ff_exp, n_expert}, 0);
6442
6443                            // Shared expert (if present)
6444                            if (hparams.n_ff_shexp > 0) {
6445                                layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {    n_embd, hparams.n_ff_shexp}, 0);
6446                                layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {hparams.n_ff_shexp, n_embd    }, 0);
6447                                layer.ffn_up_shexp   = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP,   "weight", i), {    n_embd, hparams.n_ff_shexp}, 0);
6448                            }
6449                        } else { // Dense layers
6450                            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
6451                            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
6452                            layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
6453                        }
6454                    }
6455                } break;
6456            case LLM_ARCH_FALCON_H1:
6457                {
6458                    // Common
6459                    const int64_t hidden_size = hparams.n_embd; // hidden_size
6460
6461                    // mamba2 Mixer SSM params
6462                    const int64_t ssm_conv_kernel_size  = hparams.ssm_d_conv; // ssm_conv_kernel_size
6463                    const int64_t ssm_n_groups          = hparams.ssm_n_group; // ssm_n_groups
6464                    const int64_t ssm_state_size        = hparams.ssm_d_state; // ssm_state_size
6465                    const int64_t ssm_intermediate_size = hparams.ssm_d_inner; // TODO expand
6466                    const int64_t ssm_num_heads         = hparams.ssm_dt_rank; // ssm_num_heads
6467                    const int64_t ssm_conv_dim          = ssm_intermediate_size + 2 * ssm_n_groups * ssm_state_size;
6468                    const int64_t ssm_projection_size   = ssm_intermediate_size + ssm_conv_dim + ssm_num_heads;
6469
6470                    // attn params
6471                    const int64_t attn_num_attention_head = hparams.n_head(0); // rename to: attn_num_attention_head
6472                    const int64_t attn_num_key_value_head = hparams.n_head_kv(0);
6473
6474                    // ffn params
6475                    const int64_t ffn_intermediate_size = hparams.n_ff(0);
6476
6477                    // embeddings
6478                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {hidden_size, n_vocab}, 0);
6479
6480                    // output
6481                    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {hidden_size, n_vocab}, TENSOR_NOT_REQUIRED);
6482                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {hidden_size}, 0);
6483
6484                    // if output is NULL, init from the input tok embed
6485                    if (output == NULL) {
6486                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {hidden_size, n_vocab}, TENSOR_DUPLICATED);
6487                    }
6488
6489                    for (int i = 0; i < n_layer; ++i) {
6490                        auto & layer = layers[i];
6491
6492                        /*SSM LAYERS*/
6493                        // ssm in
6494                        layer.ssm_in = create_tensor(tn(LLM_TENSOR_SSM_IN, "weight", i), {hidden_size, ssm_projection_size}, 0);
6495                        // ssm 1d conv
6496                        layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), {ssm_conv_kernel_size, ssm_conv_dim}, 0);
6497                        layer.ssm_conv1d_b = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "bias", i), {ssm_conv_dim}, TENSOR_NOT_REQUIRED);
6498                        // ssm_dt
6499                        layer.ssm_dt_b = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), {ssm_num_heads}, 0);
6500                        // no "weight" suffix for these
6501                        layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), {1, ssm_num_heads}, 0);
6502                        layer.ssm_d = create_tensor(tn(LLM_TENSOR_SSM_D, i), {1, ssm_num_heads}, 0);
6503                        // ssm_norm
6504                        layer.ssm_norm = create_tensor(tn(LLM_TENSOR_SSM_NORM, "weight", i), {ssm_intermediate_size / ssm_n_groups, ssm_n_groups}, TENSOR_NOT_REQUIRED);
6505                        // out_proj
6506                        layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), {ssm_intermediate_size, hidden_size}, 0);
6507
6508                        /*ATTENTION LAYERS*/
6509                        // attention layers (with optional bias)
6510                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {hidden_size, n_embd_head_k * attn_num_attention_head}, 0);
6511                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {hidden_size, attn_num_key_value_head * n_embd_head_k}, 0);
6512                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {hidden_size, attn_num_key_value_head * n_embd_head_v}, 0);
6513                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * attn_num_attention_head, hidden_size}, 0);
6514                        layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "bias", i), {hidden_size}, TENSOR_NOT_REQUIRED);
6515                        layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "bias", i), {attn_num_key_value_head * n_embd_head_k}, TENSOR_NOT_REQUIRED);
6516                        layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "bias", i), {attn_num_key_value_head * n_embd_head_v}, TENSOR_NOT_REQUIRED);
6517                        layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {hidden_size}, TENSOR_NOT_REQUIRED);
6518                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {hidden_size}, 0);
6519
6520
6521                        // feed forward (w/ optional biases)
6522                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, i), {hidden_size}, 0);
6523                        layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
6524                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {hidden_size,   ffn_intermediate_size}, 0);
6525                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  ffn_intermediate_size, hidden_size}, 0);
6526                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {hidden_size,   ffn_intermediate_size}, 0);
6527
6528                        layer.ffn_gate_b = create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), {ffn_intermediate_size}, TENSOR_NOT_REQUIRED);
6529                        layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {hidden_size}, TENSOR_NOT_REQUIRED);
6530                        layer.ffn_up_b   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "bias", i), {ffn_intermediate_size}, TENSOR_NOT_REQUIRED);
6531                    }
6532                } break;
            case LLM_ARCH_HUNYUAN_MOE:
                {
                    // MoE with per-head Q/K norms and an always-present shared expert
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);

                    // output
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
                    // if output is NULL, init from the input tok embed
                    if (output == NULL) {
                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
                    }

                    for (int i = 0; i < n_layer; ++i) {
                        auto & layer = layers[i];

                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);

                        // attention projections (GQA: K/V use the grouped KV widths)
                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_k_gqa}, 0);
                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_v_gqa}, 0);
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);

                        // Q/K norms sized to a single head (n_embd_head_k)
                        layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
                        layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);

                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);

                        // expert router + grouped expert weights (note: experts use n_ff, not a
                        // separate n_ff_exp, for this architecture)
                        layer.ffn_gate_inp  = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP,  "weight", i), {n_embd, n_expert}, 0);
                        layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd,   n_ff, n_expert}, 0);
                        layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {  n_ff, n_embd, n_expert}, 0);
                        layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {n_embd,   n_ff, n_expert}, 0);

                        // shared expert — required (flag 0) for this architecture
                        layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, hparams.n_ff_shexp}, 0);
                        layer.ffn_up_shexp   = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP,   "weight", i), {n_embd, hparams.n_ff_shexp}, 0);
                        layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {hparams.n_ff_shexp, n_embd}, 0);
                    }
                } break;
            case LLM_ARCH_HUNYUAN_DENSE:
                {
                    // dense variant of Hunyuan: same attention layout (incl. per-head Q/K
                    // norms) as HUNYUAN_MOE, but with a plain gated FFN per layer
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);

                    // output
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
                    // if output is NULL, init from the input tok embed
                    if (output == NULL) {
                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
                    }

                    for (int i = 0; i < n_layer; ++i) {
                        auto & layer = layers[i];

                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);

                        // attention projections (GQA)
                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_k_gqa}, 0);
                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_v_gqa}, 0);
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);

                        // Q/K norms sized to a single head (n_embd_head_k)
                        layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
                        layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);

                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);

                        // gated FFN
                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);

                    }
                } break;
            case LLM_ARCH_SMOLLM3:
                {
                    // llama-style dense decoder: attention (GQA) + gated FFN, no biases
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);

                    // output
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);

                    // if output is NULL, init from the input tok embed
                    if (output == NULL) {
                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
                    }

                    for (int i = 0; i < n_layer; ++i) {
                        auto & layer = layers[i];

                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);

                        // attention projections
                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_k_gqa}, 0);
                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_v_gqa}, 0);
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);

                        // gated FFN
                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
                    }
                } break;
            case LLM_ARCH_OPENAI_MOE:
                {
                    const int64_t n_ff_exp = hparams.n_ff_exp;

                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);

                    // output — required here (flag 0): no fallback to tok_embd for this arch
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);

                    for (int i = 0; i < n_layer; ++i) {
                        auto & layer = layers[i];

                        layer.attn_norm      = create_tensor(tn(LLM_TENSOR_ATTN_NORM,      "weight", i), {n_embd}, 0);
                        layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);

                        // attention projections; per-head width is n_rot here
                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_head * n_rot}, 0);
                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_head_kv * n_rot}, 0);
                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_head_kv * n_rot}, 0);
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_head * n_rot, n_embd}, 0);

                        // one attention-sink value per head
                        layer.attn_sinks = create_tensor(tn(LLM_TENSOR_ATTN_SINKS, "weight", i), {n_head}, 0);

                        // expert router + grouped expert weights
                        layer.ffn_gate_inp  = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP,  "weight", i), {  n_embd, n_expert}, 0);
                        layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {  n_embd, n_ff_exp, n_expert}, 0);
                        layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp,   n_embd, n_expert}, 0);
                        layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {  n_embd, n_ff_exp, n_expert}, 0);

                        // bias — all biases are required (flag 0) for this arch
                        layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "bias", i), {n_head * n_rot}, 0);
                        layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "bias", i), {n_head_kv * n_rot}, 0);
                        layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "bias", i), {n_head_kv * n_rot}, 0);
                        layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);

                        // per-expert biases: gate/up are per-row of the expert FFN (n_ff_exp),
                        // down is per output channel (n_embd)
                        layer.ffn_gate_inp_b  = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP,  "bias", i), {n_expert}, 0);
                        layer.ffn_gate_exps_b = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "bias", i), {n_ff_exp, n_expert}, 0);
                        layer.ffn_down_exps_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "bias", i), {  n_embd, n_expert}, 0);
                        layer.ffn_up_exps_b   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "bias", i), {n_ff_exp, n_expert}, 0);
                    }
                } break;
            case LLM_ARCH_LFM2:
            case LLM_ARCH_LFM2MOE:
                {
                    // hybrid architecture: per-layer choice between attention and a short
                    // convolution mixer (selected via hparams.is_recurrent(i)); the MoE
                    // variant additionally switches to routed experts after the dense lead
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);

                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM_LFM2, "weight"), {n_embd}, 0);
                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,           "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);

                    // if output is NULL, init from the input tok embed
                    if (output == NULL) {
                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
                    }

                    for (int i = 0; i < n_layer; ++i) {
                        auto & layer = layers[i];

                        // layers before n_layer_dense_lead are dense; the rest are MoE
                        // (for plain LFM2 this is expected to be false for all layers)
                        const bool is_moe_layer = i >= static_cast<int>(hparams.n_layer_dense_lead);

                        // ffn/moe is same for transformer and conv layers
                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
                        if (is_moe_layer) {
                            GGML_ASSERT(n_expert && n_expert_used);
                            layer.ffn_gate_inp    = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i),  {n_embd, n_expert}, 0);
                            layer.ffn_gate_exps   = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, hparams.n_ff_exp, n_expert}, 0);
                            layer.ffn_down_exps   = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {hparams.n_ff_exp,   n_embd, n_expert}, 0);
                            layer.ffn_up_exps     = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i),   {n_embd, hparams.n_ff_exp, n_expert}, 0);
                            layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, 0);
                        } else {  // dense
                            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
                            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
                            layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
                        }

                        // for operator_norm
                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);

                        if (!hparams.is_recurrent(i)) {
                            // attention layer
                            layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
                            layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
                            GGML_ASSERT(n_embd_v_gqa == n_embd_k_gqa);

                            // K/V widths may vary per layer, hence the per-layer accessors
                            layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
                            layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, hparams.n_embd_k_gqa(i)}, 0);
                            layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, hparams.n_embd_v_gqa(i)}, 0);

                            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
                        } else {
                            // short-convolution mixer layer; in_proj fans out to 3*n_embd
                            layer.shortconv.conv     = create_tensor(tn(LLM_TENSOR_SHORTCONV_CONV,    "weight", i), {hparams.n_shortconv_l_cache, n_embd}, 0);
                            layer.shortconv.in_proj  = create_tensor(tn(LLM_TENSOR_SHORTCONV_INPROJ,  "weight", i), {n_embd, 3 * n_embd}, 0);
                            layer.shortconv.out_proj = create_tensor(tn(LLM_TENSOR_SHORTCONV_OUTPROJ, "weight", i), {n_embd, n_embd}, 0);
                        }
                    }

                    // for LFM2-ColBert-350M — optional projection head, absent in other checkpoints
                    dense_2_out_layers = create_tensor(tn(LLM_TENSOR_DENSE_2_OUT, "weight"), {n_embd, hparams.n_embd_out()}, TENSOR_NOT_REQUIRED);
                } break;
            case LLM_ARCH_SMALLTHINKER:
                {
                    // pure-MoE decoder: every layer uses routed experts (no dense FFN layers)
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);

                    // output
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);

                    // if output is NULL, init from the input tok embed
                    if (output == NULL) {
                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
                    }

                    for (int i = 0; i < n_layer; ++i) {
                        auto & layer = layers[i];

                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0);

                        // attention projections (GQA)
                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), { n_embd, n_embd_head_k * n_head }, 0);
                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), { n_embd, n_embd_gqa }, 0);
                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), { n_embd, n_embd_gqa }, 0);
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_k * n_head, n_embd }, 0);

                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), { n_embd }, 0);

                        GGML_ASSERT(n_expert > 0 && "n_expert must be > 0 for SMALLTHINKER");
                        GGML_ASSERT(n_expert_used > 0 && "n_expert_used must be > 0 for SMALLTHINKER");

                        // MoE branch: router + grouped expert weights
                        const int64_t n_ff_exp = hparams.n_ff_exp;
                        layer.ffn_gate_inp  = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), { n_embd, n_expert }, 0);
                        layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert }, 0);
                        layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff_exp, n_embd, n_expert }, 0);
                        layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert }, 0);
                    }
                } break;
            case LLM_ARCH_GROVEMOE:
                {
                    // MoE with two expert granularities: regular routed experts plus
                    // "chunk" experts shared across expert groups (n_group_experts)
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);

                    // output
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
                    // if output is NULL, init from the input tok embed
                    if (output == NULL) {
                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
                    }

                    GGML_ASSERT(n_expert > 0 && "n_expert must be > 0 for GROVEMOE");
                    GGML_ASSERT(n_expert_used > 0 && "n_expert_used must be > 0 for GROVEMOE");
                    GGML_ASSERT(hparams.n_group_experts > 0 && "n_group_experts must be > 0 for GROVEMOE");

                    for (int i = 0; i < n_layer; ++i) {
                        auto & layer = layers[i];

                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);

                        // attention projections (GQA)
                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa}, 0);
                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa}, 0);
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);

                        // Q/K norms sized to a single head (n_embd_head_k)
                        layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
                        layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);

                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);

                        layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);

                        // MoE branch — fallbacks when the hparams are absent from the GGUF:
                        // n_ff_exp defaults to n_ff split over the used experts; the
                        // n_ff_chexp fallback to n_embd_head_k mirrors the conversion default
                        const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used;
                        const int64_t n_ff_chexp = hparams.n_ff_chexp ? hparams.n_ff_chexp : n_embd_head_k;
                        const int64_t n_chunk_expert = n_expert / hparams.n_group_experts;

                        layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {  n_embd, n_ff_exp, n_expert}, 0);
                        layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp,   n_embd, n_expert}, 0);
                        layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {  n_embd, n_ff_exp, n_expert}, 0);

                        // chunk experts: one set per expert group
                        layer.ffn_gate_chexps = create_tensor(tn(LLM_TENSOR_FFN_GATE_CHEXPS, "weight", i), {  n_embd, n_ff_chexp, n_chunk_expert}, 0);
                        layer.ffn_down_chexps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_CHEXPS, "weight", i), {n_ff_chexp,   n_embd, n_chunk_expert}, 0);
                        layer.ffn_up_chexps   = create_tensor(tn(LLM_TENSOR_FFN_UP_CHEXPS,   "weight", i), {  n_embd, n_ff_chexp, n_chunk_expert}, 0);
                    }
                } break;
            case LLM_ARCH_APERTUS:
                {
                    // token embedding table
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);

                    // output head: final norm + projection to vocab logits (both required)
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), { n_embd, n_vocab }, 0);

                    for (int i = 0; i < n_layer; ++i) {
                        auto & layer = layers[i];

                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0);

                        // rope factor tensors: long/short pair for LongRoPE scaling, otherwise plain freqs;
                        // optional, and marked duplicated for all layers after the first
                        if (hparams.rope_scaling_type_train == LLAMA_ROPE_SCALING_TYPE_LONGROPE) {
                            layer.rope_long  = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG,  "weight", i), { n_rot/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
                            layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), { n_rot/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
                        } else {
                            layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), { n_rot/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
                        }

                        // attention projections: Q at full head width, K/V at the GQA width
                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), { n_embd, n_embd_head_k * n_head }, 0);
                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), { n_embd, n_embd_gqa }, 0);
                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), { n_embd, n_embd_gqa }, 0);
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_k * n_head, n_embd }, 0);

                        // optional bias tensors
                        layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "bias", i), { n_embd },     TENSOR_NOT_REQUIRED);
                        layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "bias", i), { n_embd_gqa }, TENSOR_NOT_REQUIRED);
                        layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "bias", i), { n_embd_gqa }, TENSOR_NOT_REQUIRED);
                        layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), { n_embd },     TENSOR_NOT_REQUIRED);

                        // FFN: note no gate projection is loaded for this arch (up/down only)
                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), { n_embd }, 0);
                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd }, 0);
                        layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), { n_embd, n_ff }, 0);

                        // Q and K layernorms for Apertus — per-head width; biases are optional
                        layer.attn_q_norm   = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), { n_embd_head_k }, 0);
                        layer.attn_q_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "bias",   i), { n_embd_head_k }, TENSOR_NOT_REQUIRED);
                        layer.attn_k_norm   = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), { n_embd_head_k }, 0);
                        layer.attn_k_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "bias",   i), { n_embd_head_k }, TENSOR_NOT_REQUIRED);
                    }
                } break;
            case LLM_ARCH_MINIMAX_M2:
                {
                    // token embedding table
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);

                    // output
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);

                    for (int i = 0; i < n_layer; ++i) {
                        auto & layer = layers[i];

                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), { n_embd, n_embd_head_k * n_head }, 0);
                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), { n_embd, n_embd_gqa }, 0);
                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), { n_embd, n_embd_gqa }, 0);
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_k * n_head, n_embd }, 0);

                        // QK norms span the whole projection, not a single head:
                        // Q norm is n_embd_head_k * n_head wide, K norm is the GQA K width
                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
                        layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k * n_head}, 0);
                        layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_k_gqa}, 0);

                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);

                        // MoE FFN on every layer: router, expert banks, and router probability bias.
                        // NOTE(review): expert FFN width is n_ff here (no separate n_ff_exp) — presumably
                        // intentional for this arch; confirm against the conversion script if sizes mismatch
                        layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
                        layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff,   n_expert}, 0);
                        layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff,   n_embd, n_expert}, 0);
                        layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {n_embd, n_ff,   n_expert}, 0);
                        layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, 0);
                    }
                } break;
            case LLM_ARCH_KIMI_LINEAR:
                {
                    // hybrid arch: each layer is either a KDA (linear-attention) block or an
                    // MLA block; the split is detected per-layer by which tensors are present
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);

                    // output
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);

                    for (int i = 0; i < n_layer; ++i) {
                        auto & layer = layers[i];

                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);

                        // Check for KDA specific tensors to determine layer type or if it's a mixed model
                        // Assuming KDA layer if KDA tensors are present

                        // KDA uses head_dim = 128 (from linear_attn_config.head_dim)
                        const int64_t n_embd_head_k_kda = hparams.n_embd_head_kda;
                        const int64_t n_embd_head_v_kda = hparams.n_embd_head_kda;
                        const int64_t ssm_d_conv = hparams.ssm_d_conv;

                        // Try loading KDA specific tensors (using SSM_ prefix)
                        // Conv1d weights: try 4D first, then 3D (quantization may remove trailing 1)
                        // 4D: [d_conv, 1, d_inner, 1], 3D: [d_conv, 1, d_inner]
                        layer.ssm_q_conv = create_tensor(tn(LLM_TENSOR_SSM_CONV1D_Q, "weight", i), {ssm_d_conv, 1, n_embd_head_k_kda * n_head, 1}, TENSOR_NOT_REQUIRED);
                        if (!layer.ssm_q_conv) {
                            layer.ssm_q_conv = create_tensor(tn(LLM_TENSOR_SSM_CONV1D_Q, "weight", i), {ssm_d_conv, 1, n_embd_head_k_kda * n_head}, TENSOR_NOT_REQUIRED);
                        }

                        if (layer.ssm_q_conv) {
                             // KDA Layer - Conv1d weights may be 3D or 4D
                             // (once the q conv exists the k/v convs are mandatory: the 3D fallback uses flag 0)
                             layer.ssm_k_conv = create_tensor(tn(LLM_TENSOR_SSM_CONV1D_K, "weight", i), {ssm_d_conv, 1, n_embd_head_k_kda * n_head, 1}, TENSOR_NOT_REQUIRED);
                             if (!layer.ssm_k_conv) {
                                 layer.ssm_k_conv = create_tensor(tn(LLM_TENSOR_SSM_CONV1D_K, "weight", i), {ssm_d_conv, 1, n_embd_head_k_kda * n_head}, 0);
                             }
                             layer.ssm_v_conv = create_tensor(tn(LLM_TENSOR_SSM_CONV1D_V, "weight", i), {ssm_d_conv, 1, n_embd_head_v_kda * n_head, 1}, TENSOR_NOT_REQUIRED);
                             if (!layer.ssm_v_conv) {
                                 layer.ssm_v_conv = create_tensor(tn(LLM_TENSOR_SSM_CONV1D_V, "weight", i), {ssm_d_conv, 1, n_embd_head_v_kda * n_head}, 0);
                             }

                             // q, k, v projections
                             // Python: q_proj, k_proj, v_proj
                             layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k_kda * n_head}, 0);
                             layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_head_k_kda * n_head}, 0);
                             layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_head_v_kda * n_head}, 0);

                             // KDA specific projections
                             // f_a_proj, f_b_proj
                             layer.ssm_f_a = create_tensor(tn(LLM_TENSOR_SSM_F_A, "weight", i), {n_embd, n_embd_head_k_kda}, 0); // head_dim
                             layer.ssm_f_b = create_tensor(tn(LLM_TENSOR_SSM_F_B, "weight", i), {n_embd_head_k_kda, n_embd_head_k_kda * n_head}, 0); // projection_size

                             // b_proj (beta mixing coefficient)
                             layer.ssm_beta = create_tensor(tn(LLM_TENSOR_SSM_BETA, "weight", i), {n_embd, n_head}, 0);

                             // A_log - Shape in GGUF: [1, num_heads, 1, 1] (4D) or [1, num_heads] (2D after quantization) Note: -exp(A_log) is applied in convert_hf_to_gguf.py
                             layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), {1, n_head, 1, 1}, TENSOR_NOT_REQUIRED);
                             if (!layer.ssm_a) {
                                 layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), {1, n_head}, 0);
                             }

                             // dt_bias - shape [n_embd_head_k_kda * n_head] = [4096]
                             layer.ssm_dt_b = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), {n_embd_head_k_kda * n_head}, 0);

                             // g_a_proj, g_b_proj (output gate)
                             layer.ssm_g_a = create_tensor(tn(LLM_TENSOR_SSM_G_A, "weight", i), {n_embd, n_embd_head_k_kda}, 0);
                             layer.ssm_g_b = create_tensor(tn(LLM_TENSOR_SSM_G_B, "weight", i), {n_embd_head_k_kda, n_embd_head_k_kda * n_head}, 0);

                             // o_norm (reusing SSM_NORM)
                             layer.ssm_o_norm = create_tensor(tn(LLM_TENSOR_SSM_NORM, "weight", i), {n_embd_head_k_kda}, 0); // FusedRMSNormGated

                             // o_proj
                             layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_v_kda * n_head, n_embd}, 0);

                        } else {
                             // MLA Layer - use MLA-specific head dimensions
                             const int64_t q_lora_rank  = hparams.n_lora_q;
                             const int64_t kv_lora_rank = hparams.n_lora_kv;
                             const int64_t n_embd_head_k_mla = hparams.n_embd_head_k_mla();
                             const int64_t n_embd_head_v_mla = hparams.n_embd_head_v_mla();

                             // Q-side LoRA norm is optional; its presence selects compressed vs plain Q below
                             layer.attn_q_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_A_NORM, "weight", i), {q_lora_rank}, TENSOR_NOT_REQUIRED);
                             layer.attn_kv_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_NORM, "weight", i), {kv_lora_rank}, 0);

                             if (layer.attn_q_a_norm) {
                                 layer.wq_a = create_tensor(tn(LLM_TENSOR_ATTN_Q_A, "weight", i), {n_embd, q_lora_rank}, 0);
                                 layer.wq_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_B, "weight", i), {q_lora_rank, n_head * n_embd_head_k_mla}, 0);
                             } else {
                                 // Kimi MLA without Q compression: wq = [n_embd, n_head * n_embd_head_k_mla]
                                 layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_head * n_embd_head_k_mla}, 0);
                             }

                             // Kimi: qk_rope_head_dim = 64 (actual RoPE dimension for MLA)
                             // Note: hparams.n_rot may be 72 (from conversion) but actual is 64
                             const int64_t qk_rope_head_dim = hparams.n_rot;  // From config: qk_rope_head_dim
                             layer.wkv_a_mqa = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_MQA, "weight", i), {n_embd, kv_lora_rank + qk_rope_head_dim}, 0);
                             // Support Legacy GGUFs that don't split wkv_b (MLA KV cache disabled)
                             layer.wkv_b = create_tensor(tn(LLM_TENSOR_ATTN_KV_B, "weight", i), {kv_lora_rank, n_head * (n_embd_head_k_mla - qk_rope_head_dim + n_embd_head_v_mla)}, TENSOR_NOT_REQUIRED);
                             if (!layer.wkv_b) { // MLA KV cache enabled
                                 layer.wk_b = create_tensor(tn(LLM_TENSOR_ATTN_K_B, "weight", i), {n_embd_head_k_mla - qk_rope_head_dim, kv_lora_rank, n_head}, 0);
                                 layer.wv_b = create_tensor(tn(LLM_TENSOR_ATTN_V_B, "weight", i), {kv_lora_rank, n_embd_head_v_mla, n_head}, 0);
                             }
                             layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_head * n_embd_head_v_mla, n_embd}, 0);
                        }

                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);

                        // MoE intermediate size (different from dense FFN)
                        const int64_t n_ff_exp = hparams.n_ff_exp;

                        // Kimi uses n_layer_dense_lead to determine which layers use dense FFN vs MoE
                        // first_k_dense_replace = 1 means layer 0 uses dense FFN, layers 1+ use MoE
                        if (i < (int) hparams.n_layer_dense_lead) {
                            // Dense FFN layer - use normal n_ff
                            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
                            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
                            layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff}, 0);
                        } else {
                            // MoE layer - use n_ff_exp (1024) instead of n_ff (9216)
                            layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
                            layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, 0);
                            layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
                            layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {n_embd, n_ff_exp, n_expert}, 0);

                            // Shared experts use moe_intermediate_size * num_shared_experts
                            // Kimi: shared_expert_intermediate_size = 1024 * 1 = 1024
                            // Tensors are 2D: [n_embd, n_ff_shexp] or [n_ff_shexp, n_embd]
                            // (shared-expert tensors are optional; n_expert_shared == 0 falls back to a factor of 1)
                            const int64_t n_ff_shexp_actual = n_ff_exp * (hparams.n_expert_shared > 0 ? hparams.n_expert_shared : 1);
                            layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_shexp_actual}, TENSOR_NOT_REQUIRED);
                            layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {n_ff_shexp_actual, n_embd}, TENSOR_NOT_REQUIRED);
                            layer.ffn_up_shexp   = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP,   "weight", i), {n_embd, n_ff_shexp_actual}, TENSOR_NOT_REQUIRED);

                            layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, 0);
                        }
                    }
                } break;
            case LLM_ARCH_COGVLM:
                {
                    // token embedding table
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);

                    // output
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);

                    // if output is NULL, init from the input tok embed
                    if (output == NULL) {
                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
                    }

                    for (int i = 0; i < n_layer; ++i) {
                        auto & layer = layers[i];

                        // fused QKV projection (3x the head width) instead of separate wq/wk/wv
                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
                        layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd_head_k * n_head * 3}, 0);
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);

                        // visexp_* = a parallel "vision expert" set of attention weights with the
                        // same shapes; presumably selected per-token for image tokens — see graph builder
                        layer.visexp_attn_wqkv = create_tensor(tn(LLM_TENSOR_VISEXP_ATTN_QKV, "weight", i), {n_embd, n_embd_head_k * n_head * 3}, 0);
                        layer.visexp_attn_wo = create_tensor(tn(LLM_TENSOR_VISEXP_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);

                        layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));

                        // text-branch FFN
                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);

                        // vision-expert FFN, mirroring the text FFN shapes
                        layer.visexp_ffn_gate = create_tensor(tn(LLM_TENSOR_VISEXP_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
                        layer.visexp_ffn_down = create_tensor(tn(LLM_TENSOR_VISEXP_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
                        layer.visexp_ffn_up   = create_tensor(tn(LLM_TENSOR_VISEXP_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
                    }
                } break;
            case LLM_ARCH_PANGU_EMBED:
                {
                    // token embedding table
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);

                    // output
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);

                    // if output is NULL, init from the input tok embed
                    if (output == NULL) {
                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
                    }

                    for (int i = 0; i < n_layer; ++i) {
                        auto & layer = layers[i];

                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);

                        // weight tensors
                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_k_gqa}, 0);
                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_v_gqa}, 0);
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);

                        // bias tensors — required (flag 0), unlike most archs where attn biases are optional.
                        // NOTE(review): bq is sized n_embd_head_k * n_head (matching wq's output), while
                        // bk/bv use n_embd_gqa and bo uses n_embd — asymmetry mirrors the weight shapes
                        layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "bias", i), {n_embd_head_k * n_head}, 0);
                        layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "bias", i), {n_embd_gqa}, 0);
                        layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "bias", i), {n_embd_gqa}, 0);
                        layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);

                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);

                        // rope factors: long/short pair for LongRoPE, otherwise plain freqs; optional,
                        // duplicated across layers after the first
                        if (hparams.rope_scaling_type_train == LLAMA_ROPE_SCALING_TYPE_LONGROPE) {
                            layer.rope_long  = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG,  "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
                            layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
                        } else {
                            layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
                        }

                        // gated dense FFN
                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
                    }
                } break;
            case LLM_ARCH_QWEN3NEXT:
                {
                    // token embedding table
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);

                    // output
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
                    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED);

                    // if output is NULL, init from the input tok embed
                    if (output == NULL) {
                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, TENSOR_DUPLICATED);
                    }

                    // per-expert FFN width; falls back to an even split of n_ff across the active experts
                    const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used;

                    // Calculate dimensions from hyperparameters
                    // (the generic ssm_* hparams are reused here to carry the gated-delta-net dims:
                    //  d_state -> head dim, n_group -> #K heads, dt_rank -> #V heads)
                    const int64_t head_k_dim = hparams.ssm_d_state;
                    const int64_t head_v_dim = hparams.ssm_d_state;
                    const int64_t n_k_heads  = hparams.ssm_n_group;
                    const int64_t n_v_heads  = hparams.ssm_dt_rank;
                    const int64_t key_dim    = head_k_dim * n_k_heads;
                    const int64_t value_dim  = head_v_dim * n_v_heads;
                    const int64_t conv_dim   = key_dim * 2 + value_dim;

                    // Calculate projection sizes
                    const int64_t qkvz_dim = key_dim * 2 + value_dim * 2;  // fused q/k/v/z (legacy layout)
                    const int64_t ba_dim   = n_v_heads * 2;                // fused beta/alpha

                    for (int i = 0; i < n_layer; ++i) {
                        auto & layer = layers[i];

                        layer.attn_norm      = create_tensor(tn(LLM_TENSOR_ATTN_NORM,      "weight", i), { n_embd }, 0);
                        layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), { n_embd }, 0);

                        if (!hparams.is_recurrent(i)) {
                            // Attention layers
                            // (wq is 2x the head width — gated attention; the extra half is the gate)
                            layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), { n_embd, n_embd_head_k * n_head * 2 }, 0);
                            layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), { n_embd, n_embd_k_gqa }, 0);
                            layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), { n_embd, n_embd_v_gqa }, 0);
                            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_k * n_head, n_embd }, 0);

                            // Q/K normalization for attention layers
                            layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), { n_embd_head_k }, 0);
                            layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), { n_embd_head_k }, 0);
                        } else {
                            // Linear attention (gated delta net) specific tensors
                            // Create tensors with calculated dimensions
                            // note: ssm_in is used by legacy GGUF
                            // (either ssm_in OR the wqkv/wqkv_gate pair is present — all three are optional)
                            layer.ssm_in         = create_tensor(tn(LLM_TENSOR_SSM_IN,         "weight", i), { n_embd, qkvz_dim }, TENSOR_NOT_REQUIRED);
                            layer.wqkv           = create_tensor(tn(LLM_TENSOR_ATTN_QKV,       "weight", i), { n_embd, key_dim * 2 + value_dim }, TENSOR_NOT_REQUIRED);
                            layer.wqkv_gate      = create_tensor(tn(LLM_TENSOR_ATTN_GATE,      "weight", i), { n_embd, value_dim }, TENSOR_NOT_REQUIRED);
                            layer.ssm_conv1d     = create_tensor(tn(LLM_TENSOR_SSM_CONV1D,     "weight", i), { hparams.ssm_d_conv, conv_dim }, 0);
                            layer.ssm_dt         = create_tensor(tn(LLM_TENSOR_SSM_DT,         "bias",   i), { hparams.ssm_dt_rank }, 0);
                            layer.ssm_a          = create_tensor(tn(LLM_TENSOR_SSM_A_NOSCAN,             i), { hparams.ssm_dt_rank }, 0);
                            layer.ssm_beta_alpha = create_tensor(tn(LLM_TENSOR_SSM_BETA_ALPHA, "weight", i), { n_embd, ba_dim }, 0);
                            layer.ssm_norm       = create_tensor(tn(LLM_TENSOR_SSM_NORM,       "weight", i), { head_v_dim }, 0);
                            layer.ssm_out        = create_tensor(tn(LLM_TENSOR_SSM_OUT,        "weight", i), { value_dim, n_embd }, 0);
                        }

                        // MoE FFN: router + expert banks on every layer
                        layer.ffn_gate_inp  = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP,  "weight", i), { n_embd, n_expert }, 0);
                        layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert }, 0);
                        layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff_exp, n_embd, n_expert }, 0);
                        layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), { n_embd, n_ff_exp, n_expert }, 0);

                        // Shared experts
                        layer.ffn_gate_inp_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP_SHEXP, "weight", i), { n_embd }, 0);
                        layer.ffn_gate_shexp     = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP,     "weight", i), { n_embd, hparams.n_ff_shexp }, 0);
                        layer.ffn_up_shexp       = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP,       "weight", i), { n_embd, hparams.n_ff_shexp }, 0);
                        layer.ffn_down_shexp     = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP,     "weight", i), { hparams.n_ff_shexp, n_embd }, 0);
                    }
                } break;
            case LLM_ARCH_QWEN35MOE:
                {
                    // token embedding table
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);

                    // output
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
                    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED);

                    // if output is NULL, init from the input tok embed
                    if (output == NULL) {
                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, TENSOR_DUPLICATED);
                    }

                    // per-expert FFN width; falls back to an even split of n_ff across the active experts
                    const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used;

                    // Calculate dimensions from hyperparameters
                    // (the generic ssm_* hparams carry the gated-delta-net dims:
                    //  d_state -> head dim, n_group -> #K heads, dt_rank -> #V heads)
                    const int64_t head_k_dim = hparams.ssm_d_state;
                    const int64_t head_v_dim = hparams.ssm_d_state;
                    const int64_t n_k_heads  = hparams.ssm_n_group;
                    const int64_t n_v_heads  = hparams.ssm_dt_rank;
                    const int64_t key_dim    = head_k_dim * n_k_heads;
                    const int64_t value_dim  = head_v_dim * n_v_heads;
                    const int64_t conv_dim   = key_dim * 2 + value_dim;

                    for (int i = 0; i < n_layer; ++i) {
                        auto & layer = layers[i];

                        layer.attn_norm      = create_tensor(tn(LLM_TENSOR_ATTN_NORM,      "weight", i), { n_embd }, 0);
                        layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), { n_embd }, 0);

                        if (!hparams.is_recurrent(i)) {
                            // Attention layers
                            // (wq is 2x the head width — gated attention; the extra half is the gate)
                            layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), { n_embd, n_embd_head_k * n_head * 2 }, 0);
                            layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), { n_embd, n_embd_k_gqa }, 0);
                            layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), { n_embd, n_embd_v_gqa }, 0);
                            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_k * n_head, n_embd }, 0);

                            // Q/K normalization for attention layers
                            layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), { n_embd_head_k }, 0);
                            layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), { n_embd_head_k }, 0);
                        } else {
                            // Linear attention (gated delta net) specific tensors
                            // Create tensors with calculated dimensions
                            // (unlike QWEN3NEXT: no legacy ssm_in, and beta/alpha are separate projections)
                            layer.wqkv           = create_tensor(tn(LLM_TENSOR_ATTN_QKV,       "weight", i), { n_embd, key_dim * 2 + value_dim }, TENSOR_NOT_REQUIRED);
                            layer.wqkv_gate      = create_tensor(tn(LLM_TENSOR_ATTN_GATE,      "weight", i), { n_embd, value_dim }, TENSOR_NOT_REQUIRED);
                            layer.ssm_conv1d     = create_tensor(tn(LLM_TENSOR_SSM_CONV1D,     "weight", i), { hparams.ssm_d_conv, conv_dim }, 0);
                            layer.ssm_dt         = create_tensor(tn(LLM_TENSOR_SSM_DT,         "bias",   i), { hparams.ssm_dt_rank }, 0);
                            layer.ssm_a          = create_tensor(tn(LLM_TENSOR_SSM_A_NOSCAN,             i), { hparams.ssm_dt_rank }, 0);
                            layer.ssm_beta       = create_tensor(tn(LLM_TENSOR_SSM_BETA,       "weight", i), { n_embd, n_v_heads }, 0);
                            layer.ssm_alpha      = create_tensor(tn(LLM_TENSOR_SSM_ALPHA,      "weight", i), { n_embd, n_v_heads }, 0);
                            layer.ssm_norm       = create_tensor(tn(LLM_TENSOR_SSM_NORM,       "weight", i), { head_v_dim }, 0);
                            layer.ssm_out        = create_tensor(tn(LLM_TENSOR_SSM_OUT,        "weight", i), { value_dim, n_embd }, 0);
                        }

                        // MoE FFN: router + expert banks on every layer
                        layer.ffn_gate_inp  = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP,  "weight", i), { n_embd, n_expert }, 0);
                        layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert }, 0);
                        layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff_exp, n_embd, n_expert }, 0);
                        layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), { n_embd, n_ff_exp, n_expert }, 0);

                        // Shared experts — width from n_ff_shexp, defaulting to the dense n_ff
                        const int64_t n_ff_shexp = hparams.n_ff_shexp ? hparams.n_ff_shexp : n_ff;

                        layer.ffn_gate_inp_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP_SHEXP, "weight", i), { n_embd }, 0);
                        layer.ffn_gate_shexp     = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP,     "weight", i), { n_embd, n_ff_shexp }, 0);
                        layer.ffn_up_shexp       = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP,       "weight", i), { n_embd, n_ff_shexp }, 0);
                        layer.ffn_down_shexp     = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP,     "weight", i), { n_ff_shexp, n_embd }, 0);
                    }
                } break;
7234            case LLM_ARCH_QWEN35:
7235                {
7236                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);
7237
7238                    // output
7239                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
7240                    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED);
7241
7242                    // if output is NULL, init from the input tok embed
7243                    if (output == NULL) {
7244                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, TENSOR_DUPLICATED);
7245                    }
7246
7247                    // Calculate dimensions from hyperparameters
7248                    const int64_t head_k_dim = hparams.ssm_d_state;
7249                    const int64_t head_v_dim = hparams.ssm_d_state;
7250                    const int64_t n_k_heads  = hparams.ssm_n_group;
7251                    const int64_t n_v_heads  = hparams.ssm_dt_rank;
7252                    const int64_t key_dim    = head_k_dim * n_k_heads;
7253                    const int64_t value_dim  = head_v_dim * n_v_heads;
7254                    const int64_t conv_dim   = key_dim * 2 + value_dim;
7255
7256                    for (int i = 0; i < n_layer; ++i) {
7257                        auto & layer = layers[i];
7258
7259                        layer.attn_norm      = create_tensor(tn(LLM_TENSOR_ATTN_NORM,      "weight", i), { n_embd }, 0);
7260                        layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), { n_embd }, 0);
7261
7262                        if (!hparams.is_recurrent(i)) {
7263                            // Attention layers
7264                            layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), { n_embd, n_embd_head_k * n_head * 2 }, 0);
7265                            layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), { n_embd, n_embd_k_gqa }, 0);
7266                            layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), { n_embd, n_embd_v_gqa }, 0);
7267                            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_k * n_head, n_embd }, 0);
7268
7269                            // Q/K normalization for attention layers
7270                            layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), { n_embd_head_k }, 0);
7271                            layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), { n_embd_head_k }, 0);
7272                        } else {
7273                            // Linear attention (gated delta net) specific tensors
7274                            // Create tensors with calculated dimensions
7275                            layer.wqkv           = create_tensor(tn(LLM_TENSOR_ATTN_QKV,       "weight", i), { n_embd, key_dim * 2 + value_dim }, TENSOR_NOT_REQUIRED);
7276                            layer.wqkv_gate      = create_tensor(tn(LLM_TENSOR_ATTN_GATE,      "weight", i), { n_embd, value_dim }, TENSOR_NOT_REQUIRED);
7277                            layer.ssm_conv1d     = create_tensor(tn(LLM_TENSOR_SSM_CONV1D,     "weight", i), { hparams.ssm_d_conv, conv_dim }, 0);
7278                            layer.ssm_dt         = create_tensor(tn(LLM_TENSOR_SSM_DT,         "bias",   i), { hparams.ssm_dt_rank }, 0);
7279                            layer.ssm_a          = create_tensor(tn(LLM_TENSOR_SSM_A_NOSCAN,             i), { hparams.ssm_dt_rank }, 0);
7280                            layer.ssm_beta       = create_tensor(tn(LLM_TENSOR_SSM_BETA,       "weight", i), { n_embd, n_v_heads }, 0);
7281                            layer.ssm_alpha      = create_tensor(tn(LLM_TENSOR_SSM_ALPHA,      "weight", i), { n_embd, n_v_heads }, 0);
7282                            layer.ssm_norm       = create_tensor(tn(LLM_TENSOR_SSM_NORM,       "weight", i), { head_v_dim }, 0);
7283                            layer.ssm_out        = create_tensor(tn(LLM_TENSOR_SSM_OUT,        "weight", i), { value_dim, n_embd }, 0);
7284                        }
7285
7286                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
7287                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
7288                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
7289                    }
7290                } break;
7291            case LLM_ARCH_MIMO2:
7292                {
7293                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
7294
7295                    // output
7296                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
7297                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);
7298
7299                    for (int i = 0; i < n_layer; ++i) {
7300                        auto & layer = layers[i];
7301                        uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(i);
7302                        uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(i);
7303                        uint32_t n_head = hparams.n_head(i);
7304
7305                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), { n_embd, n_embd_head_k * n_head }, 0);
7306                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), { n_embd, n_embd_k_gqa }, 0);
7307                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), { n_embd, n_embd_v_gqa }, 0);
7308                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_v * n_head, n_embd }, 0);
7309
7310                        layer.attn_norm  = create_tensor(tn(LLM_TENSOR_ATTN_NORM,  "weight", i), {n_embd}, 0);
7311                        layer.attn_sinks = create_tensor(tn(LLM_TENSOR_ATTN_SINKS, "weight", i), {n_head}, TENSOR_NOT_REQUIRED);
7312
7313                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
7314
7315                        // non-MoE branch
7316                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, TENSOR_NOT_REQUIRED);
7317                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, TENSOR_NOT_REQUIRED);
7318                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, TENSOR_NOT_REQUIRED);
7319
7320                        // MoE branch
7321                        int64_t n_ff_exp = hparams.n_ff_exp;
7322                        layer.ffn_gate_inp  = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP,  "weight", i), {n_embd, n_expert}, TENSOR_NOT_REQUIRED);
7323                        layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff_exp,   n_expert}, TENSOR_NOT_REQUIRED);
7324                        layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp,   n_embd, n_expert}, TENSOR_NOT_REQUIRED);
7325                        layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {n_embd, n_ff_exp,   n_expert}, TENSOR_NOT_REQUIRED);
7326                        layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, TENSOR_NOT_REQUIRED);
7327                    }
7328                } break;
7329            case LLM_ARCH_STEP35:
7330                {
7331                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
7332
7333                    // output
7334                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
7335                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);
7336
7337                    // STEP35 supports per-layer partial RoPE dims; rope factors are stored as a single shared tensor
7338                    // ("rope_freqs.weight") and ggml uses only the first (n_rot_l/2) entries per layer.
7339                    uint32_t n_rot_max = 0;
7340                    for (int i = 0; i < n_layer; ++i) {
7341                        n_rot_max = std::max(n_rot_max, hparams.n_rot);
7342                    }
7343                    if (n_rot_max == 0) {
7344                        n_rot_max = n_rot;
7345                    }
7346
7347                    for (int i = 0; i < n_layer; ++i) {
7348                        auto & layer = layers[i];
7349
7350                        const uint32_t n_head_l      = hparams.n_head(i);
7351                        const uint32_t n_embd_k_gqa  = hparams.n_embd_k_gqa(i);
7352                        const uint32_t n_embd_v_gqa  = hparams.n_embd_v_gqa(i);
7353
7354                        layer.attn_norm   = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
7355                        layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, TENSOR_NOT_REQUIRED);
7356                        layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, TENSOR_NOT_REQUIRED);
7357
7358                        // optional rope factors (llama3) / longrope tensors
7359                        if (hparams.rope_scaling_type_train == LLAMA_ROPE_SCALING_TYPE_LONGROPE) {
7360                            layer.rope_long  = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG,  "weight", i), {n_rot_max/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
7361                            layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), {n_rot_max/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
7362                        } else {
7363                            layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot_max/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
7364                        }
7365
7366                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd_head_k * n_head_l}, 0);
7367                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_k_gqa}, 0);
7368                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_v_gqa}, 0);
7369                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_v * n_head_l, n_embd}, 0);
7370
7371                        // head-wise attention gate (Step35 self_attn.g_proj)
7372                        layer.wqkv_gate = create_tensor(tn(LLM_TENSOR_ATTN_GATE, "weight", i), {n_embd, n_head_l}, TENSOR_NOT_REQUIRED);
7373
7374                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
7375
7376                        // dense MLP (leading dense blocks)
7377                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, TENSOR_NOT_REQUIRED);
7378                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, TENSOR_NOT_REQUIRED);
7379                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, TENSOR_NOT_REQUIRED);
7380
7381                        // MoE routed experts + selection bias (router_bias)
7382                        const int64_t n_ff_exp = hparams.n_ff_exp;
7383                        layer.ffn_gate_inp      = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP,  "weight", i), {n_embd, n_expert}, TENSOR_NOT_REQUIRED);
7384                        layer.ffn_gate_exps     = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff_exp,   n_expert}, TENSOR_NOT_REQUIRED);
7385                        layer.ffn_down_exps     = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp,   n_embd, n_expert}, TENSOR_NOT_REQUIRED);
7386                        layer.ffn_up_exps       = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {n_embd, n_ff_exp,   n_expert}, TENSOR_NOT_REQUIRED);
7387                        layer.ffn_exp_probs_b   = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, TENSOR_NOT_REQUIRED);
7388
7389                        // shared expert MLP
7390                        layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, hparams.n_ff_shexp}, TENSOR_NOT_REQUIRED);
7391                        layer.ffn_up_shexp   = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP,   "weight", i), {n_embd, hparams.n_ff_shexp}, TENSOR_NOT_REQUIRED);
7392                        layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {hparams.n_ff_shexp, n_embd}, TENSOR_NOT_REQUIRED);
7393                    }
7394                } break;
7395            case LLM_ARCH_MAINCODER:
7396                {
7397                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
7398
7399                    // output
7400                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
7401                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
7402                    // if output is NULL, init from the input tok embed
7403                    if (output == NULL) {
7404                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
7405                    }
7406
7407                    for (int i = 0; i < n_layer; ++i) {
7408                        auto & layer = layers[i];
7409
7410                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
7411
7412                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
7413                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa}, 0);
7414                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa}, 0);
7415                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
7416
7417                        layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
7418                        layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
7419
7420                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
7421                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
7422                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
7423                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
7424                    }
7425                } break;
7426            default:
7427                throw std::runtime_error("unknown architecture");
7428        }
7429
7430        if (n_moved_tensors > 0) {
7431            LLAMA_LOG_DEBUG("%s: tensor '%s' (%s) (and %d others) cannot be used with preferred buffer type %s, using %s instead\n",
7432                __func__, first_moved_tensor->name, ggml_type_name(first_moved_tensor->type), n_moved_tensors - 1,
7433                ggml_backend_buft_name(first_moved_from_buft), ggml_backend_buft_name(first_moved_to_buft));
7434        }
7435    }
7436
7437    ml.done_getting_tensors();
7438
7439    ml.init_mappings(true, use_mlock ? &pimpl->mlock_mmaps : nullptr);
7440    pimpl->mappings.reserve(ml.mappings.size());
7441
7442    // create the backend buffers
7443    std::vector<std::pair<ggml_context *, llama_buf_map>> ctx_buf_maps;
7444    ctx_buf_maps.reserve(ctx_map.size());
7445
7446    // Ensure we have enough capacity for the maximum backend buffer we will potentially create
7447    const size_t n_max_backend_buffer = ctx_map.size() * ml.files.size();
7448    pimpl->ctxs_bufs.reserve(n_max_backend_buffer);
7449
7450    for (auto & [buft, ctx_ptr] : ctx_map) {
7451        ggml_context * ctx = ctx_ptr.get();
7452
7453        // skip contexts without tensors
7454        if (ggml_get_first_tensor(ctx) == nullptr) {
7455            continue;
7456        }
7457
7458        llama_buf_map buf_map;
7459        buf_map.reserve(n_max_backend_buffer);
7460
7461        // check if it is possible to use buffer_from_host_ptr with this buffer type
7462        ggml_backend_dev_t dev = ggml_backend_buft_get_device(buft);
7463        if (!dev) {
7464            // FIXME: workaround for CPU backend buft having a NULL device
7465            dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
7466            if (!dev) {
7467                throw std::runtime_error(format("%s: no CPU backend found", __func__));
7468            }
7469        }
7470        ggml_backend_dev_props props;
7471        ggml_backend_dev_get_props(dev, &props);
7472        bool buffer_from_host_ptr_supported = props.caps.buffer_from_host_ptr;
7473        bool is_default_buft = buft == ggml_backend_dev_buffer_type(dev);
7474
7475        std::vector<ggml_backend_buffer_ptr> bufs;
7476        if (ml.use_mmap && use_mmap_buffer && buffer_from_host_ptr_supported && is_default_buft) {
7477            GGML_ASSERT(!ml.no_alloc);
7478            for (uint32_t idx = 0; idx < ml.files.size(); idx++) {
7479                // only the mmap region containing the tensors in the model is mapped to the backend buffer
7480                // this is important for metal with apple silicon: if the entire model could be mapped to a metal buffer,
7481                //     then we could just use metal for all layers
7482                // this allows using partial offloading when the model size exceeds the metal buffer size, but not the RAM size
7483                void * addr = nullptr;
7484                size_t first, last; // NOLINT
7485                ml.get_mapping_range(&first, &last, &addr, idx, ctx);
7486                if (first >= last) {
7487                    continue;
7488                }
7489                const size_t max_size = ggml_get_max_tensor_size(ctx);
7490                ggml_backend_buffer_t buf = ggml_backend_dev_buffer_from_host_ptr(dev, (char *) addr + first, last - first, max_size);
7491                if (buf == nullptr) {
7492                    throw std::runtime_error(format("unable to allocate %s buffer", ggml_backend_buft_name(buft)));
7493                }
7494                bufs.emplace_back(buf);
7495                buf_map.emplace(idx, buf);
7496            }
7497        } else {
7498            ggml_backend_buffer_t buf;
7499            if (ml.no_alloc) {
7500                buf = ggml_backend_buft_alloc_buffer(buft, /*size =*/ 0); // dummy buffer
7501                for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != nullptr; t = ggml_get_next_tensor(ctx, t)) {
7502                    t->buffer = buf; // set dummy buffer for weights so that the backend scheduler won't try to allocate them
7503                }
7504            } else {
7505                buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft); // real buffer
7506            }
7507            if (buf == nullptr) {
7508                throw std::runtime_error(format("unable to allocate %s buffer", ggml_backend_buft_name(buft)));
7509            }
7510            if (use_mlock && ggml_backend_buffer_is_host(buf)) {
7511                pimpl->mlock_bufs.emplace_back(new llama_mlock);
7512                auto & mlock_buf = pimpl->mlock_bufs.back();
7513                mlock_buf->init   (ggml_backend_buffer_get_base(buf));
7514                mlock_buf->grow_to(ggml_backend_buffer_get_size(buf));
7515            }
7516            bufs.emplace_back(buf);
7517            for (uint32_t idx = 0; idx < ml.files.size(); idx++) {
7518                buf_map.emplace(idx, buf);
7519            }
7520        }
7521        pimpl->ctxs_bufs.emplace_back(std::move(ctx_ptr), std::move(bufs));
7522
7523        for (auto & buf : buf_map) {
7524            // indicate that this buffer contains weights
7525            // this is used by ggml_backend_sched to improve op scheduling: ops that use a weight are preferably scheduled to the backend that contains the weight
7526            ggml_backend_buffer_set_usage(buf.second, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
7527        }
7528
7529        ctx_buf_maps.emplace_back(ctx, buf_map);
7530    }
7531
7532    if (llama_supports_gpu_offload()) {
7533        const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
7534
7535        int n_repeating = n_gpu;
7536        if (n_repeating > 0) {
7537            LLAMA_LOG_INFO("%s: offloading output layer to GPU\n", __func__);
7538            n_repeating--;
7539        }
7540        LLAMA_LOG_INFO("%s: offloading %d repeating layers to GPU\n", __func__, n_repeating);
7541
7542        const int max_backend_supported_layers = hparams.n_layer + 1;
7543        const int max_offloadable_layers       = hparams.n_layer + 1;
7544
7545        LLAMA_LOG_INFO("%s: offloaded %d/%d layers to GPU\n", __func__, std::min(n_gpu_layers, max_offloadable_layers), max_backend_supported_layers);
7546    }
7547
7548    // print memory requirements per buffer type
7549    for (auto & [_, bufs] : pimpl->ctxs_bufs) {
7550        for (auto & buf: bufs) {
7551            LLAMA_LOG_INFO("%s: %12s model buffer size = %8.2f MiB\n",
7552                __func__, ggml_backend_buffer_name(buf.get()), ggml_backend_buffer_get_size(buf.get()) / 1024.0 / 1024.0);
7553        }
7554    }
7555
7556    // populate tensors_by_name
7557    for (auto & [ctx, _] : pimpl->ctxs_bufs) {
7558        for (auto * cur = ggml_get_first_tensor(ctx.get()); cur != NULL; cur = ggml_get_next_tensor(ctx.get(), cur)) {
7559            tensors_by_name.emplace_back(ggml_get_name(cur), cur);
7560        }
7561    }
7562
7563    if (ml.no_alloc) {
7564        return true;
7565    }
7566
7567    // load tensor data
7568    for (auto & [ctx, buf_map] : ctx_buf_maps) {
7569        if (!ml.load_all_data(ctx, buf_map, use_mlock ? &pimpl->mlock_mmaps : NULL, params.progress_callback, params.progress_callback_user_data)) {
7570            return false;
7571        }
7572    }
7573
7574    if (use_mmap_buffer) {
7575        for (auto & mapping : ml.mappings) {
7576            pimpl->mappings.emplace_back(std::move(mapping));
7577        }
7578    }
7579
7580    return true;
7581}
7582
// Human-readable name of the model architecture (delegates to llm_arch_name for this->arch).
std::string llama_model::arch_name() const {
    return llm_arch_name(arch);
}
7586
// Human-readable name of the model type/size class (delegates to llm_type_name for this->type).
std::string llama_model::type_name() const {
    return llm_type_name(type);
}
7590
// Model description string, precomputed during load and cached in the pimpl.
std::string llama_model::desc() const {
    return pimpl->desc_str;
}
7594
7595size_t llama_model::size() const {
7596    return pimpl->n_bytes;
7597}
7598
7599size_t llama_model::n_tensors() const {
7600    return tensors_by_name.size();
7601}
7602
7603size_t llama_model::n_devices() const {
7604    return devices.size();
7605}
7606
7607uint32_t llama_model::n_gpu_layers() const {
7608    return params.n_gpu_layers >= 0 ? params.n_gpu_layers : hparams.n_layer + 1;
7609}
7610
// Split mode the model was loaded with, taken verbatim from the load-time params.
llama_split_mode llama_model::split_mode() const {
    return params.split_mode;
}
7614
7615std::map<ggml_backend_buffer_type_t, size_t> llama_model::memory_breakdown() const {
7616    std::map<ggml_backend_buffer_type_t, size_t> ret;
7617    for (const auto & [ctx, bufs] : pimpl->ctxs_bufs) {
7618        if (hparams.no_alloc) {
7619            GGML_ASSERT(bufs.size() == 1);
7620            ggml_backend_buffer_t buf = bufs[0].get();
7621            GGML_ASSERT(ggml_backend_buffer_get_base(buf) == nullptr);
7622            ggml_backend_buffer_type_t buft = ggml_backend_buffer_get_type(buf);
7623            ret[buft] += ggml_backend_alloc_ctx_tensors_from_buft_size(ctx.get(), buft);
7624        } else {
7625            for (const auto & buf : bufs) {
7626                // GGML_ASSERT(ggml_backend_buffer_get_base(buf.get()) != nullptr); // multi_buffer does not have a defined base
7627                ret[ggml_backend_buffer_get_type(buf.get())] += ggml_backend_buffer_get_size(buf.get());
7628            }
7629        }
7630    }
7631    return ret;
7632}
7633
7634uint64_t llama_model::n_elements() const {
7635    return pimpl->n_elements;
7636}
7637
7638void llama_model::print_info() const {
7639    const std::string rope_scaling_type = llama_rope_scaling_type_name(hparams.rope_scaling_type_train);
7640
7641    auto print_f = [](const std::function<uint32_t(uint32_t)> & f, uint32_t n) {
7642        bool is_var = false;
7643
7644        std::vector<uint32_t> v;
7645        for (uint32_t i = 0; i < n; ++i) {
7646            v.push_back(f(i));
7647            if (v[i] != v[0]) {
7648                is_var = true;
7649            }
7650        }
7651
7652        std::stringstream ss;
7653
7654        if (is_var) {
7655            ss << "[";
7656            for (uint32_t i = 0; i < n; ++i) {
7657                ss << v[i];
7658                if (i < n - 1) {
7659                    ss << ", ";
7660                }
7661            }
7662            ss << "]";
7663        } else {
7664            ss << v[0];
7665        }
7666
7667        return ss.str();
7668    };
7669
7670    // hparams
7671    LLAMA_LOG_INFO("%s: arch                  = %s\n",     __func__, arch_name().c_str());
7672    LLAMA_LOG_INFO("%s: vocab_only            = %d\n",     __func__, hparams.vocab_only);
7673    LLAMA_LOG_INFO("%s: no_alloc              = %d\n",     __func__, hparams.no_alloc);
7674
7675    if (!hparams.vocab_only) {
7676        LLAMA_LOG_INFO("%s: n_ctx_train           = %u\n",     __func__, hparams.n_ctx_train);
7677        LLAMA_LOG_INFO("%s: n_embd                = %u\n",     __func__, hparams.n_embd);
7678        LLAMA_LOG_INFO("%s: n_embd_inp            = %u\n",     __func__, hparams.n_embd_inp());
7679        LLAMA_LOG_INFO("%s: n_layer               = %u\n",     __func__, hparams.n_layer);
7680        LLAMA_LOG_INFO("%s: n_head                = %s\n",     __func__, print_f([&](uint32_t il) { return hparams.n_head(il);    }, hparams.n_layer).c_str());
7681        LLAMA_LOG_INFO("%s: n_head_kv             = %s\n",     __func__, print_f([&](uint32_t il) { return hparams.n_head_kv(il); }, hparams.n_layer).c_str());
7682        LLAMA_LOG_INFO("%s: n_rot                 = %u\n",     __func__, hparams.n_rot);
7683        LLAMA_LOG_INFO("%s: n_swa                 = %u\n",     __func__, hparams.n_swa);
7684        LLAMA_LOG_INFO("%s: is_swa_any            = %u\n",     __func__, hparams.is_swa_any());
7685        LLAMA_LOG_INFO("%s: n_embd_head_k         = %u\n",     __func__, hparams.n_embd_head_k);
7686        LLAMA_LOG_INFO("%s: n_embd_head_v         = %u\n",     __func__, hparams.n_embd_head_v);
7687        LLAMA_LOG_INFO("%s: n_gqa                 = %s\n",     __func__, print_f([&](uint32_t il) { return hparams.n_gqa(il);        }, hparams.n_layer).c_str());
7688        LLAMA_LOG_INFO("%s: n_embd_k_gqa          = %s\n",     __func__, print_f([&](uint32_t il) { return hparams.n_embd_k_gqa(il); }, hparams.n_layer).c_str());
7689        LLAMA_LOG_INFO("%s: n_embd_v_gqa          = %s\n",     __func__, print_f([&](uint32_t il) { return hparams.n_embd_v_gqa(il); }, hparams.n_layer).c_str());
7690        LLAMA_LOG_INFO("%s: f_norm_eps            = %.1e\n",   __func__, hparams.f_norm_eps);
7691        LLAMA_LOG_INFO("%s: f_norm_rms_eps        = %.1e\n",   __func__, hparams.f_norm_rms_eps);
7692        LLAMA_LOG_INFO("%s: f_clamp_kqv           = %.1e\n",   __func__, hparams.f_clamp_kqv);
7693        LLAMA_LOG_INFO("%s: f_max_alibi_bias      = %.1e\n",   __func__, hparams.f_max_alibi_bias);
7694        LLAMA_LOG_INFO("%s: f_logit_scale         = %.1e\n",   __func__, hparams.f_logit_scale);
7695        LLAMA_LOG_INFO("%s: f_attn_scale          = %.1e\n",   __func__, hparams.f_attention_scale);
7696        LLAMA_LOG_INFO("%s: n_ff                  = %s\n",     __func__, print_f([&](uint32_t il) { return hparams.n_ff(il); }, hparams.n_layer).c_str());
7697        LLAMA_LOG_INFO("%s: n_expert              = %u\n",     __func__, hparams.n_expert);
7698        LLAMA_LOG_INFO("%s: n_expert_used         = %u\n",     __func__, hparams.n_expert_used);
7699        LLAMA_LOG_INFO("%s: n_expert_groups       = %d\n",     __func__, hparams.n_expert_groups);
7700        LLAMA_LOG_INFO("%s: n_group_used          = %d\n",     __func__, hparams.n_group_used);
7701        LLAMA_LOG_INFO("%s: causal attn           = %d\n",     __func__, hparams.causal_attn);
7702        LLAMA_LOG_INFO("%s: pooling type          = %d\n",     __func__, hparams.pooling_type);
7703        LLAMA_LOG_INFO("%s: rope type             = %d\n",     __func__, hparams.rope_type);
7704        LLAMA_LOG_INFO("%s: rope scaling          = %s\n",     __func__, rope_scaling_type.c_str());
7705        LLAMA_LOG_INFO("%s: freq_base_train       = %.1f\n",   __func__, hparams.rope_freq_base_train);
7706        LLAMA_LOG_INFO("%s: freq_scale_train      = %g\n",     __func__, hparams.rope_freq_scale_train);
7707        if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) {
7708            LLAMA_LOG_INFO("%s: freq_base_swa         = %.1f\n",   __func__, hparams.rope_freq_base_train_swa);
7709            LLAMA_LOG_INFO("%s: freq_scale_swa        = %g\n",     __func__, hparams.rope_freq_scale_train_swa);
7710        }
7711        LLAMA_LOG_INFO("%s: n_ctx_orig_yarn       = %u\n",     __func__, hparams.n_ctx_orig_yarn);
7712        LLAMA_LOG_INFO("%s: rope_yarn_log_mul     = %.4f\n",   __func__, hparams.rope_yarn_log_mul);
7713        LLAMA_LOG_INFO("%s: rope_finetuned        = %s\n",     __func__, hparams.rope_finetuned ? "yes" : "unknown");
7714        // MRoPE (Multi-axis Rotary Position Embedding) sections
7715        if (const auto & s = hparams.rope_sections; s[0] || s[1] || s[2] || s[3]) {
7716            LLAMA_LOG_INFO("%s: mrope sections        = [%d, %d, %d, %d]\n", __func__, s[0], s[1], s[2], s[3]);
7717        }
7718        if (!classifier_labels.empty()) {
7719            LLAMA_LOG_INFO("%s: n_cls_out             = %u\n", __func__, hparams.n_cls_out);
7720
7721            size_t i = 0;
7722            for (auto label : classifier_labels) {
7723                LLAMA_LOG_INFO("%s: cls_label[%2zu]         = %s\n", __func__, i++, label.c_str());
7724            }
7725        }
7726    }
7727
7728    if (arch == LLM_ARCH_MAMBA ||
7729        arch == LLM_ARCH_MAMBA2 ||
7730        arch == LLM_ARCH_JAMBA ||
7731        arch == LLM_ARCH_FALCON_H1 ||
7732        arch == LLM_ARCH_PLAMO2 ||
7733        arch == LLM_ARCH_GRANITE_HYBRID ||
7734        arch == LLM_ARCH_QWEN3NEXT ||
7735        arch == LLM_ARCH_QWEN35 ||
7736        arch == LLM_ARCH_QWEN35MOE ||
7737        arch == LLM_ARCH_NEMOTRON_H ||
7738        arch == LLM_ARCH_NEMOTRON_H_MOE) {
7739        LLAMA_LOG_INFO("%s: ssm_d_conv            = %u\n",     __func__, hparams.ssm_d_conv);
7740        LLAMA_LOG_INFO("%s: ssm_d_inner           = %u\n",     __func__, hparams.ssm_d_inner);
7741        LLAMA_LOG_INFO("%s: ssm_d_state           = %u\n",     __func__, hparams.ssm_d_state);
7742        LLAMA_LOG_INFO("%s: ssm_dt_rank           = %u\n",     __func__, hparams.ssm_dt_rank);
7743        LLAMA_LOG_INFO("%s: ssm_n_group           = %u\n",     __func__, hparams.ssm_n_group);
7744        LLAMA_LOG_INFO("%s: ssm_dt_b_c_rms        = %d\n",     __func__, hparams.ssm_dt_b_c_rms);
7745    }
7746
7747    LLAMA_LOG_INFO("%s: model type            = %s\n",     __func__, type_name().c_str());
7748    if (pimpl->n_elements >= 1e12) {
7749        LLAMA_LOG_INFO("%s: model params          = %.2f T\n", __func__, pimpl->n_elements*1e-12);
7750    } else if (pimpl->n_elements >= 1e9) {
7751        LLAMA_LOG_INFO("%s: model params          = %.2f B\n", __func__, pimpl->n_elements*1e-9);
7752    } else if (pimpl->n_elements >= 1e6) {
7753        LLAMA_LOG_INFO("%s: model params          = %.2f M\n", __func__, pimpl->n_elements*1e-6);
7754    } else {
7755        LLAMA_LOG_INFO("%s: model params          = %.2f K\n", __func__, pimpl->n_elements*1e-3);
7756    }
7757
7758    // general kv
7759    LLAMA_LOG_INFO("%s: general.name          = %s\n",    __func__, name.c_str());
7760
7761    if (arch == LLM_ARCH_DEEPSEEK) {
7762        LLAMA_LOG_INFO("%s: n_layer_dense_lead    = %d\n",     __func__, hparams.n_layer_dense_lead);
7763        LLAMA_LOG_INFO("%s: n_ff_exp              = %d\n",     __func__, hparams.n_ff_exp);
7764        LLAMA_LOG_INFO("%s: n_expert_shared       = %d\n",     __func__, hparams.n_expert_shared);
7765        LLAMA_LOG_INFO("%s: expert_weights_scale  = %.1f\n",   __func__, hparams.expert_weights_scale);
7766    }
7767
7768    if (arch == LLM_ARCH_DEEPSEEK2) {
7769        LLAMA_LOG_INFO("%s: n_layer_dense_lead    = %d\n",     __func__, hparams.n_layer_dense_lead);
7770        LLAMA_LOG_INFO("%s: n_lora_q              = %d\n",     __func__, hparams.n_lora_q);
7771        LLAMA_LOG_INFO("%s: n_lora_kv             = %d\n",     __func__, hparams.n_lora_kv);
7772        LLAMA_LOG_INFO("%s: n_embd_head_k_mla     = %d\n",     __func__, hparams.n_embd_head_k_mla());
7773        LLAMA_LOG_INFO("%s: n_embd_head_v_mla     = %d\n",     __func__, hparams.n_embd_head_v_mla());
7774        LLAMA_LOG_INFO("%s: n_ff_exp              = %d\n",     __func__, hparams.n_ff_exp);
7775        LLAMA_LOG_INFO("%s: n_expert_shared       = %d\n",     __func__, hparams.n_expert_shared);
7776        LLAMA_LOG_INFO("%s: expert_weights_scale  = %.1f\n",   __func__, hparams.expert_weights_scale);
7777        LLAMA_LOG_INFO("%s: expert_weights_norm   = %d\n",     __func__, hparams.expert_weights_norm);
7778        LLAMA_LOG_INFO("%s: expert_gating_func    = %s\n",     __func__, llama_expert_gating_func_name((llama_expert_gating_func_type) hparams.expert_gating_func));
7779    }
7780
7781    if (arch == LLM_ARCH_QWEN2MOE) {
7782        LLAMA_LOG_INFO("%s: n_ff_exp              = %d\n",     __func__, hparams.n_ff_exp);
7783        LLAMA_LOG_INFO("%s: n_ff_shexp            = %d\n",     __func__, hparams.n_ff_shexp);
7784    }
7785
7786    if (arch == LLM_ARCH_QWEN3MOE || arch == LLM_ARCH_OPENAI_MOE || arch == LLM_ARCH_QWEN3VLMOE || arch == LLM_ARCH_RND1) {
7787        LLAMA_LOG_INFO("%s: n_ff_exp              = %d\n",     __func__, hparams.n_ff_exp);
7788    }
7789
7790    if (arch == LLM_ARCH_MINICPM ||
7791        arch == LLM_ARCH_GRANITE ||
7792        arch == LLM_ARCH_GRANITE_MOE ||
7793        arch == LLM_ARCH_GRANITE_HYBRID ||
7794        arch == LLM_ARCH_NEMOTRON_H_MOE) {
7795        LLAMA_LOG_INFO("%s: f_embedding_scale     = %f\n", __func__, hparams.f_embedding_scale);
7796        LLAMA_LOG_INFO("%s: f_residual_scale      = %f\n", __func__, hparams.f_residual_scale);
7797        LLAMA_LOG_INFO("%s: f_attention_scale     = %f\n", __func__, hparams.f_attention_scale);
7798        LLAMA_LOG_INFO("%s: n_ff_shexp            = %d\n", __func__, hparams.n_ff_shexp);
7799    }
7800
7801    if (arch == LLM_ARCH_BAILINGMOE) {
7802        LLAMA_LOG_INFO("%s: n_layer_dense_lead    = %d\n",     __func__, hparams.n_layer_dense_lead);
7803        LLAMA_LOG_INFO("%s: n_ff_exp              = %d\n",     __func__, hparams.n_ff_exp);
7804        LLAMA_LOG_INFO("%s: n_expert_shared       = %d\n",     __func__, hparams.n_expert_shared);
7805        LLAMA_LOG_INFO("%s: expert_weights_scale  = %.1f\n",   __func__, hparams.expert_weights_scale);
7806        LLAMA_LOG_INFO("%s: expert_weights_norm   = %d\n",     __func__, hparams.expert_weights_norm);
7807    }
7808
7809    if (arch == LLM_ARCH_BAILINGMOE2) {
7810        LLAMA_LOG_INFO("%s: n_layer_dense_lead    = %d\n",     __func__, hparams.n_layer_dense_lead);
7811        LLAMA_LOG_INFO("%s: n_ff_exp              = %d\n",     __func__, hparams.n_ff_exp);
7812        LLAMA_LOG_INFO("%s: n_ff_shexp            = %d\n",     __func__, hparams.n_ff_shexp);
7813        LLAMA_LOG_INFO("%s: n_expert_shared       = %d\n",     __func__, hparams.n_expert_shared);
7814        LLAMA_LOG_INFO("%s: expert_weights_scale  = %.1f\n",   __func__, hparams.expert_weights_scale);
7815        LLAMA_LOG_INFO("%s: expert_weights_norm   = %d\n",     __func__, hparams.expert_weights_norm);
7816        LLAMA_LOG_INFO("%s: expert_gating_func    = %s\n",     __func__, llama_expert_gating_func_name((llama_expert_gating_func_type) hparams.expert_gating_func));
7817        LLAMA_LOG_INFO("%s: nextn_predict_layers  = %d\n",     __func__, hparams.nextn_predict_layers);
7818    }
7819
7820    if (arch == LLM_ARCH_SMALLTHINKER || arch == LLM_ARCH_LFM2MOE) {
7821        LLAMA_LOG_INFO("%s: n_ff_exp              = %d\n",     __func__, hparams.n_ff_exp);
7822        LLAMA_LOG_INFO("%s: expert_gating_func    = %s\n",     __func__, llama_expert_gating_func_name((llama_expert_gating_func_type) hparams.expert_gating_func));
7823    }
7824
7825    if (arch == LLM_ARCH_GROVEMOE) {
7826        LLAMA_LOG_INFO("%s: n_ff_exp              = %d\n",     __func__, hparams.n_ff_exp);
7827        LLAMA_LOG_INFO("%s: n_ff_chexp            = %d\n",     __func__, hparams.n_ff_chexp);
7828        LLAMA_LOG_INFO("%s: n_group_experts       = %d\n",     __func__, hparams.n_group_experts);
7829        LLAMA_LOG_INFO("%s: expert_group_scale    = %.2f\n",   __func__, hparams.expert_group_scale);
7830    }
7831
7832    vocab.print_info();
7833}
7834
7835ggml_backend_dev_t llama_model::dev_layer(int il) const {
7836    return pimpl->dev_layer.at(il).dev;
7837}
7838
7839ggml_backend_dev_t llama_model::dev_output() const {
7840    return pimpl->dev_output.dev;
7841}
7842
7843template<typename F>
7844static bool buft_supported(ggml_backend_buffer_type_t buft, ggml_backend_dev_t dev, F & fn) {
7845    ggml_init_params params = {
7846        /*.mem_size   =*/ ggml_tensor_overhead()*8,
7847        /*.mem_buffer =*/ NULL,
7848        /*.no_alloc   =*/ true,
7849    };
7850
7851    ggml_context_ptr ctx { ggml_init(params) };
7852    if (!ctx) {
7853        throw std::runtime_error(format("failed to create ggml context"));
7854    }
7855
7856    ggml_backend_buffer_ptr buf { ggml_backend_buft_alloc_buffer(buft, 0) };
7857    ggml_tensor * op_tensor = fn(ctx.get());
7858    for (int i = 0; i < GGML_MAX_SRC; i++) {
7859        if (op_tensor->src[i] != nullptr) {
7860            assert(op_tensor->src[i]->buffer == nullptr);
7861            op_tensor->src[i]->buffer = buf.get();
7862        }
7863    }
7864
7865    bool op_supported = ggml_backend_dev_supports_op(dev, op_tensor);
7866
7867    return op_supported;
7868}
7869
7870template<typename F>
7871static ggml_backend_buffer_type_t select_buft(const buft_list_t & buft_list, const F & fn) {
7872    for (const auto & cur : buft_list) {
7873        ggml_backend_dev_t cur_dev = cur.first;
7874        ggml_backend_buffer_type_t cur_buft = cur.second;
7875        if (buft_supported(cur_buft, cur_dev, fn)) {
7876            return cur_buft;
7877        }
7878    }
7879
7880    throw std::runtime_error(format("no suitable buffer type found"));
7881}
7882
7883ggml_backend_buffer_type_t llama_model::select_buft(int il) const {
7884    return ::select_buft(
7885            *pimpl->dev_layer.at(il).buft_list,
7886            [&](ggml_context * ctx) {
7887                ggml_tensor * cur = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hparams.n_embd);
7888                ggml_tensor * layer_dir = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hparams.n_embd);
7889                return ggml_add(ctx, cur, layer_dir);
7890            });
7891}
7892
7893bool llama_model::has_tensor_overrides() const {
7894    return pimpl->has_tensor_overrides;
7895}
7896
7897const ggml_tensor * llama_model::get_tensor(const char * name) const {
7898    auto it = std::find_if(tensors_by_name.begin(), tensors_by_name.end(),
7899            [name](const std::pair<std::string, ggml_tensor *> & it) {
7900                return it.first == name;
7901            });
7902    if (it == tensors_by_name.end()) {
7903        return nullptr;
7904    }
7905
7906    return it->second;
7907}
7908
7909float llama_model::get_rope_freq_base (const llama_cparams & cparams, int il) const {
7910    return hparams.is_swa(il) ? hparams.rope_freq_base_train_swa : cparams.rope_freq_base;
7911}
7912
7913float llama_model::get_rope_freq_scale(const llama_cparams & cparams, int il) const {
7914    return hparams.is_swa(il) ? hparams.rope_freq_scale_train_swa : cparams.rope_freq_scale;
7915}
7916
7917ggml_tensor * llama_model::get_rope_factors(const llama_cparams & cparams, int il) const {
7918    const uint32_t n_ctx_seq = cparams.n_ctx_seq;
7919
7920    // choose long/short freq factors based on the context size
7921    if (layers[il].rope_freqs != nullptr) {
7922        return layers[il].rope_freqs;
7923    }
7924
7925    if (n_ctx_seq > hparams.n_ctx_orig_yarn) {
7926        return layers[il].rope_long;
7927    }
7928
7929    return layers[il].rope_short;
7930}
7931
// Create the memory module (attention KV cache, recurrent state, or a hybrid
// of both) matching this model's architecture, or nullptr for architectures
// that use no memory module at all. The caller takes ownership of the
// returned pointer.
llama_memory_i * llama_model::create_memory(const llama_memory_params & params, const llama_cparams & cparams) const {
    llama_memory_i * res;

    switch (arch) {
        // Models that need specific instantiation should be handled in the
        // switch statement
        case LLM_ARCH_BERT:
        case LLM_ARCH_JINA_BERT_V2:
        case LLM_ARCH_JINA_BERT_V3:
        case LLM_ARCH_NOMIC_BERT:
        case LLM_ARCH_NOMIC_BERT_MOE:
        case LLM_ARCH_NEO_BERT:
        case LLM_ARCH_WAVTOKENIZER_DEC:
        case LLM_ARCH_MODERN_BERT:
        case LLM_ARCH_GEMMA_EMBEDDING:
        case LLM_ARCH_DREAM:
        case LLM_ARCH_LLADA:
        case LLM_ARCH_LLADA_MOE:
        case LLM_ARCH_RND1:
            {
                // these architectures use no memory module
                res = nullptr;
            } break;
        // Models that need standard caching should rely on recurrent/hybrid
        // checks
        default:
            {
                if (llm_arch_is_recurrent(arch)) {
                    // purely recurrent models: per-sequence recurrent state only
                    res = new llama_memory_recurrent(
                            *this,
                            GGML_TYPE_F32,
                            GGML_TYPE_F32,
                            cparams.offload_kqv,
                            std::max((uint32_t) 1, cparams.n_seq_max),
                            cparams.n_seq_max,
                            nullptr);
                } else if (llm_arch_is_hybrid(arch)) {

                    // The main difference between hybrid architectures is the
                    // layer filters, so pick the right one here
                    llama_memory_hybrid::layer_filter_cb filter_attn = nullptr;
                    llama_memory_hybrid::layer_filter_cb filter_recr = nullptr;
                    if (arch == LLM_ARCH_FALCON_H1) {
                        // Falcon-H1: every layer participates in both the
                        // attention cache and the recurrent state
                        filter_attn = [&](int32_t) { return true; };
                        filter_recr = [&](int32_t) { return true; };
                    } else if (arch == LLM_ARCH_NEMOTRON_H || arch == LLM_ARCH_NEMOTRON_H_MOE) {
                        // Nemotron-H: attention layers are the non-recurrent
                        // layers without an FFN (n_ff == 0); recurrent layers
                        // are the recurrent ones without an FFN
                        filter_attn = [&](int32_t il) {
                            return !hparams.is_recurrent(il) && hparams.n_ff(il) == 0;
                        };
                        filter_recr = [&](int32_t il) {
                            return hparams.is_recurrent(il) && hparams.n_ff(il) == 0;
                        };
                    }

                    if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) {
                        // Use hybrid-iswa for hybrid models with SWA
                        res = new llama_memory_hybrid_iswa(
                            /* model             */ *this,
                            /* attn_type_k       */ params.type_k,
                            /* attn_type_v       */ params.type_v,
                            /* attn_v_trans      */ !cparams.flash_attn,
                            /* attn_swa_full     */ params.swa_full,
                            /* attn_kv_size      */ cparams.n_ctx,
                            /* attn_n_ubatch     */ cparams.n_ubatch,
                            /* attn_n_pad        */ 1,
                            /* recurrent_type_r  */ GGML_TYPE_F32,
                            /* recurrent_type_s  */ GGML_TYPE_F32,
                            /* recurrent_rs_size */ std::max((uint32_t) 1, cparams.n_seq_max),
                            /* n_seq_max         */ cparams.n_seq_max,
                            /* offload           */ cparams.offload_kqv,
                            /* unified           */ cparams.kv_unified,
                            /* filter_attn       */ std::move(filter_attn),
                            /* filter_recr       */ std::move(filter_recr));
                    } else {
                        res = new llama_memory_hybrid(
                            /* model             */ *this,
                            /* attn_type_k       */ params.type_k,
                            /* attn_type_v       */ params.type_v,
                            /* attn_v_trans      */ !cparams.flash_attn,
                            /* attn_kv_size      */ cparams.n_ctx,
                            /* attn_n_pad        */ 1,
                            /* attn_n_swa        */ hparams.n_swa,
                            /* attn_swa_type     */ hparams.swa_type,
                            /* recurrent_type_k  */ GGML_TYPE_F32,
                            /* recurrent_type_v  */ GGML_TYPE_F32,
                            /* recurrent_kv_size */ std::max((uint32_t) 1, cparams.n_seq_max),
                            /* n_seq_max         */ cparams.n_seq_max,
                            /* offload           */ cparams.offload_kqv,
                            /* unified           */ cparams.kv_unified,
                            /* filter_attn       */ std::move(filter_attn),
                            /* filter_recr       */ std::move(filter_recr));
                    }
                } else {
                    // standard attention-only models
                    llama_memory_i::layer_reuse_cb reuse = nullptr;

                    if (arch == LLM_ARCH_GEMMA3N) {
                        // layers at or beyond n_layer_kv_from_start reuse the
                        // KV cache of an earlier layer; -1 means no reuse
                        reuse = [&](int32_t il) {
                            if (il >= (int32_t) hparams.n_layer_kv_from_start) {
                                return (int32_t) hparams.n_layer_kv_from_start - (hparams.is_swa(il) ? 2 : 1);
                            }

                            return -1;
                        };
                    }

                    if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) {
                        GGML_ASSERT(hparams.is_swa_any());

                        res = new llama_kv_cache_iswa(
                                *this,
                                params.type_k,
                                params.type_v,
                                !cparams.flash_attn,
                                cparams.offload_kqv,
                                params.swa_full,
                                cparams.kv_unified,
                                cparams.n_ctx_seq,
                                cparams.n_seq_max,
                                cparams.n_ubatch,
                                1,
                                nullptr,
                                reuse);
                    } else {
                        GGML_ASSERT(!hparams.is_swa_any());

                        res = new llama_kv_cache(
                                *this,
                                params.type_k,
                                params.type_v,
                                !cparams.flash_attn,
                                cparams.offload_kqv,
                                cparams.kv_unified,
                                cparams.n_ctx_seq,
                                cparams.n_seq_max,
                                1,
                                hparams.n_swa,
                                hparams.swa_type,
                                nullptr,
                                nullptr);
                    }
                }
            }
    }

    return res;
}
8077
8078ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
8079    std::unique_ptr<llm_graph_context> llm;
8080
8081    switch (arch) {
8082        case LLM_ARCH_LLAMA:
8083            {
8084                llm = std::make_unique<llm_build_llama<false>>(*this, params);
8085            } break;
8086        case LLM_ARCH_LLAMA4:
8087            {
8088                if (hparams.swa_type == LLAMA_SWA_TYPE_NONE) {
8089                    llm = std::make_unique<llm_build_llama<false>>(*this, params);
8090                } else {
8091                    llm = std::make_unique<llm_build_llama_iswa>(*this, params);
8092                }
8093            } break;
8094        case LLM_ARCH_LLAMA_EMBED:
8095            {
8096                llm = std::make_unique<llm_build_llama<true>>(*this, params);
8097            } break;
8098        case LLM_ARCH_MAINCODER:
8099            {
8100                llm = std::make_unique<llm_build_maincoder>(*this, params);
8101            } break;
8102        case LLM_ARCH_DECI:
8103            {
8104                llm = std::make_unique<llm_build_deci>(*this, params);
8105            } break;
8106        case LLM_ARCH_BAICHUAN:
8107            {
8108                llm = std::make_unique<llm_build_baichuan>(*this, params);
8109            } break;
8110        case LLM_ARCH_FALCON:
8111            {
8112                llm = std::make_unique<llm_build_falcon>(*this, params);
8113            } break;
8114        case LLM_ARCH_GROK:
8115            {
8116                llm = std::make_unique<llm_build_grok>(*this, params);
8117            } break;
8118        case LLM_ARCH_STARCODER:
8119            {
8120                llm = std::make_unique<llm_build_starcoder>(*this, params);
8121            } break;
8122        case LLM_ARCH_REFACT:
8123            {
8124                llm = std::make_unique<llm_build_refact>(*this, params);
8125            } break;
8126        case LLM_ARCH_BERT:
8127        case LLM_ARCH_JINA_BERT_V2:
8128        case LLM_ARCH_JINA_BERT_V3:
8129        case LLM_ARCH_NOMIC_BERT:
8130        case LLM_ARCH_NOMIC_BERT_MOE:
8131            {
8132                llm = std::make_unique<llm_build_bert>(*this, params);
8133            } break;
8134        case LLM_ARCH_MODERN_BERT:
8135            {
8136                llm = std::make_unique<llm_build_modern_bert>(*this, params);
8137            } break;
8138        case LLM_ARCH_NEO_BERT:
8139            {
8140                llm = std::make_unique<llm_build_neo_bert>(*this, params);
8141            } break;
8142        case LLM_ARCH_BLOOM:
8143            {
8144                llm = std::make_unique<llm_build_bloom>(*this, params);
8145            } break;
8146        case LLM_ARCH_MPT:
8147            {
8148                llm = std::make_unique<llm_build_mpt>(*this, params);
8149            } break;
8150        case LLM_ARCH_STABLELM:
8151            {
8152                llm = std::make_unique<llm_build_stablelm>(*this, params);
8153            } break;
8154        case LLM_ARCH_QWEN:
8155            {
8156                llm = std::make_unique<llm_build_qwen>(*this, params);
8157            } break;
8158        case LLM_ARCH_QWEN2:
8159            {
8160                llm = std::make_unique<llm_build_qwen2>(*this, params);
8161            } break;
8162        case LLM_ARCH_DREAM:
8163            {
8164                llm = std::make_unique<llm_build_dream>(*this, params);
8165            }
8166            break;
8167        case LLM_ARCH_LLADA:
8168            {
8169                llm = std::make_unique<llm_build_llada>(*this, params);
8170            }
8171            break;
8172        case LLM_ARCH_LLADA_MOE:
8173            {
8174                llm = std::make_unique<llm_build_llada_moe>(*this, params);
8175            }
8176            break;
8177        case LLM_ARCH_RND1:
8178            {
8179                llm = std::make_unique<llm_build_rnd1>(*this, params);
8180            }
8181            break;
8182        case LLM_ARCH_QWEN2VL:
8183            {
8184                llm = std::make_unique<llm_build_qwen2vl>(*this, params);
8185            } break;
8186        case LLM_ARCH_QWEN2MOE:
8187            {
8188                llm = std::make_unique<llm_build_qwen2moe>(*this, params);
8189            } break;
8190        case LLM_ARCH_QWEN3:
8191            {
8192                llm = std::make_unique<llm_build_qwen3>(*this, params);
8193            } break;
8194        case LLM_ARCH_QWEN3MOE:
8195            {
8196                llm = std::make_unique<llm_build_qwen3moe>(*this, params);
8197            } break;
8198        case LLM_ARCH_QWEN3VL:
8199            {
8200                llm = std::make_unique<llm_build_qwen3vl>(*this, params);
8201            } break;
8202        case LLM_ARCH_QWEN3VLMOE:
8203            {
8204                llm = std::make_unique<llm_build_qwen3vlmoe>(*this, params);
8205            } break;
8206        case LLM_ARCH_PHI2:
8207            {
8208                llm = std::make_unique<llm_build_phi2>(*this, params);
8209            } break;
8210        case LLM_ARCH_PHI3:
8211        case LLM_ARCH_PHIMOE:
8212            {
8213                if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) {
8214                    llm = std::make_unique<llm_build_phi3<true>> (*this, params);
8215                } else {
8216                    llm = std::make_unique<llm_build_phi3<false>>(*this, params);
8217                }
8218            } break;
8219        case LLM_ARCH_PLAMO:
8220            {
8221                llm = std::make_unique<llm_build_plamo>(*this, params);
8222            } break;
8223        case LLM_ARCH_PLAMO2:
8224            {
8225                llm = std::make_unique<llm_build_plamo2>(*this, params);
8226            } break;
8227        case LLM_ARCH_PLAMO3:
8228            {
8229                if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) {
8230                    llm = std::make_unique<llm_build_plamo3<true>> (*this, params);
8231                } else {
8232                    llm = std::make_unique<llm_build_plamo3<false>>(*this, params);
8233                }
8234            } break;
8235        case LLM_ARCH_GPT2:
8236            {
8237                llm = std::make_unique<llm_build_gpt2>(*this, params);
8238            } break;
8239        case LLM_ARCH_CODESHELL:
8240            {
8241                llm = std::make_unique<llm_build_codeshell>(*this, params);
8242            } break;
8243        case LLM_ARCH_ORION:
8244            {
8245                llm = std::make_unique<llm_build_orion>(*this, params);
8246            } break;
8247        case LLM_ARCH_INTERNLM2:
8248            {
8249                llm = std::make_unique<llm_build_internlm2>(*this, params);
8250            } break;
8251        case LLM_ARCH_MINICPM3:
8252            {
8253                llm = std::make_unique<llm_build_minicpm3>(*this, params);
8254            } break;
8255        case LLM_ARCH_GEMMA:
8256            {
8257                llm = std::make_unique<llm_build_gemma>(*this, params);
8258            } break;
8259        case LLM_ARCH_GEMMA2:
8260            {
8261                llm = std::make_unique<llm_build_gemma2_iswa>(*this, params);
8262            } break;
8263        case LLM_ARCH_GEMMA3:
8264            {
8265                if (hparams.swa_type == LLAMA_SWA_TYPE_STANDARD) {
8266                    llm = std::make_unique<llm_build_gemma3<true>>(*this, params);
8267                } else {
8268                    llm = std::make_unique<llm_build_gemma3<false>>(*this, params);
8269                }
8270            } break;
8271        case LLM_ARCH_GEMMA3N:
8272            {
8273                llm = std::make_unique<llm_build_gemma3n_iswa>(*this, params);
8274            } break;
8275        case LLM_ARCH_GEMMA_EMBEDDING:
8276            {
8277                llm = std::make_unique<llm_build_gemma_embedding>(*this, params);
8278            } break;
8279        case LLM_ARCH_STARCODER2:
8280            {
8281                llm = std::make_unique<llm_build_starcoder2>(*this, params);
8282            } break;
8283        case LLM_ARCH_MAMBA:
8284        case LLM_ARCH_MAMBA2:
8285            {
8286                llm = std::make_unique<llm_build_mamba>(*this, params);
8287            } break;
8288        case LLM_ARCH_JAMBA:
8289            {
8290                llm = std::make_unique<llm_build_jamba>(*this, params);
8291            } break;
8292        case LLM_ARCH_XVERSE:
8293            {
8294                llm = std::make_unique<llm_build_xverse>(*this, params);
8295            } break;
8296        case LLM_ARCH_COMMAND_R:
8297            {
8298                llm = std::make_unique<llm_build_command_r>(*this, params);
8299            } break;
8300        case LLM_ARCH_COHERE2:
8301            {
8302                llm = std::make_unique<llm_build_cohere2_iswa>(*this, params);
8303            } break;
8304        case LLM_ARCH_DBRX:
8305            {
8306                llm = std::make_unique<llm_build_dbrx>(*this, params);
8307            } break;
8308        case LLM_ARCH_OLMO:
8309            {
8310                llm = std::make_unique<llm_build_olmo>(*this, params);
8311            } break;
8312        case LLM_ARCH_OLMO2:
8313            {
8314                if (hparams.swa_type == LLAMA_SWA_TYPE_STANDARD) {
8315                    llm = std::make_unique<llm_build_olmo2<true>>(*this, params);
8316                } else {
8317                    llm = std::make_unique<llm_build_olmo2<false>>(*this, params);
8318                }
8319            } break;
8320        case LLM_ARCH_OLMOE:
8321            {
8322                llm = std::make_unique<llm_build_olmoe>(*this, params);
8323            } break;
8324        case LLM_ARCH_OPENELM:
8325            {
8326                llm = std::make_unique<llm_build_openelm>(*this, params);
8327            } break;
8328        case LLM_ARCH_GPTNEOX:
8329            {
8330                llm = std::make_unique<llm_build_gptneox>(*this, params);
8331            } break;
8332        case LLM_ARCH_ARCTIC:
8333            {
8334                llm = std::make_unique<llm_build_arctic>(*this, params);
8335            } break;
8336        case LLM_ARCH_DEEPSEEK:
8337            {
8338                llm = std::make_unique<llm_build_deepseek>(*this, params);
8339            } break;
8340        case LLM_ARCH_DEEPSEEK2:
8341            {
8342                llm = std::make_unique<llm_build_deepseek2>(*this, params);
8343            } break;
8344        case LLM_ARCH_CHATGLM:
8345            {
8346                llm = std::make_unique<llm_build_chatglm>(*this, params);
8347            } break;
8348        case LLM_ARCH_GLM4:
8349            {
8350                llm = std::make_unique<llm_build_glm4>(*this, params);
8351            } break;
8352        case LLM_ARCH_GLM4_MOE:
8353            {
8354                llm = std::make_unique<llm_build_glm4_moe>(*this, params);
8355            } break;
8356        case LLM_ARCH_BITNET:
8357            {
8358                llm = std::make_unique<llm_build_bitnet>(*this, params);
8359            } break;
8360        case LLM_ARCH_T5:
8361            {
8362                switch (params.gtype) {
8363                    case LLM_GRAPH_TYPE_ENCODER:
8364                        llm = std::make_unique<llm_build_t5_enc>(*this, params);
8365                        break;
8366                    case LLM_GRAPH_TYPE_DEFAULT:
8367                    case LLM_GRAPH_TYPE_DECODER:
8368                        llm = std::make_unique<llm_build_t5_dec>(*this, params);
8369                        break;
8370                    default:
8371                        GGML_ABORT("invalid graph type");
8372                };
8373            } break;
8374        case LLM_ARCH_T5ENCODER:
8375            {
8376                llm = std::make_unique<llm_build_t5_enc>(*this, params);
8377            }
8378            break;
8379        case LLM_ARCH_JAIS:
8380            {
8381                llm = std::make_unique<llm_build_jais>(*this, params);
8382            } break;
8383        case LLM_ARCH_NEMOTRON:
8384            {
8385                llm = std::make_unique<llm_build_nemotron>(*this, params);
8386            } break;
8387        case LLM_ARCH_NEMOTRON_H:
8388        case LLM_ARCH_NEMOTRON_H_MOE:
8389            {
8390                llm = std::make_unique<llm_build_nemotron_h>(*this, params);
8391            } break;
8392        case LLM_ARCH_EXAONE:
8393            {
8394                llm = std::make_unique<llm_build_exaone>(*this, params);
8395            } break;
8396        case LLM_ARCH_EXAONE4:
8397            {
8398                if (hparams.swa_type == LLAMA_SWA_TYPE_STANDARD) {
8399                    llm = std::make_unique<llm_build_exaone4<true>>(*this, params);
8400                } else {
8401                    llm = std::make_unique<llm_build_exaone4<false>>(*this, params);
8402                }
8403            } break;
8404        case LLM_ARCH_EXAONE_MOE:
8405            {
8406                llm = std::make_unique<llm_build_exaone_moe>(*this, params);
8407            } break;
8408        case LLM_ARCH_RWKV6:
8409            {
8410                llm = std::make_unique<llm_build_rwkv6>(*this, params);
8411            } break;
8412        case LLM_ARCH_RWKV6QWEN2:
8413            {
8414                llm = std::make_unique<llm_build_rwkv6qwen2>(*this, params);
8415            } break;
8416        case LLM_ARCH_RWKV7:
8417            {
8418                llm = std::make_unique<llm_build_rwkv7>(*this, params);
8419            } break;
8420        case LLM_ARCH_ARWKV7:
8421            {
8422                llm = std::make_unique<llm_build_arwkv7>(*this, params);
8423            } break;
8424        case LLM_ARCH_GRANITE:
8425        case LLM_ARCH_GRANITE_MOE:
8426        case LLM_ARCH_MINICPM:
8427            {
8428                llm = std::make_unique<llm_build_granite>(*this, params);
8429            } break;
8430        case LLM_ARCH_GRANITE_HYBRID:
8431            {
8432                llm = std::make_unique<llm_build_granite_hybrid>(*this, params);
8433            } break;
8434        case LLM_ARCH_CHAMELEON:
8435            {
8436                llm = std::make_unique<llm_build_chameleon>(*this, params);
8437            } break;
8438        case LLM_ARCH_WAVTOKENIZER_DEC:
8439            {
8440                llm = std::make_unique<llm_build_wavtokenizer_dec>(*this, params);
8441            } break;
8442        case LLM_ARCH_PLM:
8443            {
8444                llm = std::make_unique<llm_build_plm>(*this, params);
8445            } break;
8446        case LLM_ARCH_BAILINGMOE:
8447            {
8448                llm = std::make_unique<llm_build_bailingmoe>(*this, params);
8449            } break;
8450        case LLM_ARCH_BAILINGMOE2:
8451            {
8452                llm = std::make_unique<llm_build_bailingmoe2>(*this, params);
8453            } break;
8454        case LLM_ARCH_SEED_OSS:
8455            {
8456                llm = std::make_unique<llm_build_seed_oss>(*this, params);
8457            } break;
8458        case LLM_ARCH_DOTS1:
8459            {
8460                llm = std::make_unique<llm_build_dots1>(*this, params);
8461            } break;
8462        case LLM_ARCH_ARCEE:
8463            {
8464                llm = std::make_unique<llm_build_arcee>(*this, params);
8465            } break;
8466        case LLM_ARCH_AFMOE:
8467            {
8468                llm = std::make_unique<llm_build_afmoe>(*this, params);
8469            } break;
8470        case LLM_ARCH_ERNIE4_5:
8471            {
8472                llm = std::make_unique<llm_build_ernie4_5>(*this, params);
8473            } break;
8474        case LLM_ARCH_ERNIE4_5_MOE:
8475            {
8476                llm = std::make_unique<llm_build_ernie4_5_moe>(*this, params);
8477            } break;
8478        case LLM_ARCH_HUNYUAN_MOE:
8479            {
8480                llm = std::make_unique<llm_build_hunyuan_moe>(*this, params);
8481            } break;
8482        case LLM_ARCH_HUNYUAN_DENSE:
8483            {
8484                llm = std::make_unique<llm_build_hunyuan_dense>(*this, params);
8485            } break;
8486        case LLM_ARCH_SMOLLM3:
8487            {
8488                llm = std::make_unique<llm_build_smollm3>(*this, params);
8489            } break;
8490        case LLM_ARCH_OPENAI_MOE:
8491            {
8492                llm = std::make_unique<llm_build_openai_moe_iswa>(*this, params);
8493            } break;
8494        case LLM_ARCH_FALCON_H1:
8495            {
8496                llm = std::make_unique<llm_build_falcon_h1>(*this, params);
8497            } break;
8498        case LLM_ARCH_LFM2:
8499        case LLM_ARCH_LFM2MOE:
8500            {
8501                llm = std::make_unique<llm_build_lfm2>(*this, params);
8502            } break;
8503        case LLM_ARCH_SMALLTHINKER:
8504            {
8505                if (hparams.swa_type == LLAMA_SWA_TYPE_STANDARD) {
8506                    llm = std::make_unique<llm_build_smallthinker<true>> (*this, params);
8507                } else {
8508                    llm = std::make_unique<llm_build_smallthinker<false>>(*this, params);
8509                }
8510            } break;
8511        case LLM_ARCH_GROVEMOE:
8512            {
8513                llm = std::make_unique<llm_build_grovemoe>(*this, params);
8514            } break;
8515        case LLM_ARCH_APERTUS:
8516            {
8517                llm = std::make_unique<llm_build_apertus>(*this, params);
8518            } break;
8519        case LLM_ARCH_MINIMAX_M2:
8520            {
8521                llm = std::make_unique<llm_build_minimax_m2>(*this, params);
8522            } break;
8523        case LLM_ARCH_COGVLM:
8524            {
8525                llm = std::make_unique<llm_build_cogvlm>(*this, params);
8526            } break;
8527        case LLM_ARCH_PANGU_EMBED:
8528            {
8529                llm = std::make_unique<llm_build_pangu_embedded>(*this, params);
8530            } break;
8531        case LLM_ARCH_QWEN3NEXT:
8532            {
8533                llm = std::make_unique<llm_build_qwen3next>(*this, params);
8534            } break;
8535        case LLM_ARCH_QWEN35:
8536            {
8537                llm = std::make_unique<llm_build_qwen35>(*this, params);
8538            } break;
8539        case LLM_ARCH_QWEN35MOE:
8540            {
8541                llm = std::make_unique<llm_build_qwen35moe>(*this, params);
8542            } break;
8543        case LLM_ARCH_MISTRAL3:
8544            {
8545                llm = std::make_unique<llm_build_mistral3>(*this, params);
8546            } break;
8547        case LLM_ARCH_MIMO2:
8548            {
8549                llm = std::make_unique<llm_build_mimo2_iswa>(*this, params);
8550            } break;
8551        case LLM_ARCH_KIMI_LINEAR:
8552            {
8553                llm = std::make_unique<llm_build_kimi_linear>(*this, params);
8554            } break;
8555        case LLM_ARCH_STEP35:
8556            {
8557                llm = std::make_unique<llm_build_step35_iswa>(*this, params);
8558            } break;
8559        default:
8560            GGML_ABORT("fatal error");
8561    }
8562
8563    // add on pooling layer
8564    llm->build_pooling(cls, cls_b, cls_out, cls_out_b);
8565
8566    // add backend sampling layers (if any)
8567    llm->build_sampling();
8568
8569    // if the gguf model was converted with --sentence-transformers-dense-modules
8570    // there will be two additional dense projection layers
8571    // dense linear projections are applied after pooling
8572    // TODO: move reranking logic here and generalize
8573    llm->build_dense_out(dense_2_out_layers, dense_3_out_layers);
8574
8575    llm->res->set_outputs();
8576
8577    return llm->res->get_gf();
8578}
8579
8580
8581//
8582// interface implementation
8583//
8584
8585llama_model_params llama_model_default_params() {
8586    llama_model_params result = {
8587        /*.devices                     =*/ nullptr,
8588        /*.tensor_buft_overrides       =*/ nullptr,
8589        /*.n_gpu_layers                =*/ -1,
8590        /*.split_mode                  =*/ LLAMA_SPLIT_MODE_LAYER,
8591        /*.main_gpu                    =*/ 0,
8592        /*.tensor_split                =*/ nullptr,
8593        /*.progress_callback           =*/ nullptr,
8594        /*.progress_callback_user_data =*/ nullptr,
8595        /*.kv_overrides                =*/ nullptr,
8596        /*.vocab_only                  =*/ false,
8597        /*.use_mmap                    =*/ true,
8598        /*.use_direct_io               =*/ false,
8599        /*.use_mlock                   =*/ false,
8600        /*.check_tensors               =*/ false,
8601        /*.use_extra_bufts             =*/ true,
8602        /*.no_host                     =*/ false,
8603        /*.no_alloc                    =*/ false,
8604    };
8605
8606    return result;
8607}
8608
// returns the vocabulary embedded in the model (pointer into the model; valid
// for the model's lifetime)
const llama_vocab * llama_model_get_vocab(const llama_model * model) {
    return &model->vocab;
}
8612
// legacy name kept for backward compatibility - forwards to llama_model_free
void llama_free_model(llama_model * model) {
    llama_model_free(model);
}
8616
// destroys a model created by the loader; safe to call with nullptr
void llama_model_free(llama_model * model) {
    delete model;
}
8620
// context length the model was trained with
int32_t llama_model_n_ctx_train(const llama_model * model) {
    return model->hparams.n_ctx_train;
}
8624
// embedding dimension of the model
int32_t llama_model_n_embd(const llama_model * model) {
    return model->hparams.n_embd;
}
8628
// input embedding dimension (may differ from n_embd for some architectures)
int32_t llama_model_n_embd_inp(const llama_model * model) {
    return model->hparams.n_embd_inp();
}
8632
// output embedding dimension (may differ from n_embd for some architectures)
int32_t llama_model_n_embd_out(const llama_model * model) {
    return model->hparams.n_embd_out();
}
8636
// number of transformer layers
int32_t llama_model_n_layer(const llama_model * model) {
    return model->hparams.n_layer;
}
8640
// number of attention heads (value for the first layer when per-layer counts vary)
int32_t llama_model_n_head(const llama_model * model) {
    return model->hparams.n_head();
}
8644
// number of KV heads (value for the first layer when per-layer counts vary)
int32_t llama_model_n_head_kv(const llama_model * model) {
    return model->hparams.n_head_kv();
}
8648
// sliding-window attention size (0 when the model does not use SWA)
int32_t llama_model_n_swa(const llama_model * model) {
    return model->hparams.n_swa;
}
8652
// number of classifier outputs (relevant for classification models only)
uint32_t llama_model_n_cls_out(const struct llama_model * model) {
    return model->hparams.n_cls_out;
}
8656
8657const char * llama_model_cls_label(const struct llama_model * model, uint32_t i) {
8658    if (i < model->classifier_labels.size()) {
8659        return model->classifier_labels[i].c_str();
8660    }
8661
8662    return nullptr;
8663}
8664
// deprecated
int32_t llama_n_ctx_train(const llama_model * model) {
    // kept for API compatibility - forwards to llama_model_n_ctx_train
    return llama_model_n_ctx_train(model);
}
8669
// deprecated
int32_t llama_n_embd(const llama_model * model) {
    // kept for API compatibility - forwards to llama_model_n_embd
    return llama_model_n_embd(model);
}
8674
// deprecated
int32_t llama_n_layer(const llama_model * model) {
    // kept for API compatibility - forwards to llama_model_n_layer
    return llama_model_n_layer(model);
}
8679
// deprecated
int32_t llama_n_head(const llama_model * model) {
    // kept for API compatibility - forwards to llama_model_n_head
    return llama_model_n_head(model);
}
8684
// maps each architecture to the RoPE variant its attention layers use;
// every enumerator of llm_arch must appear in exactly one group below
llama_rope_type llama_model_rope_type(const llama_model * model) {
    switch (model->arch) {
        // these models do not use RoPE
        case LLM_ARCH_CLIP:
        case LLM_ARCH_GPT2:
        case LLM_ARCH_GPTJ:
        case LLM_ARCH_MPT:
        case LLM_ARCH_REFACT:
        case LLM_ARCH_BLOOM:
        case LLM_ARCH_MAMBA:
        case LLM_ARCH_MAMBA2:
        case LLM_ARCH_JAMBA:
        case LLM_ARCH_JINA_BERT_V2:
        case LLM_ARCH_T5:
        case LLM_ARCH_T5ENCODER:
        case LLM_ARCH_JAIS:
        case LLM_ARCH_RWKV6:
        case LLM_ARCH_RWKV6QWEN2:
        case LLM_ARCH_RWKV7:
        case LLM_ARCH_ARWKV7:
        case LLM_ARCH_WAVTOKENIZER_DEC:
        case LLM_ARCH_NEMOTRON_H:
        case LLM_ARCH_NEMOTRON_H_MOE:
        case LLM_ARCH_KIMI_LINEAR:
            return LLAMA_ROPE_TYPE_NONE;

        // use what we call a normal RoPE, operating on pairs of consecutive head values
        case LLM_ARCH_LLAMA:
        case LLM_ARCH_LLADA:
        case LLM_ARCH_LLAMA4:
        case LLM_ARCH_DECI:
        case LLM_ARCH_BAICHUAN:
        case LLM_ARCH_STARCODER:
        case LLM_ARCH_INTERNLM2:
        case LLM_ARCH_MINICPM:
        case LLM_ARCH_XVERSE:
        case LLM_ARCH_COMMAND_R:
        case LLM_ARCH_COHERE2:
        case LLM_ARCH_OLMO:
        case LLM_ARCH_ARCTIC:
        case LLM_ARCH_DEEPSEEK:
        case LLM_ARCH_DEEPSEEK2:
        case LLM_ARCH_PLM:
        case LLM_ARCH_CHATGLM:
        case LLM_ARCH_GRANITE:
        case LLM_ARCH_GRANITE_MOE:
        case LLM_ARCH_GRANITE_HYBRID:
        case LLM_ARCH_CHAMELEON:
        case LLM_ARCH_BAILINGMOE:
        case LLM_ARCH_NEO_BERT:
        case LLM_ARCH_SMOLLM3:
        case LLM_ARCH_ARCEE:
        case LLM_ARCH_ERNIE4_5:
        case LLM_ARCH_ERNIE4_5_MOE:
        case LLM_ARCH_MISTRAL3:
        case LLM_ARCH_LLAMA_EMBED:
        case LLM_ARCH_MAINCODER:
            return LLAMA_ROPE_TYPE_NORM;

        // the pairs of head values are offset by n_rot/2
        case LLM_ARCH_FALCON:
        case LLM_ARCH_FALCON_H1:
        case LLM_ARCH_GROK:
        case LLM_ARCH_DBRX:
        case LLM_ARCH_BERT:
        case LLM_ARCH_JINA_BERT_V3:
        case LLM_ARCH_MODERN_BERT:
        case LLM_ARCH_NOMIC_BERT:
        case LLM_ARCH_NOMIC_BERT_MOE:
        case LLM_ARCH_STABLELM:
        case LLM_ARCH_BITNET:
        case LLM_ARCH_QWEN:
        case LLM_ARCH_QWEN2:
        case LLM_ARCH_DREAM:
        case LLM_ARCH_QWEN2MOE:
        case LLM_ARCH_QWEN3:
        case LLM_ARCH_QWEN3MOE:
        case LLM_ARCH_LLADA_MOE:
        case LLM_ARCH_RND1:
        case LLM_ARCH_OLMO2:
        case LLM_ARCH_OLMOE:
        case LLM_ARCH_PHI2:
        case LLM_ARCH_PHI3:
        case LLM_ARCH_PHIMOE:
        case LLM_ARCH_PLAMO:
        case LLM_ARCH_PLAMO2:
        case LLM_ARCH_PLAMO3:
        case LLM_ARCH_GEMMA:
        case LLM_ARCH_GEMMA2:
        case LLM_ARCH_GEMMA3:
        case LLM_ARCH_GEMMA3N:
        case LLM_ARCH_GEMMA_EMBEDDING:
        case LLM_ARCH_STARCODER2:
        case LLM_ARCH_OPENELM:
        case LLM_ARCH_GPTNEOX:
        case LLM_ARCH_CODESHELL:
        case LLM_ARCH_ORION:
        case LLM_ARCH_NEMOTRON:
        case LLM_ARCH_EXAONE:
        case LLM_ARCH_EXAONE4:
        case LLM_ARCH_EXAONE_MOE:
        case LLM_ARCH_MINICPM3:
        case LLM_ARCH_BAILINGMOE2:
        case LLM_ARCH_DOTS1:
        case LLM_ARCH_HUNYUAN_MOE:
        case LLM_ARCH_OPENAI_MOE:
        case LLM_ARCH_HUNYUAN_DENSE:
        case LLM_ARCH_LFM2:
        case LLM_ARCH_LFM2MOE:
        case LLM_ARCH_SMALLTHINKER:
        case LLM_ARCH_SEED_OSS:
        case LLM_ARCH_GROVEMOE:
        case LLM_ARCH_APERTUS:
        case LLM_ARCH_MINIMAX_M2:
        case LLM_ARCH_COGVLM:
        case LLM_ARCH_PANGU_EMBED:
        case LLM_ARCH_AFMOE:
        case LLM_ARCH_QWEN3NEXT:
        case LLM_ARCH_MIMO2:
        case LLM_ARCH_STEP35:
            return LLAMA_ROPE_TYPE_NEOX;

        case LLM_ARCH_QWEN2VL:
            return LLAMA_ROPE_TYPE_MROPE;
        case LLM_ARCH_QWEN3VL:
        case LLM_ARCH_QWEN3VLMOE:
        case LLM_ARCH_QWEN35:
        case LLM_ARCH_QWEN35MOE:
            return LLAMA_ROPE_TYPE_IMROPE;

        // GLM4 variants select M-RoPE dynamically based on the loaded hparams
        case LLM_ARCH_GLM4:
            return model->hparams.use_mrope() ? LLAMA_ROPE_TYPE_MROPE : LLAMA_ROPE_TYPE_NORM;
        case LLM_ARCH_GLM4_MOE:
            return model->hparams.use_mrope() ? LLAMA_ROPE_TYPE_MROPE : LLAMA_ROPE_TYPE_NEOX;

        // all model arches should be listed explicitly here
        case LLM_ARCH_UNKNOWN:
            GGML_ABORT("unknown architecture");
    }

    // unreachable when the switch is exhaustive; keeps compilers that cannot
    // prove exhaustiveness from warning about a missing return
    return LLAMA_ROPE_TYPE_NONE;
}
8827
// RoPE frequency scaling factor the model was trained with
float llama_model_rope_freq_scale_train(const llama_model * model) {
    return model->hparams.rope_freq_scale_train;
}
8831
8832int32_t llama_model_meta_val_str(const llama_model * model, const char * key, char * buf, size_t buf_size) {
8833    const auto & it = model->gguf_kv.find(key);
8834    if (it == model->gguf_kv.end()) {
8835        if (buf_size > 0) {
8836            buf[0] = '\0';
8837        }
8838        return -1;
8839    }
8840    return snprintf(buf, buf_size, "%s", it->second.c_str());
8841}
8842
8843int32_t llama_model_meta_count(const llama_model * model) {
8844    return (int)model->gguf_kv.size();
8845}
8846
// maps a llama_model_meta_key enum value to its canonical GGUF key string;
// returns nullptr for values outside the known sampling-metadata set
const char * llama_model_meta_key_str(llama_model_meta_key key) {
    switch (key) {
        case LLAMA_MODEL_META_KEY_SAMPLING_SEQUENCE:        return "general.sampling.sequence";
        case LLAMA_MODEL_META_KEY_SAMPLING_TOP_K:           return "general.sampling.top_k";
        case LLAMA_MODEL_META_KEY_SAMPLING_TOP_P:           return "general.sampling.top_p";
        case LLAMA_MODEL_META_KEY_SAMPLING_MIN_P:           return "general.sampling.min_p";
        case LLAMA_MODEL_META_KEY_SAMPLING_XTC_PROBABILITY: return "general.sampling.xtc_probability";
        case LLAMA_MODEL_META_KEY_SAMPLING_XTC_THRESHOLD:   return "general.sampling.xtc_threshold";
        case LLAMA_MODEL_META_KEY_SAMPLING_TEMP:            return "general.sampling.temp";
        case LLAMA_MODEL_META_KEY_SAMPLING_PENALTY_LAST_N:  return "general.sampling.penalty_last_n";
        case LLAMA_MODEL_META_KEY_SAMPLING_PENALTY_REPEAT:  return "general.sampling.penalty_repeat";
        case LLAMA_MODEL_META_KEY_SAMPLING_MIROSTAT:        return "general.sampling.mirostat";
        case LLAMA_MODEL_META_KEY_SAMPLING_MIROSTAT_TAU:    return "general.sampling.mirostat_tau";
        case LLAMA_MODEL_META_KEY_SAMPLING_MIROSTAT_ETA:    return "general.sampling.mirostat_eta";
        default:                                            return nullptr;
    }
}
8864
8865int32_t llama_model_meta_key_by_index(const llama_model * model, int i, char * buf, size_t buf_size) {
8866    if (i < 0 || i >= (int)model->gguf_kv.size()) {
8867        if (buf_size > 0) {
8868            buf[0] = '\0';
8869        }
8870        return -1;
8871    }
8872    auto it = model->gguf_kv.begin();
8873    std::advance(it, i);
8874    return snprintf(buf, buf_size, "%s", it->first.c_str());
8875}
8876
8877int32_t llama_model_meta_val_str_by_index(const llama_model * model, int32_t i, char * buf, size_t buf_size) {
8878    if (i < 0 || i >= (int)model->gguf_kv.size()) {
8879        if (buf_size > 0) {
8880            buf[0] = '\0';
8881        }
8882        return -1;
8883    }
8884    auto it = model->gguf_kv.begin();
8885    std::advance(it, i);
8886    return snprintf(buf, buf_size, "%s", it->second.c_str());
8887}
8888
// copies a human-readable model description into buf (snprintf semantics)
int32_t llama_model_desc(const llama_model * model, char * buf, size_t buf_size) {
    return snprintf(buf, buf_size, "%s", model->desc().c_str());
}
8892
// total size of the model's tensor data in bytes
uint64_t llama_model_size(const llama_model * model) {
    return model->size();
}
8896
8897const char * llama_model_chat_template(const llama_model * model, const char * name) {
8898    const auto key = name ? LLM_KV(model->arch, name)(LLM_KV_TOKENIZER_CHAT_TEMPLATE)
8899        : LLM_KV(model->arch)(LLM_KV_TOKENIZER_CHAT_TEMPLATE);
8900    const auto & it = model->gguf_kv.find(key);
8901    if (it == model->gguf_kv.end()) {
8902        // one-off fix for very popular models (so we are not flooded with issues)
8903        // do not extend this list unless absolutely necessary
8904        // Mistral-Small-2503 does not have built-in chat template
8905        llama_vocab_pre_type pre_type = model->vocab.get_pre_type();
8906        if (!name && pre_type == LLAMA_VOCAB_PRE_TYPE_TEKKEN && model->layers.size() == 40) {
8907            return "mistral-v7-tekken";
8908        }
8909
8910        return nullptr;
8911    }
8912
8913    return it->second.c_str();
8914}
8915
// total number of parameters (tensor elements) in the model
uint64_t llama_model_n_params(const llama_model * model) {
    return model->n_elements();
}
8919
8920bool llama_model_has_encoder(const llama_model * model) {
8921    switch (model->arch) {
8922        case LLM_ARCH_T5:        return true;
8923        case LLM_ARCH_T5ENCODER: return true;
8924        default:                 return false;
8925    }
8926}
8927
8928bool llama_model_has_decoder(const llama_model * model) {
8929    switch (model->arch) {
8930        case LLM_ARCH_T5ENCODER: return false;
8931        default:                 return true;
8932    }
8933}
8934
// token that starts decoding for encoder-decoder models
llama_token llama_model_decoder_start_token(const llama_model * model) {
    return model->hparams.dec_start_token_id;
}
8938
// true when the architecture uses recurrent state (e.g. Mamba/RWKV families)
bool llama_model_is_recurrent(const llama_model * model) {
    return llm_arch_is_recurrent(model->arch);
}
8942
// true when the architecture mixes attention and recurrent layers
bool llama_model_is_hybrid(const llama_model * model) {
    return llm_arch_is_hybrid(model->arch);
}
8946
// true when the architecture is a diffusion language model
bool llama_model_is_diffusion(const llama_model * model) {
    return llm_arch_is_diffusion(model->arch);
}
8950
// internal: exposes the (name, tensor) pairs loaded for this model
const std::vector<std::pair<std::string, ggml_tensor *>> & llama_internal_get_tensor_map(const llama_model * model) {
    return model->tensors_by_name;
}