archive llama.cpp-b8008.tar.gz
corpus lotr.txt map1_bromm.txt map1_dagna.txt map1_keldor.txt map1_skara.txt map1_thrain.txt
llama.cpp
.devops
nix apps.nix devshells.nix docker.nix jetson-support.nix nixpkgs-instances.nix package-gguf-py.nix package.nix python-scripts.nix scope.nix sif.nix
cann.Dockerfile cpu.Dockerfile cuda-new.Dockerfile cuda.Dockerfile intel.Dockerfile llama-cli-cann.Dockerfile llama-cpp-cuda.srpm.spec llama-cpp.srpm.spec musa.Dockerfile rocm.Dockerfile s390x.Dockerfile tools.sh vulkan.Dockerfile
.gemini settings.json
.github
ISSUE_TEMPLATE 010-bug-compilation.yml 011-bug-results.yml 019-bug-misc.yml 020-enhancement.yml 030-research.yml 040-refactor.yml config.yml
actions
get-tag-name action.yml
install-exe action.yml
linux-setup-spacemit action.yml
linux-setup-vulkan action.yml
unarchive-tar action.yml
windows-setup-cuda action.yml
windows-setup-rocm action.yml
workflows bench.yml.disabled build-cache.yml build-cmake-pkg.yml build-linux-cross.yml build.yml check-vendor.yml close-issue.yml copilot-setup-steps.yml docker.yml editorconfig.yml gguf-publish.yml labeler.yml pre-tokenizer-hashes.yml python-check-requirements.yml python-lint.yml python-type-check.yml release.yml server-metal.yml server-webui.yml server.yml update-ops-docs.yml winget.yml
labeler.yml pull_request_template.md
benches
dgx-spark aime25_openai__gpt-oss-120b-high_temp1.0_20251109_094547.html aime25_openai__gpt-oss-120b-high_temp1.0_20251109_094547.json aime25_openai__gpt-oss-120b-high_temp1.0_20251109_094547_allresults.json dgx-spark.md
mac-m2-ultra mac-m2-ultra.md
ci README-MUSA.md README.md run.sh
cmake arm64-apple-clang.cmake arm64-windows-llvm.cmake build-info.cmake common.cmake download-models.cmake git-vars.cmake license.cmake llama-config.cmake.in llama.pc.in riscv64-spacemit-linux-gnu-gcc.cmake x64-windows-llvm.cmake
common
jinja README.md caps.cpp caps.h lexer.cpp lexer.h parser.cpp parser.h runtime.cpp runtime.h string.cpp string.h utils.h value.cpp value.h
CMakeLists.txt arg.cpp arg.h base64.hpp build-info.cpp.in chat-parser-xml-toolcall.cpp chat-parser-xml-toolcall.h chat-parser.cpp chat-parser.h chat-peg-parser.cpp chat-peg-parser.h chat.cpp chat.h common.cpp common.h console.cpp console.h debug.cpp debug.h download.cpp download.h http.h json-partial.cpp json-partial.h json-schema-to-grammar.cpp json-schema-to-grammar.h llguidance.cpp log.cpp log.h ngram-cache.cpp ngram-cache.h ngram-map.cpp ngram-map.h ngram-mod.cpp ngram-mod.h peg-parser.cpp peg-parser.h preset.cpp preset.h regex-partial.cpp regex-partial.h sampling.cpp sampling.h speculative.cpp speculative.h unicode.cpp unicode.h
docs
android imported-into-android-studio.jpg
backend
VirtGPU configuration.md development.md
snapdragon CMakeUserPresets.json README.md developer.md windows.md
BLIS.md CANN.md CUDA-FEDORA.md OPENCL.md SYCL.md VirtGPU.md ZenDNN.md zDNN.md
development
llama-star idea-arch.key idea-arch.pdf
HOWTO-add-model.md debugging-tests.md parsing.md token_generation_performance_tips.md
multimodal MobileVLM.md gemma3.md glmedge.md granitevision.md llava.md minicpmo2.6.md minicpmo4.0.md minicpmv2.5.md minicpmv2.6.md minicpmv4.0.md minicpmv4.5.md
ops BLAS.csv CANN.csv CPU.csv CUDA.csv Metal.csv OpenCL.csv SYCL.csv Vulkan.csv WebGPU.csv ZenDNN.csv zDNN.csv
android.md build-riscv64-spacemit.md build-s390x.md build.md docker.md function-calling.md install.md llguidance.md multimodal.md ops.md preset.md speculative.md
examples
batched CMakeLists.txt README.md batched.cpp
batched.swift
Sources main.swift
.gitignore Makefile Package.swift README.md
convert-llama2c-to-ggml CMakeLists.txt README.md convert-llama2c-to-ggml.cpp
debug CMakeLists.txt README.md debug.cpp
deprecation-warning README.md deprecation-warning.cpp
diffusion CMakeLists.txt README.md diffusion-cli.cpp
embedding CMakeLists.txt README.md embedding.cpp
eval-callback CMakeLists.txt README.md eval-callback.cpp
gen-docs CMakeLists.txt gen-docs.cpp
gguf CMakeLists.txt gguf.cpp
gguf-hash
deps
rotate-bits package.json rotate-bits.h
sha1 package.json sha1.c sha1.h
sha256 package.json sha256.c sha256.h
xxhash clib.json xxhash.c xxhash.h
CMakeLists.txt README.md gguf-hash.cpp
idle CMakeLists.txt README.md idle.cpp
llama.android
app
src
main
java
com
example
llama MainActivity.kt MessageAdapter.kt
res
drawable bg_assistant_message.xml bg_user_message.xml ic_launcher_background.xml ic_launcher_foreground.xml outline_folder_open_24.xml outline_send_24.xml
layout activity_main.xml item_message_assistant.xml item_message_user.xml
mipmap-anydpi ic_launcher.xml ic_launcher_round.xml
mipmap-hdpi ic_launcher.webp ic_launcher_round.webp
mipmap-mdpi ic_launcher.webp ic_launcher_round.webp
mipmap-xhdpi ic_launcher.webp ic_launcher_round.webp
mipmap-xxhdpi ic_launcher.webp ic_launcher_round.webp
mipmap-xxxhdpi ic_launcher.webp ic_launcher_round.webp
values colors.xml strings.xml themes.xml
xml backup_rules.xml data_extraction_rules.xml
AndroidManifest.xml
.gitignore build.gradle.kts proguard-rules.pro
gradle
wrapper gradle-wrapper.jar gradle-wrapper.properties
libs.versions.toml
lib
src
androidTest
java
android
llama
cpp ExampleInstrumentedTest.kt
main
cpp CMakeLists.txt ai_chat.cpp logging.h
java
com
arm
aichat
gguf FileType.kt GgufMetadata.kt GgufMetadataReader.kt
internal
gguf GgufMetadataReaderImpl.kt
InferenceEngineImpl.kt
AiChat.kt InferenceEngine.kt
AndroidManifest.xml
test
java
android
llama
cpp ExampleUnitTest.kt
.gitignore build.gradle.kts consumer-rules.pro proguard-rules.pro
.gitignore build.gradle.kts gradle.properties gradlew settings.gradle.kts
llama.swiftui
llama.cpp.swift LibLlama.swift
llama.swiftui
Assets.xcassets
AppIcon.appiconset Contents.json
Contents.json
Models LlamaState.swift
Resources
models .gitignore
UI ContentView.swift DownloadButton.swift InputButton.swift LoadCustomButton.swift
llama_swiftuiApp.swift
llama.swiftui.xcodeproj
project.xcworkspace contents.xcworkspacedata
project.pbxproj
.gitignore README.md
lookahead CMakeLists.txt README.md lookahead.cpp
lookup CMakeLists.txt README.md lookup-create.cpp lookup-merge.cpp lookup-stats.cpp lookup.cpp
model-conversion
scripts
causal compare-embeddings-logits.sh compare-logits.py convert-model.sh modelcard.template run-casual-gen-embeddings-org.py run-converted-model-embeddings-logits.sh run-converted-model.sh run-org-model.py
embedding compare-embeddings-logits.sh convert-model.sh modelcard.template run-converted-model.sh run-original-model.py
utils __init__.py check-nmse.py common.py compare_tokens.py create-collection-add-model.sh curl-embedding-server.sh hf-add-model-to-collection.py hf-create-collection.py hf-create-model.py hf-upload-gguf-model.py inspect-converted-model.sh inspect-org-model.py perplexity-gen.sh perplexity-run-simple.sh perplexity-run.sh quantize.sh run-embedding-server.sh semantic_check.py tensor-info.py
.gitignore Makefile README.md requirements.txt
parallel CMakeLists.txt README.md parallel.cpp
passkey CMakeLists.txt README.md passkey.cpp
retrieval CMakeLists.txt README.md retrieval.cpp
save-load-state CMakeLists.txt save-load-state.cpp
simple CMakeLists.txt README.md simple.cpp
simple-chat CMakeLists.txt README.md simple-chat.cpp
simple-cmake-pkg .gitignore CMakeLists.txt README.md
speculative CMakeLists.txt README.md speculative.cpp
speculative-simple CMakeLists.txt README.md speculative-simple.cpp
sycl CMakeLists.txt README.md build.sh ls-sycl-device.cpp run-llama2.sh test.sh win-build-sycl.bat win-run-llama2.bat win-test.bat
training CMakeLists.txt README.md finetune.cpp
CMakeLists.txt convert_legacy_llama.py json_schema_pydantic_example.py json_schema_to_grammar.py llama.vim pydantic_models_to_grammar.py pydantic_models_to_grammar_examples.py reason-act.sh regex_to_grammar.py server-llama2-13B.sh server_embd.py ts-type-to-grammar.sh
ggml
cmake GitVars.cmake common.cmake ggml-config.cmake.in
include ggml-alloc.h ggml-backend.h ggml-blas.h ggml-cann.h ggml-cpp.h ggml-cpu.h ggml-cuda.h ggml-hexagon.h ggml-metal.h ggml-opencl.h ggml-opt.h ggml-rpc.h ggml-sycl.h ggml-virtgpu.h ggml-vulkan.h ggml-webgpu.h ggml-zdnn.h ggml-zendnn.h ggml.h gguf.h
src
ggml-blas CMakeLists.txt ggml-blas.cpp
ggml-cann CMakeLists.txt acl_tensor.cpp acl_tensor.h aclnn_ops.cpp aclnn_ops.h common.h ggml-cann.cpp
ggml-cpu
amx amx.cpp amx.h common.h mmq.cpp mmq.h
arch
arm cpu-feats.cpp quants.c repack.cpp
loongarch quants.c
powerpc cpu-feats.cpp quants.c
riscv cpu-feats.cpp quants.c repack.cpp
s390 cpu-feats.cpp quants.c
wasm quants.c
x86 cpu-feats.cpp quants.c repack.cpp
cmake FindSIMD.cmake
kleidiai kernels.cpp kernels.h kleidiai.cpp kleidiai.h
llamafile sgemm-ppc.h sgemm.cpp sgemm.h
spacemit ime.cpp ime.h ime1_kernels.cpp ime_kernels.h
CMakeLists.txt arch-fallback.h binary-ops.cpp binary-ops.h common.h ggml-cpu-impl.h ggml-cpu.c ggml-cpu.cpp hbm.cpp hbm.h ops.cpp ops.h quants.c quants.h repack.cpp repack.h simd-mappings.h traits.cpp traits.h unary-ops.cpp unary-ops.h vec.cpp vec.h
ggml-cuda
template-instances fattn-mma-f16-instance-ncols1_1-ncols2_16.cu fattn-mma-f16-instance-ncols1_1-ncols2_32.cu fattn-mma-f16-instance-ncols1_1-ncols2_8.cu fattn-mma-f16-instance-ncols1_16-ncols2_1.cu fattn-mma-f16-instance-ncols1_16-ncols2_2.cu fattn-mma-f16-instance-ncols1_16-ncols2_4.cu fattn-mma-f16-instance-ncols1_2-ncols2_16.cu fattn-mma-f16-instance-ncols1_2-ncols2_32.cu fattn-mma-f16-instance-ncols1_2-ncols2_4.cu fattn-mma-f16-instance-ncols1_2-ncols2_8.cu fattn-mma-f16-instance-ncols1_32-ncols2_1.cu fattn-mma-f16-instance-ncols1_32-ncols2_2.cu fattn-mma-f16-instance-ncols1_4-ncols2_16.cu fattn-mma-f16-instance-ncols1_4-ncols2_2.cu fattn-mma-f16-instance-ncols1_4-ncols2_4.cu fattn-mma-f16-instance-ncols1_4-ncols2_8.cu fattn-mma-f16-instance-ncols1_64-ncols2_1.cu fattn-mma-f16-instance-ncols1_8-ncols2_1.cu fattn-mma-f16-instance-ncols1_8-ncols2_2.cu fattn-mma-f16-instance-ncols1_8-ncols2_4.cu fattn-mma-f16-instance-ncols1_8-ncols2_8.cu fattn-tile-instance-dkq112-dv112.cu fattn-tile-instance-dkq128-dv128.cu fattn-tile-instance-dkq256-dv256.cu fattn-tile-instance-dkq40-dv40.cu fattn-tile-instance-dkq576-dv512.cu fattn-tile-instance-dkq64-dv64.cu fattn-tile-instance-dkq72-dv72.cu fattn-tile-instance-dkq80-dv80.cu fattn-tile-instance-dkq96-dv96.cu fattn-vec-instance-f16-f16.cu fattn-vec-instance-f16-q4_0.cu fattn-vec-instance-f16-q4_1.cu fattn-vec-instance-f16-q5_0.cu fattn-vec-instance-f16-q5_1.cu fattn-vec-instance-f16-q8_0.cu fattn-vec-instance-q4_0-f16.cu fattn-vec-instance-q4_0-q4_0.cu fattn-vec-instance-q4_0-q4_1.cu fattn-vec-instance-q4_0-q5_0.cu fattn-vec-instance-q4_0-q5_1.cu fattn-vec-instance-q4_0-q8_0.cu fattn-vec-instance-q4_1-f16.cu fattn-vec-instance-q4_1-q4_0.cu fattn-vec-instance-q4_1-q4_1.cu fattn-vec-instance-q4_1-q5_0.cu fattn-vec-instance-q4_1-q5_1.cu fattn-vec-instance-q4_1-q8_0.cu fattn-vec-instance-q5_0-f16.cu fattn-vec-instance-q5_0-q4_0.cu fattn-vec-instance-q5_0-q4_1.cu fattn-vec-instance-q5_0-q5_0.cu fattn-vec-instance-q5_0-q5_1.cu fattn-vec-instance-q5_0-q8_0.cu fattn-vec-instance-q5_1-f16.cu fattn-vec-instance-q5_1-q4_0.cu fattn-vec-instance-q5_1-q4_1.cu fattn-vec-instance-q5_1-q5_0.cu fattn-vec-instance-q5_1-q5_1.cu fattn-vec-instance-q5_1-q8_0.cu fattn-vec-instance-q8_0-f16.cu fattn-vec-instance-q8_0-q4_0.cu fattn-vec-instance-q8_0-q4_1.cu fattn-vec-instance-q8_0-q5_0.cu fattn-vec-instance-q8_0-q5_1.cu fattn-vec-instance-q8_0-q8_0.cu generate_cu_files.py mmf-instance-ncols_1.cu mmf-instance-ncols_10.cu mmf-instance-ncols_11.cu mmf-instance-ncols_12.cu mmf-instance-ncols_13.cu mmf-instance-ncols_14.cu mmf-instance-ncols_15.cu mmf-instance-ncols_16.cu mmf-instance-ncols_2.cu mmf-instance-ncols_3.cu mmf-instance-ncols_4.cu mmf-instance-ncols_5.cu mmf-instance-ncols_6.cu mmf-instance-ncols_7.cu mmf-instance-ncols_8.cu mmf-instance-ncols_9.cu mmq-instance-iq1_s.cu mmq-instance-iq2_s.cu mmq-instance-iq2_xs.cu mmq-instance-iq2_xxs.cu mmq-instance-iq3_s.cu mmq-instance-iq3_xxs.cu mmq-instance-iq4_nl.cu mmq-instance-iq4_xs.cu mmq-instance-mxfp4.cu mmq-instance-q2_k.cu mmq-instance-q3_k.cu mmq-instance-q4_0.cu mmq-instance-q4_1.cu mmq-instance-q4_k.cu mmq-instance-q5_0.cu mmq-instance-q5_1.cu mmq-instance-q5_k.cu mmq-instance-q6_k.cu mmq-instance-q8_0.cu
vendors cuda.h hip.h musa.h
CMakeLists.txt acc.cu acc.cuh add-id.cu add-id.cuh arange.cu arange.cuh argmax.cu argmax.cuh argsort.cu argsort.cuh binbcast.cu binbcast.cuh clamp.cu clamp.cuh common.cuh concat.cu concat.cuh conv-transpose-1d.cu conv-transpose-1d.cuh conv2d-dw.cu conv2d-dw.cuh conv2d-transpose.cu conv2d-transpose.cuh conv2d.cu conv2d.cuh convert.cu convert.cuh count-equal.cu count-equal.cuh cp-async.cuh cpy-utils.cuh cpy.cu cpy.cuh cross-entropy-loss.cu cross-entropy-loss.cuh cumsum.cu cumsum.cuh dequantize.cuh diag.cu diag.cuh diagmask.cu diagmask.cuh fattn-common.cuh fattn-mma-f16.cuh fattn-tile.cu fattn-tile.cuh fattn-vec.cuh fattn-wmma-f16.cu fattn-wmma-f16.cuh fattn.cu fattn.cuh fill.cu fill.cuh getrows.cu getrows.cuh ggml-cuda.cu gla.cu gla.cuh im2col.cu im2col.cuh mean.cu mean.cuh mma.cuh mmf.cu mmf.cuh mmid.cu mmid.cuh mmq.cu mmq.cuh mmvf.cu mmvf.cuh mmvq.cu mmvq.cuh norm.cu norm.cuh opt-step-adamw.cu opt-step-adamw.cuh opt-step-sgd.cu opt-step-sgd.cuh out-prod.cu out-prod.cuh pad.cu pad.cuh pad_reflect_1d.cu pad_reflect_1d.cuh pool2d.cu pool2d.cuh quantize.cu quantize.cuh reduce_rows.cuh roll.cu roll.cuh rope.cu rope.cuh scale.cu scale.cuh set-rows.cu set-rows.cuh set.cu set.cuh softcap.cu softcap.cuh softmax.cu softmax.cuh solve_tri.cu solve_tri.cuh ssm-conv.cu ssm-conv.cuh ssm-scan.cu ssm-scan.cuh sum.cu sum.cuh sumrows.cu sumrows.cuh top-k.cu top-k.cuh topk-moe.cu topk-moe.cuh tri.cu tri.cuh tsembd.cu tsembd.cuh unary.cu unary.cuh upscale.cu upscale.cuh vecdotq.cuh wkv.cu wkv.cuh
ggml-hexagon
htp CMakeLists.txt act-ops.c argsort-ops.c binary-ops.c cmake-toolchain.cmake cpy-ops.c flash-attn-ops.c get-rows-ops.c hex-dma.c hex-dma.h hex-dump.h hex-fastdiv.h hex-utils.h htp-ctx.h htp-msg.h htp-ops.h htp_iface.idl hvx-arith.h hvx-base.h hvx-copy.h hvx-div.h hvx-dump.h hvx-exp.h hvx-floor.h hvx-inverse.h hvx-reduce.h hvx-scale.h hvx-sigmoid.h hvx-sqrt.h hvx-types.h hvx-utils.h main.c matmul-ops.c rope-ops.c set-rows-ops.c softmax-ops.c sum-rows-ops.c unary-ops.c worker-pool.c worker-pool.h
CMakeLists.txt ggml-hexagon.cpp htp-drv.cpp htp-drv.h libdl.h libggml-htp.inf op-desc.h
ggml-hip CMakeLists.txt
ggml-metal CMakeLists.txt ggml-metal-common.cpp ggml-metal-common.h ggml-metal-context.h ggml-metal-context.m ggml-metal-device.cpp ggml-metal-device.h ggml-metal-device.m ggml-metal-impl.h ggml-metal-ops.cpp ggml-metal-ops.h ggml-metal.cpp ggml-metal.metal
ggml-musa CMakeLists.txt mudnn.cu mudnn.cuh
ggml-opencl
kernels add.cl add_id.cl argsort.cl clamp.cl concat.cl conv2d.cl conv2d_f16_f32.cl cpy.cl cvt.cl diag_mask_inf.cl div.cl embed_kernel.py expm1.cl fill.cl flash_attn_f16.cl flash_attn_f32.cl flash_attn_f32_f16.cl gelu.cl gemm_moe_mxfp4_f32.cl gemv_moe_mxfp4_f32.cl gemv_noshuffle.cl gemv_noshuffle_general.cl gemv_noshuffle_general_q8_0_f32.cl get_rows.cl glu.cl group_norm.cl im2col_f16.cl im2col_f32.cl mean.cl mul.cl mul_mat_Ab_Bi_8x4.cl mul_mat_f16_f32.cl mul_mm_f16_f32_kq_kqv.cl mul_mm_f16_f32_l4_lm.cl mul_mm_f32_f32_l4_lm.cl mul_mm_q6_k_f32_l4_lm.cl mul_mm_q8_0_f32_8x4.cl mul_mm_q8_0_f32_l4_lm.cl mul_mv_f16_f16.cl mul_mv_f16_f32.cl mul_mv_f16_f32_1row.cl mul_mv_f16_f32_l4.cl mul_mv_f32_f32.cl mul_mv_id_mxfp4_f32.cl mul_mv_id_mxfp4_f32_flat.cl mul_mv_id_q4_0_f32_8x_flat.cl mul_mv_id_q8_0_f32.cl mul_mv_id_q8_0_f32_flat.cl mul_mv_mxfp4_f32.cl mul_mv_mxfp4_f32_flat.cl mul_mv_q4_0_f32.cl mul_mv_q4_0_f32_1d_16x_flat.cl mul_mv_q4_0_f32_1d_8x_flat.cl mul_mv_q4_0_f32_8x_flat.cl mul_mv_q4_0_f32_v.cl mul_mv_q4_k_f32.cl mul_mv_q6_k_f32.cl mul_mv_q6_k_f32_flat.cl mul_mv_q8_0_f32.cl mul_mv_q8_0_f32_flat.cl norm.cl pad.cl relu.cl repeat.cl rms_norm.cl rope.cl scale.cl set_rows.cl sigmoid.cl silu.cl softmax_4_f16.cl softmax_4_f32.cl softmax_f16.cl softmax_f32.cl softplus.cl solve_tri.cl sqr.cl sqrt.cl ssm_conv.cl sub.cl sum_rows.cl tanh.cl transpose.cl tri.cl tsembd.cl upscale.cl
CMakeLists.txt ggml-opencl.cpp
ggml-rpc CMakeLists.txt ggml-rpc.cpp
ggml-sycl
dpct helper.hpp
CMakeLists.txt add-id.cpp add-id.hpp backend.hpp binbcast.cpp binbcast.hpp common.cpp common.hpp concat.cpp concat.hpp conv.cpp conv.hpp convert.cpp convert.hpp count-equal.cpp count-equal.hpp cpy.cpp cpy.hpp dequantize.hpp dmmv.cpp dmmv.hpp element_wise.cpp element_wise.hpp gemm.hpp getrows.cpp getrows.hpp ggml-sycl.cpp gla.cpp gla.hpp im2col.cpp im2col.hpp mmq.cpp mmq.hpp mmvq.cpp mmvq.hpp norm.cpp norm.hpp outprod.cpp outprod.hpp pad.cpp pad.hpp pad_reflect_1d.cpp pad_reflect_1d.hpp presets.hpp quantize.hpp quants.hpp repeat_back.cpp repeat_back.hpp roll.cpp roll.hpp rope.cpp rope.hpp set.cpp set.hpp set_rows.cpp set_rows.hpp softmax.cpp softmax.hpp ssm_conv.cpp ssm_conv.hpp sycl_hw.cpp sycl_hw.hpp tsembd.cpp tsembd.hpp vecdotq.hpp wkv.cpp wkv.hpp
ggml-virtgpu
backend
shared api_remoting.h apir_backend.gen.h apir_backend.h apir_cs.h apir_cs_ggml.h apir_cs_rpc.h
CMakeLists.txt apir_cs_ggml-rpc-back.cpp backend-convert.h backend-dispatched-backend.cpp backend-dispatched-buffer-type.cpp backend-dispatched-buffer.cpp backend-dispatched-device.cpp backend-dispatched.cpp backend-dispatched.gen.h backend-dispatched.h backend-virgl-apir.h backend.cpp
include apir_hw.h
CMakeLists.txt apir_cs_ggml-rpc-front.cpp ggml-backend-buffer-type.cpp ggml-backend-buffer.cpp ggml-backend-device.cpp ggml-backend-reg.cpp ggml-backend.cpp ggml-remoting.h ggmlremoting_functions.yaml regenerate_remoting.py virtgpu-apir.h virtgpu-forward-backend.cpp virtgpu-forward-buffer-type.cpp virtgpu-forward-buffer.cpp virtgpu-forward-device.cpp virtgpu-forward-impl.h virtgpu-forward.gen.h virtgpu-shm.cpp virtgpu-shm.h virtgpu-utils.cpp virtgpu-utils.h virtgpu.cpp virtgpu.h
ggml-vulkan
cmake host-toolchain.cmake.in
vulkan-shaders
feature-tests bfloat16.comp coopmat.comp coopmat2.comp integer_dot.comp
CMakeLists.txt abs.comp acc.comp add.comp add1.comp add_id.comp arange.comp argmax.comp argsort.comp argsort_large.comp ceil.comp clamp.comp concat.comp contig_copy.comp conv2d_dw.comp conv2d_mm.comp conv_transpose_1d.comp copy.comp copy_from_quant.comp copy_to_quant.comp copy_transpose.comp cos.comp count_equal.comp count_experts.comp cumsum.comp cumsum_multipass1.comp cumsum_multipass2.comp dequant_f32.comp dequant_funcs.glsl dequant_funcs_cm2.glsl dequant_head.glsl dequant_iq1_m.comp dequant_iq1_s.comp dequant_iq2_s.comp dequant_iq2_xs.comp dequant_iq2_xxs.comp dequant_iq3_s.comp dequant_iq3_xxs.comp dequant_iq4_nl.comp dequant_iq4_xs.comp dequant_mxfp4.comp dequant_q2_k.comp dequant_q3_k.comp dequant_q4_0.comp dequant_q4_1.comp dequant_q4_k.comp dequant_q5_0.comp dequant_q5_1.comp dequant_q5_k.comp dequant_q6_k.comp dequant_q8_0.comp diag.comp diag_mask_inf.comp div.comp exp.comp fill.comp flash_attn.comp flash_attn_base.glsl flash_attn_cm1.comp flash_attn_cm2.comp flash_attn_mask_opt.comp flash_attn_split_k_reduce.comp floor.comp geglu.comp geglu_erf.comp geglu_quick.comp gelu.comp gelu_erf.comp gelu_quick.comp generic_binary_head.glsl generic_head.glsl generic_unary_head.glsl get_rows.comp get_rows_quant.comp glu_head.glsl glu_main.glsl group_norm.comp hardsigmoid.comp hardswish.comp im2col.comp im2col_3d.comp l2_norm.comp leaky_relu.comp log.comp mul.comp mul_mat_split_k_reduce.comp mul_mat_vec.comp mul_mat_vec_base.glsl mul_mat_vec_iface.glsl mul_mat_vec_iq1_m.comp mul_mat_vec_iq1_s.comp mul_mat_vec_iq2_s.comp mul_mat_vec_iq2_xs.comp mul_mat_vec_iq2_xxs.comp mul_mat_vec_iq3_s.comp mul_mat_vec_iq3_xxs.comp mul_mat_vec_nc.comp mul_mat_vec_p021.comp mul_mat_vec_q2_k.comp mul_mat_vec_q3_k.comp mul_mat_vec_q4_k.comp mul_mat_vec_q5_k.comp mul_mat_vec_q6_k.comp mul_mat_vecq.comp mul_mat_vecq_funcs.glsl mul_mm.comp mul_mm_cm2.comp mul_mm_funcs.glsl mul_mm_id_funcs.glsl mul_mmq.comp mul_mmq_funcs.glsl mul_mmq_shmem_types.glsl multi_add.comp neg.comp norm.comp opt_step_adamw.comp opt_step_sgd.comp pad.comp pool2d.comp quantize_q8_1.comp reglu.comp relu.comp repeat.comp repeat_back.comp rms_norm.comp rms_norm_back.comp rms_norm_partials.comp roll.comp rope_funcs.glsl rope_head.glsl rope_multi.comp rope_neox.comp rope_norm.comp rope_params.glsl rope_vision.comp round.comp rte.glsl scale.comp sigmoid.comp silu.comp silu_back.comp sin.comp soft_max.comp soft_max_back.comp soft_max_large1.comp soft_max_large2.comp soft_max_large3.comp soft_max_large_common.glsl softplus.comp solve_tri.comp sqrt.comp square.comp ssm_conv.comp ssm_scan.comp step.comp sub.comp sum_rows.comp sum_rows.glsl swiglu.comp swiglu_oai.comp tanh.comp timestep_embedding.comp topk_argsort.comp topk_moe.comp topk_nary_search.comp tri.comp trunc.comp types.glsl upscale.comp utils.glsl vulkan-shaders-gen.cpp wkv6.comp wkv7.comp xielu.comp
CMakeLists.txt ggml-vulkan.cpp
ggml-webgpu
wgsl-shaders argmax.wgsl argsort.wgsl argsort_merge.wgsl binary.wgsl common_decls.tmpl cpy.tmpl.wgsl cumsum.wgsl embed_wgsl.py flash_attn.wgsl get_rows.tmpl.wgsl glu.tmpl.wgsl memset.wgsl mul_mat.tmpl.wgsl mul_mat_decls.tmpl mul_mat_reg_tile.tmpl.wgsl mul_mat_subgroup_matrix.tmpl.wgsl mul_mat_vec.tmpl.wgsl pad.wgsl rms_norm.wgsl rope.tmpl.wgsl scale.tmpl.wgsl set_rows.wgsl soft_max.tmpl.wgsl sum_rows.wgsl unary.wgsl
CMakeLists.txt ggml-webgpu-shader-lib.hpp ggml-webgpu.cpp pre_wgsl.hpp
ggml-zdnn .gitignore CMakeLists.txt common.hpp ggml-zdnn.cpp mmf.cpp mmf.hpp utils.cpp utils.hpp
ggml-zendnn CMakeLists.txt ggml-zendnn.cpp
CMakeLists.txt ggml-alloc.c ggml-backend-dl.cpp ggml-backend-dl.h ggml-backend-impl.h ggml-backend-reg.cpp ggml-backend.cpp ggml-common.h ggml-impl.h ggml-opt.cpp ggml-quants.c ggml-quants.h ggml-threading.cpp ggml-threading.h ggml.c ggml.cpp gguf.cpp
.gitignore CMakeLists.txt
gguf-py
examples reader.py writer.py
gguf
scripts gguf_convert_endian.py gguf_dump.py gguf_editor_gui.py gguf_hash.py gguf_new_metadata.py gguf_set_metadata.py
__init__.py constants.py gguf.py gguf_reader.py gguf_writer.py lazy.py metadata.py py.typed quants.py tensor_mapping.py utility.py vocab.py
tests __init__.py test_metadata.py test_quants.py
LICENSE README.md pyproject.toml
grammars README.md arithmetic.gbnf c.gbnf chess.gbnf english.gbnf japanese.gbnf json.gbnf json_arr.gbnf list.gbnf
include llama-cpp.h llama.h
licenses LICENSE-jsonhpp
media llama0-banner.png llama0-logo.png llama1-banner.png llama1-icon-transparent.png llama1-icon-transparent.svg llama1-icon.png llama1-icon.svg llama1-logo.png llama1-logo.svg matmul.png matmul.svg
models
templates Apertus-8B-Instruct.jinja ByteDance-Seed-OSS.jinja CohereForAI-c4ai-command-r-plus-tool_use.jinja CohereForAI-c4ai-command-r7b-12-2024-tool_use.jinja GLM-4.6.jinja Kimi-K2-Instruct.jinja Kimi-K2-Thinking.jinja MiMo-VL.jinja MiniMax-M2.jinja Mistral-Small-3.2-24B-Instruct-2506.jinja NVIDIA-Nemotron-3-Nano-30B-A3B-BF16.jinja NVIDIA-Nemotron-Nano-v2.jinja NousResearch-Hermes-2-Pro-Llama-3-8B-tool_use.jinja NousResearch-Hermes-3-Llama-3.1-8B-tool_use.jinja Qwen-QwQ-32B.jinja Qwen-Qwen2.5-7B-Instruct.jinja Qwen-Qwen3-0.6B.jinja Qwen3-Coder.jinja README.md deepseek-ai-DeepSeek-R1-Distill-Llama-8B.jinja deepseek-ai-DeepSeek-R1-Distill-Qwen-32B.jinja deepseek-ai-DeepSeek-V3.1.jinja fireworks-ai-llama-3-firefunction-v2.jinja google-gemma-2-2b-it.jinja ibm-granite-granite-3.3-2B-Instruct.jinja llama-cpp-deepseek-r1.jinja llama-cpp-lfm2.jinja llama-cpp-rwkv-world.jinja meetkai-functionary-medium-v3.1.jinja meetkai-functionary-medium-v3.2.jinja meta-llama-Llama-3.1-8B-Instruct.jinja meta-llama-Llama-3.2-3B-Instruct.jinja meta-llama-Llama-3.3-70B-Instruct.jinja microsoft-Phi-3.5-mini-instruct.jinja mistralai-Ministral-3-14B-Reasoning-2512.jinja mistralai-Mistral-Nemo-Instruct-2407.jinja moonshotai-Kimi-K2.jinja openai-gpt-oss-120b.jinja unsloth-Apriel-1.5.jinja unsloth-mistral-Devstral-Small-2507.jinja upstage-Solar-Open-100B.jinja
.editorconfig ggml-vocab-aquila.gguf ggml-vocab-baichuan.gguf ggml-vocab-bert-bge.gguf ggml-vocab-bert-bge.gguf.inp ggml-vocab-bert-bge.gguf.out ggml-vocab-command-r.gguf ggml-vocab-command-r.gguf.inp ggml-vocab-command-r.gguf.out ggml-vocab-deepseek-coder.gguf ggml-vocab-deepseek-coder.gguf.inp ggml-vocab-deepseek-coder.gguf.out ggml-vocab-deepseek-llm.gguf ggml-vocab-deepseek-llm.gguf.inp ggml-vocab-deepseek-llm.gguf.out ggml-vocab-falcon.gguf ggml-vocab-falcon.gguf.inp ggml-vocab-falcon.gguf.out ggml-vocab-gpt-2.gguf ggml-vocab-gpt-2.gguf.inp ggml-vocab-gpt-2.gguf.out ggml-vocab-gpt-neox.gguf ggml-vocab-llama-bpe.gguf ggml-vocab-llama-bpe.gguf.inp ggml-vocab-llama-bpe.gguf.out ggml-vocab-llama-spm.gguf ggml-vocab-llama-spm.gguf.inp ggml-vocab-llama-spm.gguf.out ggml-vocab-mpt.gguf ggml-vocab-mpt.gguf.inp ggml-vocab-mpt.gguf.out ggml-vocab-nomic-bert-moe.gguf ggml-vocab-phi-3.gguf ggml-vocab-phi-3.gguf.inp ggml-vocab-phi-3.gguf.out ggml-vocab-qwen2.gguf ggml-vocab-qwen2.gguf.inp ggml-vocab-qwen2.gguf.out ggml-vocab-refact.gguf ggml-vocab-refact.gguf.inp ggml-vocab-refact.gguf.out ggml-vocab-starcoder.gguf ggml-vocab-starcoder.gguf.inp ggml-vocab-starcoder.gguf.out
pocs
vdot CMakeLists.txt q8dot.cpp vdot.cpp
CMakeLists.txt
requirements requirements-all.txt requirements-compare-llama-bench.txt requirements-convert_hf_to_gguf.txt requirements-convert_hf_to_gguf_update.txt requirements-convert_legacy_llama.txt requirements-convert_llama_ggml_to_gguf.txt requirements-convert_lora_to_gguf.txt requirements-gguf_editor_gui.txt requirements-pydantic.txt requirements-server-bench.txt requirements-test-tokenizer-random.txt requirements-tool_bench.txt
scripts
apple validate-apps.sh validate-ios.sh validate-macos.sh validate-tvos.sh validate-visionos.sh
jinja jinja-tester.py requirements.txt
snapdragon
adb llama-cli.farf run-bench.sh run-cli.sh run-completion.sh run-mtmd.sh run-tool.sh
qdc
tests test_bench.py
readme.md requirements.txt
windows run-bench.ps1 run-cli.ps1 run-tool.ps1 setup-build.ps1
bench-models.sh build-info.sh check-requirements.sh compare-commits.sh compare-llama-bench.py compare-logprobs.py create_ops_docs.py debug-test.sh fetch_server_test_models.py gen-authors.sh gen-unicode-data.py get-flags.mk get-hellaswag.sh get-pg.sh get-wikitext-103.sh get-wikitext-2.sh get-winogrande.sh get_chat_template.py hf.sh install-oneapi.bat pr2wt.sh serve-static.js server-bench.py sync-ggml-am.sh sync-ggml.last sync-ggml.sh sync_vendor.py tool_bench.py tool_bench.sh verify-checksum-models.py xxd.cmake
src
models afmoe.cpp apertus.cpp arcee.cpp arctic.cpp arwkv7.cpp baichuan.cpp bailingmoe.cpp bailingmoe2.cpp bert.cpp bitnet.cpp bloom.cpp chameleon.cpp chatglm.cpp codeshell.cpp cogvlm.cpp cohere2-iswa.cpp command-r.cpp dbrx.cpp deci.cpp deepseek.cpp deepseek2.cpp dots1.cpp dream.cpp ernie4-5-moe.cpp ernie4-5.cpp exaone-moe.cpp exaone.cpp exaone4.cpp falcon-h1.cpp falcon.cpp gemma-embedding.cpp gemma.cpp gemma2-iswa.cpp gemma3.cpp gemma3n-iswa.cpp glm4-moe.cpp glm4.cpp gpt2.cpp gptneox.cpp granite-hybrid.cpp granite.cpp graph-context-mamba.cpp grok.cpp grovemoe.cpp hunyuan-dense.cpp hunyuan-moe.cpp internlm2.cpp jais.cpp jamba.cpp kimi-linear.cpp lfm2.cpp llada-moe.cpp llada.cpp llama-iswa.cpp llama.cpp maincoder.cpp mamba.cpp mimo2-iswa.cpp minicpm3.cpp minimax-m2.cpp mistral3.cpp models.h modern-bert.cpp mpt.cpp nemotron-h.cpp nemotron.cpp neo-bert.cpp olmo.cpp olmo2.cpp olmoe.cpp openai-moe-iswa.cpp openelm.cpp orion.cpp pangu-embedded.cpp phi2.cpp phi3.cpp plamo.cpp plamo2.cpp plamo3.cpp plm.cpp qwen.cpp qwen2.cpp qwen2moe.cpp qwen2vl.cpp qwen3.cpp qwen35.cpp qwen35moe.cpp qwen3moe.cpp qwen3next.cpp qwen3vl-moe.cpp qwen3vl.cpp refact.cpp rnd1.cpp rwkv6-base.cpp rwkv6.cpp rwkv6qwen2.cpp rwkv7-base.cpp rwkv7.cpp seed-oss.cpp smallthinker.cpp smollm3.cpp stablelm.cpp starcoder.cpp starcoder2.cpp step35-iswa.cpp t5-dec.cpp t5-enc.cpp wavtokenizer-dec.cpp xverse.cpp
CMakeLists.txt llama-adapter.cpp llama-adapter.h llama-arch.cpp llama-arch.h llama-batch.cpp llama-batch.h llama-chat.cpp llama-chat.h llama-context.cpp llama-context.h llama-cparams.cpp llama-cparams.h llama-grammar.cpp llama-grammar.h llama-graph.cpp llama-graph.h llama-hparams.cpp llama-hparams.h llama-impl.cpp llama-impl.h llama-io.cpp llama-io.h llama-kv-cache-iswa.cpp llama-kv-cache-iswa.h llama-kv-cache.cpp llama-kv-cache.h llama-kv-cells.h llama-memory-hybrid-iswa.cpp llama-memory-hybrid-iswa.h llama-memory-hybrid.cpp llama-memory-hybrid.h llama-memory-recurrent.cpp llama-memory-recurrent.h llama-memory.cpp llama-memory.h llama-mmap.cpp llama-mmap.h llama-model-loader.cpp llama-model-loader.h llama-model-saver.cpp llama-model-saver.h llama-model.cpp llama-model.h llama-quant.cpp llama-quant.h llama-sampler.cpp llama-sampler.h llama-vocab.cpp llama-vocab.h llama.cpp unicode-data.cpp unicode-data.h unicode.cpp unicode.h
tests
peg-parser simple-tokenize.cpp simple-tokenize.h test-basic.cpp test-gbnf-generation.cpp test-json-parser.cpp test-json-serialization.cpp test-unicode.cpp tests.h
.gitignore CMakeLists.txt get-model.cpp get-model.h run-json-schema-to-grammar.mjs test-alloc.cpp test-arg-parser.cpp test-autorelease.cpp test-backend-ops.cpp test-backend-sampler.cpp test-barrier.cpp test-c.c test-chat-parser.cpp test-chat-peg-parser.cpp test-chat-template.cpp test-chat.cpp test-double-float.cpp test-gbnf-validator.cpp test-gguf.cpp test-grammar-integration.cpp test-grammar-llguidance.cpp test-grammar-parser.cpp test-jinja.cpp test-json-partial.cpp test-json-schema-to-grammar.cpp test-llama-grammar.cpp test-log.cpp test-lora-conversion-inference.sh test-model-load-cancel.cpp test-mtmd-c-api.c test-opt.cpp test-peg-parser.cpp test-quantize-fns.cpp test-quantize-perf.cpp test-quantize-stats.cpp test-regex-partial.cpp test-rope.cpp test-sampling.cpp test-state-restore-fragmented.cpp test-thread-safety.cpp test-tokenizer-0.cpp test-tokenizer-0.py test-tokenizer-0.sh test-tokenizer-1-bpe.cpp test-tokenizer-1-spm.cpp test-tokenizer-random.py test-tokenizers-repo.sh testing.h
tools
batched-bench CMakeLists.txt README.md batched-bench.cpp
cli CMakeLists.txt README.md cli.cpp
completion CMakeLists.txt README.md completion.cpp
cvector-generator CMakeLists.txt README.md completions.txt cvector-generator.cpp mean.hpp negative.txt pca.hpp positive.txt
export-lora CMakeLists.txt README.md export-lora.cpp
fit-params CMakeLists.txt README.md fit-params.cpp
gguf-split CMakeLists.txt README.md gguf-split.cpp tests.sh
imatrix CMakeLists.txt README.md imatrix.cpp
llama-bench CMakeLists.txt README.md llama-bench.cpp
mtmd
legacy-models convert_image_encoder_to_gguf.py glmedge-convert-image-encoder-to-gguf.py glmedge-surgery.py llava_surgery.py llava_surgery_v2.py minicpmv-convert-image-encoder-to-gguf.py minicpmv-surgery.py
models cogvlm.cpp conformer.cpp glm4v.cpp internvl.cpp kimik25.cpp kimivl.cpp llama4.cpp llava.cpp minicpmv.cpp mobilenetv5.cpp models.h pixtral.cpp qwen2vl.cpp qwen3vl.cpp siglip.cpp whisper-enc.cpp youtuvl.cpp
CMakeLists.txt README.md clip-graph.h clip-impl.h clip-model.h clip.cpp clip.h deprecation-warning.cpp mtmd-audio.cpp mtmd-audio.h mtmd-cli.cpp mtmd-helper.cpp mtmd-helper.h mtmd.cpp mtmd.h requirements.txt test-1.jpeg test-2.mp3 tests.sh
perplexity CMakeLists.txt README.md perplexity.cpp
quantize CMakeLists.txt README.md quantize.cpp tests.sh
rpc CMakeLists.txt README.md rpc-server.cpp
server
bench README.md bench.py prometheus.yml requirements.txt script.js
public index.html.gz loading.html
public_legacy colorthemes.css completion.js favicon.ico index-new.html index.html index.js json-schema-to-grammar.mjs loading.html prompt-formats.js style.css system-prompts.js theme-beeninorder.css theme-ketivah.css theme-mangotango.css theme-playground.css theme-polarnight.css theme-snowstorm.css
public_simplechat datautils.mjs index.html readme.md simplechat.css simplechat.js simplechat_screens.webp ui.mjs
tests
unit test_basic.py test_chat_completion.py test_compat_anthropic.py test_compat_oai_responses.py test_completion.py test_ctx_shift.py test_embedding.py test_infill.py test_lora.py test_rerank.py test_router.py test_security.py test_sleep.py test_slot_save.py test_speculative.py test_template.py test_tokenize.py test_tool_call.py test_vision_api.py
.gitignore README.md conftest.py pytest.ini requirements.txt tests.sh utils.py
themes
buttons-top README.md buttons_top.png favicon.ico index.html
wild README.md favicon.ico index.html llama_cpp.png llamapattern.png wild.png
README.md
webui
.storybook ModeWatcherDecorator.svelte TooltipProviderDecorator.svelte main.ts preview.ts vitest.setup.ts
docs
architecture high-level-architecture-simplified.md high-level-architecture.md
flows chat-flow.md conversations-flow.md data-flow-simplified-model-mode.md data-flow-simplified-router-mode.md database-flow.md models-flow.md server-flow.md settings-flow.md
scripts dev.sh install-git-hooks.sh post-build.sh
src
lib
components
app
chat
ChatAttachments ChatAttachmentPreview.svelte ChatAttachmentThumbnailFile.svelte ChatAttachmentThumbnailImage.svelte ChatAttachmentsList.svelte ChatAttachmentsViewAll.svelte
ChatForm
ChatFormActions ChatFormActionFileAttachments.svelte ChatFormActionRecord.svelte ChatFormActionSubmit.svelte ChatFormActions.svelte
ChatForm.svelte ChatFormFileInputInvisible.svelte ChatFormHelperText.svelte ChatFormTextarea.svelte
ChatMessages ChatMessage.svelte ChatMessageActions.svelte ChatMessageAssistant.svelte ChatMessageBranchingControls.svelte ChatMessageEditForm.svelte ChatMessageStatistics.svelte ChatMessageSystem.svelte ChatMessageThinkingBlock.svelte ChatMessageUser.svelte ChatMessages.svelte
ChatScreen ChatScreen.svelte ChatScreenDragOverlay.svelte ChatScreenHeader.svelte ChatScreenProcessingInfo.svelte
ChatSettings ChatSettings.svelte ChatSettingsFields.svelte ChatSettingsFooter.svelte ChatSettingsImportExportTab.svelte ChatSettingsParameterSourceIndicator.svelte
ChatSidebar ChatSidebar.svelte ChatSidebarActions.svelte ChatSidebarConversationItem.svelte ChatSidebarSearch.svelte handle-mobile-sidebar-item-click.ts
dialogs DialogChatAttachmentPreview.svelte DialogChatAttachmentsViewAll.svelte DialogChatError.svelte DialogChatSettings.svelte DialogConfirmation.svelte DialogConversationSelection.svelte DialogConversationTitleUpdate.svelte DialogEmptyFileAlert.svelte DialogModelInformation.svelte DialogModelNotAvailable.svelte
misc ActionButton.svelte ActionDropdown.svelte BadgeChatStatistic.svelte BadgeInfo.svelte BadgeModality.svelte CodePreviewDialog.svelte ConversationSelection.svelte CopyToClipboardIcon.svelte KeyboardShortcutInfo.svelte MarkdownContent.svelte RemoveButton.svelte SearchInput.svelte SyntaxHighlightedCode.svelte
models ModelBadge.svelte ModelsSelector.svelte
server ServerErrorSplash.svelte ServerLoadingSplash.svelte ServerStatus.svelte
index.ts
ui
alert alert-description.svelte alert-title.svelte alert.svelte index.ts
alert-dialog alert-dialog-action.svelte alert-dialog-cancel.svelte alert-dialog-content.svelte alert-dialog-description.svelte alert-dialog-footer.svelte alert-dialog-header.svelte alert-dialog-overlay.svelte alert-dialog-title.svelte alert-dialog-trigger.svelte index.ts
badge badge.svelte index.ts
button button.svelte index.ts
card card-action.svelte card-content.svelte card-description.svelte card-footer.svelte card-header.svelte card-title.svelte card.svelte index.ts
checkbox checkbox.svelte index.ts
collapsible collapsible-content.svelte collapsible-trigger.svelte collapsible.svelte index.ts
dialog dialog-close.svelte dialog-content.svelte dialog-description.svelte dialog-footer.svelte dialog-header.svelte dialog-overlay.svelte dialog-title.svelte dialog-trigger.svelte index.ts
dropdown-menu dropdown-menu-checkbox-item.svelte dropdown-menu-content.svelte dropdown-menu-group-heading.svelte dropdown-menu-group.svelte dropdown-menu-item.svelte dropdown-menu-label.svelte dropdown-menu-radio-group.svelte dropdown-menu-radio-item.svelte dropdown-menu-separator.svelte dropdown-menu-shortcut.svelte dropdown-menu-sub-content.svelte dropdown-menu-sub-trigger.svelte dropdown-menu-trigger.svelte index.ts
input index.ts input.svelte
label index.ts label.svelte
popover index.ts popover-close.svelte popover-content.svelte popover-portal.svelte popover-trigger.svelte popover.svelte
scroll-area index.ts scroll-area-scrollbar.svelte scroll-area.svelte
select index.ts select-content.svelte select-group-heading.svelte select-group.svelte select-item.svelte select-label.svelte select-scroll-down-button.svelte select-scroll-up-button.svelte select-separator.svelte select-trigger.svelte
separator index.ts separator.svelte
sheet index.ts sheet-close.svelte sheet-content.svelte sheet-description.svelte sheet-footer.svelte sheet-header.svelte sheet-overlay.svelte sheet-title.svelte sheet-trigger.svelte
sidebar constants.ts context.svelte.ts index.ts sidebar-content.svelte sidebar-footer.svelte sidebar-group-action.svelte sidebar-group-content.svelte sidebar-group-label.svelte sidebar-group.svelte sidebar-header.svelte sidebar-input.svelte sidebar-inset.svelte sidebar-menu-action.svelte sidebar-menu-badge.svelte sidebar-menu-button.svelte sidebar-menu-item.svelte sidebar-menu-skeleton.svelte sidebar-menu-sub-button.svelte sidebar-menu-sub-item.svelte sidebar-menu-sub.svelte sidebar-menu.svelte sidebar-provider.svelte sidebar-rail.svelte sidebar-separator.svelte sidebar-trigger.svelte sidebar.svelte
skeleton index.ts skeleton.svelte
switch index.ts switch.svelte
table index.ts table-body.svelte table-caption.svelte table-cell.svelte table-footer.svelte table-head.svelte table-header.svelte table-row.svelte table.svelte
textarea index.ts textarea.svelte
tooltip index.ts tooltip-content.svelte tooltip-trigger.svelte
utils.ts
constants auto-scroll.ts binary-detection.ts default-context.ts floating-ui-constraints.ts icons.ts input-classes.ts latex-protection.ts literal-html.ts localstorage-keys.ts max-bundle-size.ts precision.ts processing-info.ts settings-config.ts supported-file-types.ts table-html-restorer.ts tooltip-config.ts viewport.ts
enums attachment.ts chat.ts files.ts index.ts model.ts server.ts
hooks is-mobile.svelte.ts use-model-change-validation.svelte.ts use-processing-state.svelte.ts
markdown enhance-code-blocks.ts enhance-links.ts literal-html.ts table-html-restorer.ts
services chat.ts database.ts index.ts models.ts parameter-sync.spec.ts parameter-sync.ts props.ts
stores chat.svelte.ts conversations.svelte.ts models.svelte.ts persisted.svelte.ts server.svelte.ts settings.svelte.ts
types api.d.ts chat.d.ts database.d.ts index.ts models.d.ts settings.d.ts
utils api-headers.ts api-key-validation.ts attachment-display.ts attachment-type.ts audio-recording.ts autoresize-textarea.ts branching.ts browser-only.ts clipboard.ts config-helpers.ts conversation-utils.ts convert-files-to-extra.ts file-preview.ts file-type.ts formatters.ts index.ts is-ime-composing.ts latex-protection.ts modality-file-validation.ts model-names.ts pdf-processing.ts portal-to-body.ts precision.ts process-uploaded-files.ts svg-to-png.ts syntax-highlight-language.ts text-files.ts text.ts webp-to-png.ts
routes
chat
[id] +page.svelte +page.ts
+error.svelte +layout.svelte +page.svelte +page.ts
styles katex-custom.scss
app.css app.d.ts app.html
static favicon.svg loading.html
tests
client
components TestWrapper.svelte
page.svelte.test.ts
e2e demo.test.ts
stories
fixtures
assets 1.jpg beautiful-flowers-lotus.webp example.pdf hf-logo.svg
ai-tutorial.ts api-docs.ts blog-post.ts data-analysis.ts empty.ts math-formulas.ts readme.ts storybook-mocks.ts
ChatForm.stories.svelte ChatMessage.stories.svelte ChatSettings.stories.svelte ChatSidebar.stories.svelte Introduction.mdx MarkdownContent.stories.svelte
unit clipboard.test.ts latex-protection.test.ts model-names.test.ts
.gitignore .npmrc .prettierignore .prettierrc README.md components.json eslint.config.js package-lock.json package.json playwright.config.ts svelte.config.js tsconfig.json vite.config.ts vitest-setup-client.ts
CMakeLists.txt README-dev.md README.md chat-llama2.sh chat.mjs chat.sh server-common.cpp server-common.h server-context.cpp server-context.h server-http.cpp server-http.h server-models.cpp server-models.h server-queue.cpp server-queue.h server-task.cpp server-task.h server.cpp
tokenize CMakeLists.txt tokenize.cpp
tts CMakeLists.txt README.md convert_pt_to_hf.py tts-outetts.py tts.cpp
CMakeLists.txt
vendor
cpp-httplib CMakeLists.txt LICENSE httplib.cpp httplib.h
miniaudio miniaudio.h
nlohmann json.hpp json_fwd.hpp
sheredom subprocess.h
stb stb_image.h
.clang-format .clang-tidy .dockerignore .ecrc .editorconfig .flake8 .gitignore .gitmodules .pre-commit-config.yaml AGENTS.md AUTHORS CLAUDE.md CMakeLists.txt CMakePresets.json CODEOWNERS CONTRIBUTING.md LICENSE Makefile README.md SECURITY.md convert_hf_to_gguf.py convert_hf_to_gguf_update.py convert_llama_ggml_to_gguf.py convert_lora_to_gguf.py flake.lock flake.nix mypy.ini poetry.lock pyproject.toml pyrightconfig.json requirements.txt
maps map1.h map1.txt
papers 2310.11703v2.pdf 2405.14159v2.pdf
prompts lotr.h lotr.txt
.gitignore Dockerfile Makefile README.md compile_flags.txt context.c game.c makext.mk mapeditor.html maps.h minunit.h models.h models.txt nonstd.h npc.c termbox2.h vectordb.c vectordb.h
llama.cpp/convert_hf_to_gguf.py raw
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

from __future__ import annotations

import ast
import logging
import argparse
import contextlib
import json
import os
import re
import sys
from enum import IntEnum
from pathlib import Path
from hashlib import sha256
from typing import TYPE_CHECKING, Any, Callable, ContextManager, Iterable, Iterator, Literal, Sequence, TypeVar, cast
from itertools import chain
from transformers import AutoConfig

import math
import numpy as np
import torch

if TYPE_CHECKING:
    from torch import Tensor

if 'NO_LOCAL_GGUF' not in os.environ:
    sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
import gguf
from gguf.vocab import MistralTokenizerType, MistralVocab

try:
    from mistral_common.tokens.tokenizers.base import TokenizerVersion # pyright: ignore[reportMissingImports]
    from mistral_common.tokens.tokenizers.multimodal import DATASET_MEAN as _MISTRAL_COMMON_DATASET_MEAN, DATASET_STD as _MISTRAL_COMMON_DATASET_STD # pyright: ignore[reportMissingImports]
    from mistral_common.tokens.tokenizers.tekken import Tekkenizer # pyright: ignore[reportMissingImports]
    from mistral_common.tokens.tokenizers.sentencepiece import ( # pyright: ignore[reportMissingImports]
        SentencePieceTokenizer,
    )

    _mistral_common_installed = True
    _mistral_import_error_msg = ""
except ImportError:
    _MISTRAL_COMMON_DATASET_MEAN = (0.48145466, 0.4578275, 0.40821073)
    _MISTRAL_COMMON_DATASET_STD = (0.26862954, 0.26130258, 0.27577711)

    _mistral_common_installed = False
    TokenizerVersion = None
    Tekkenizer = None
    SentencePieceTokenizer = None
    _mistral_import_error_msg = (
        "Mistral format requires `mistral-common` to be installed. Please run "
        "`pip install mistral-common[image,audio]` to install it."
    )
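    # Note: mistral_common is optional. When it is missing, the dataset mean/std
    # constants above are hard-coded fallbacks and the tokenizer classes are set
    # to None, so only conversions that actually use the Mistral format will fail
    # (see the ImportError raised from ModelBase.__init__).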


logger = logging.getLogger("hf-to-gguf")


###### MODEL DEFINITIONS ######

class SentencePieceTokenTypes(IntEnum):
    NORMAL = 1
    UNKNOWN = 2
    CONTROL = 3
    USER_DEFINED = 4
    UNUSED = 5
    BYTE = 6


class ModelType(IntEnum):
    TEXT = 1
    MMPROJ = 2


AnyModel = TypeVar("AnyModel", bound="type[ModelBase]")


class ModelBase:
    _model_classes: dict[ModelType, dict[str, type[ModelBase]]] = {
        ModelType.TEXT: {},
        ModelType.MMPROJ: {},
    }

    dir_model: Path
    ftype: gguf.LlamaFileType
    fname_out: Path
    is_big_endian: bool
    endianess: gguf.GGUFEndian
    use_temp_file: bool
    lazy: bool
    dry_run: bool
    hparams: dict[str, Any]
    model_tensors: dict[str, Callable[[], Tensor]]
    gguf_writer: gguf.GGUFWriter
    model_name: str | None
    metadata_override: Path | None
    dir_model_card: Path
    remote_hf_model_id: str | None

    # subclasses should define this!
    model_arch: gguf.MODEL_ARCH

    # subclasses should initialize this!
    block_count: int
    tensor_map: gguf.TensorNameMap

    # Mistral format specifics
    is_mistral_format: bool = False
    disable_mistral_community_chat_template: bool = False
    sentence_transformers_dense_modules: bool = False

    def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, *, is_big_endian: bool = False,
                 use_temp_file: bool = False, eager: bool = False,
                 metadata_override: Path | None = None, model_name: str | None = None,
                 split_max_tensors: int = 0, split_max_size: int = 0, dry_run: bool = False,
                 small_first_shard: bool = False, hparams: dict[str, Any] | None = None, remote_hf_model_id: str | None = None,
                 disable_mistral_community_chat_template: bool = False,
                 sentence_transformers_dense_modules: bool = False):
        if type(self) is ModelBase or \
                type(self) is TextModel or \
                type(self) is MmprojModel:
            raise TypeError(f"{type(self).__name__!r} should not be directly instantiated")

        if self.is_mistral_format and not _mistral_common_installed:
            raise ImportError(_mistral_import_error_msg)

        self.dir_model = dir_model
        self.ftype = ftype
        self.fname_out = fname_out
        self.is_big_endian = is_big_endian
        self.endianess = gguf.GGUFEndian.BIG if is_big_endian else gguf.GGUFEndian.LITTLE
        self.use_temp_file = use_temp_file
        self.lazy = not eager or (remote_hf_model_id is not None)
        self.dry_run = dry_run
        self.remote_hf_model_id = remote_hf_model_id
        self.sentence_transformers_dense_modules = sentence_transformers_dense_modules
        self.hparams = ModelBase.load_hparams(self.dir_model, self.is_mistral_format) if hparams is None else hparams
        self.model_tensors = self.index_tensors(remote_hf_model_id=remote_hf_model_id)
        self.metadata_override = metadata_override
        self.model_name = model_name
        self.dir_model_card = dir_model  # overridden in convert_lora_to_gguf.py

        # Apply heuristics to figure out typical tensor encoding based on first tensor's dtype
        # NOTE: can't use field "torch_dtype" in config.json, because some finetunes lie.
        if self.ftype == gguf.LlamaFileType.GUESSED:
            for _, tensor in self.get_tensors():
                if tensor.dim() < 2:
                    continue

                if tensor.dtype == torch.bfloat16:
                    self.ftype = gguf.LlamaFileType.MOSTLY_BF16
                    logger.info("heuristics detected bfloat16 tensor dtype, setting --outtype bf16")
                    break
                elif tensor.dtype == torch.float16:
                    self.ftype = gguf.LlamaFileType.MOSTLY_F16
                    logger.info("heuristics detected float16 tensor dtype, setting --outtype f16")
                    break
            else:
                self.ftype = gguf.LlamaFileType.MOSTLY_F16
                logger.info("heuristics unable to detect tensor dtype, defaulting to --outtype f16")

        # Configure GGUF Writer
        self.gguf_writer = gguf.GGUFWriter(path=None, arch=gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess, use_temp_file=self.use_temp_file,
                                           split_max_tensors=split_max_tensors, split_max_size=split_max_size, dry_run=dry_run, small_first_shard=small_first_shard)

        # Mistral specific
        self.disable_mistral_community_chat_template = disable_mistral_community_chat_template

    @classmethod
    def add_prefix_to_filename(cls, path: Path, prefix: str) -> Path:
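        # Hypothetical usage (illustration only, not from the original source):
        #   add_prefix_to_filename(Path("model.gguf"), "mmproj-") -> Path("mmproj-model.gguf")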
        stem, suffix = path.stem, path.suffix
        new_name = f"{prefix}{stem}{suffix}"
        return path.with_name(new_name)

    def find_hparam(self, keys: Iterable[str], optional: bool = False) -> Any:
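        # Return the value of the first hyperparameter name found in self.hparams.
        # For example (illustrative), find_hparam(["num_hidden_layers", "n_layer"])
        # accepts whichever spelling the checkpoint's config happens to use.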
        key = next((k for k in keys if k in self.hparams), None)
        if key is not None:
            return self.hparams[key]
        if optional:
            return None
        raise KeyError(f"could not find any of: {keys}")

    def index_tensors(self, remote_hf_model_id: str | None = None) -> dict[str, Callable[[], Tensor]]:
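        # Build a map of tensor name -> zero-argument loader without reading any
        # tensor data yet; each callable materializes (or lazily wraps) its tensor
        # only when invoked, so indexing stays cheap for multi-file checkpoints.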
        tensors: dict[str, Callable[[], Tensor]] = {}

        if remote_hf_model_id is not None:
            is_safetensors = True

            logger.info(f"Using remote model with HuggingFace id: {remote_hf_model_id}")
            remote_tensors = gguf.utility.SafetensorRemote.get_list_tensors_hf_model(remote_hf_model_id)
            for name, remote_tensor in remote_tensors.items():
                tensors[name] = lambda r=remote_tensor: LazyTorchTensor.from_remote_tensor(r)

            return tensors

        prefix = "model" if not self.is_mistral_format else "consolidated"
        part_names: list[str] = ModelBase.get_model_part_names(self.dir_model, prefix, ".safetensors")
        is_safetensors: bool = len(part_names) > 0
        if not is_safetensors:
            part_names = ModelBase.get_model_part_names(self.dir_model, "pytorch_model", ".bin")

        tensor_names_from_index: set[str] = set()

        if not self.is_mistral_format:
            index_name = "model.safetensors" if is_safetensors else "pytorch_model.bin"
            index_name += ".index.json"
            index_file = self.dir_model / index_name

            if index_file.is_file():
                logger.info(f"gguf: loading model weight map from '{index_name}'")
                with open(index_file, "r", encoding="utf-8") as f:
                    index: dict[str, Any] = json.load(f)
                    weight_map = index.get("weight_map")
                    if weight_map is None or not isinstance(weight_map, dict):
                        raise ValueError(f"Can't load 'weight_map' from {index_name!r}")
                    tensor_names_from_index.update(weight_map.keys())
                    part_dict: dict[str, None] = dict.fromkeys(weight_map.values(), None)
                    part_names = sorted(part_dict.keys())
            else:
                weight_map = {}
        else:
            weight_map = {}

        for part_name in part_names:
            logger.info(f"gguf: indexing model part '{part_name}'")
            ctx: ContextManager[Any]
            if is_safetensors:
                ctx = cast(ContextManager[Any], gguf.utility.SafetensorsLocal(self.dir_model / part_name))
            else:
                ctx = contextlib.nullcontext(torch.load(str(self.dir_model / part_name), map_location="cpu", mmap=True, weights_only=True))

            with ctx as model_part:
                assert model_part is not None

                for name in model_part.keys():
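                    # The lambdas below capture the current tensor through default
                    # arguments (e.g. `data=data`); without that, every closure
                    # would see the loop's final value due to Python's late binding.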
                    if is_safetensors:
                        data: gguf.utility.LocalTensor = model_part[name]
                        if self.lazy:
                            data_gen = lambda data=data: LazyTorchTensor.from_local_tensor(data)  # noqa: E731
                        else:
                            dtype = LazyTorchTensor._dtype_str_map[data.dtype]
                            data_gen = lambda data=data, dtype=dtype: torch.from_numpy(data.mmap_bytes()).view(dtype).reshape(data.shape)  # noqa: E731
                    else:
                        data_torch: Tensor = model_part[name]
                        if self.lazy:
                            data_gen = lambda data=data_torch: LazyTorchTensor.from_eager(data)  # noqa: E731
                        else:
                            data_gen = lambda data=data_torch: data  # noqa: E731
                    tensors[name] = data_gen

        # verify tensor name presence and identify potentially missing files
        if len(tensor_names_from_index) > 0:
            tensor_names_from_parts = set(tensors.keys())
            if len(tensor_names_from_parts.symmetric_difference(tensor_names_from_index)) > 0:
                missing = sorted(tensor_names_from_index.difference(tensor_names_from_parts))
                extra = sorted(tensor_names_from_parts.difference(tensor_names_from_index))
                missing_files = sorted(set(weight_map[n] for n in missing if n in weight_map))
                if len(extra) == 0 and len(missing_files) > 0:
                    raise ValueError(f"Missing or incomplete model files: {missing_files}\n"
                                     f"Missing tensors: {missing}")
                else:
                    raise ValueError("Mismatch between weight map and model parts for tensor names:\n"
                                     f"Missing tensors: {missing}\n"
                                     f"Extra tensors: {extra}")

        return tensors

    def dequant_model(self):
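        # When the checkpoint declares a `quantization_config`, wrap the affected
        # weights in lazy loaders that dequantize to float on access and mark the
        # auxiliary scale/zero-point tensors for removal.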
        tensors_to_remove: list[str] = []
        new_tensors: dict[str, Callable[[], Tensor]] = {}

        if (quant_config := self.hparams.get("quantization_config")) and isinstance(quant_config, dict):
            quant_method = quant_config.get("quant_method")

            def dequant_bitnet(weight: Tensor, scale: Tensor) -> Tensor:
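                # Each stored byte packs four 2-bit codes: shift by 0/2/4/6 bits,
                # mask with 3 and subtract 1 to recover ternary values in {-1, 0, 1},
                # so the unpacked tensor has 4x as many rows as the packed one.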
                weight = weight.view(torch.uint8)
                orig_shape = weight.shape

                shift = torch.tensor([0, 2, 4, 6], dtype=torch.uint8).reshape((4, *(1 for _ in range(len(orig_shape)))))
                data = weight.unsqueeze(0).expand((4, *orig_shape)) >> shift
                data = data & 3
                data = (data.float() - 1).reshape((orig_shape[0] * 4, *orig_shape[1:]))

                # The scale is inverted
                return data / scale.float()

            def dequant_simple(weight: Tensor, scale: Tensor, block_size: Sequence[int] | None = None) -> Tensor:
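                # Per-block scales (e.g. fp8 block quantization) are expanded to the
                # weight's shape with repeat_interleave along each blocked dimension,
                # then trimmed when the tensor is not an exact multiple of the block size.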
                scale = scale.float()

                if block_size is not None:
                    for i, size in enumerate(block_size):
                        scale = scale.repeat_interleave(size, i)
                    # unpad the scale (e.g. when the tensor size isn't a multiple of the block size)
                    scale = scale[tuple(slice(0, size) for size in weight.shape)]

                return weight.float() * scale

            # ref: https://github.com/ModelCloud/GPTQModel/blob/037c5c0f6c9e33c500d975b038d02e7ca437546d/gptqmodel/nn_modules/qlinear/__init__.py#L437-L476
            def dequant_gptq(g_idx: Tensor, qweight: Tensor, qzeros: Tensor, scales: Tensor) -> Tensor:
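                # Unpack bit-packed GPTQ weights and zero points (2/4/8 bits per value;
                # 3-bit packing is unsupported below), look up each row's group scale
                # and zero via g_idx, and return the dequantized matrix transposed back
                # to its logical orientation. Plain "gptq" checkpoints store zeros off
                # by one, hence the +1 applied before the final computation.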
  301                bits = quant_config["bits"]
  302                assert bits in (2, 3, 4, 8)
  303                assert qweight.dtype == qzeros.dtype
  304                maxq = (2 ** bits) - 1
  305                weight = None
  306                zeros = None
  307                pack_dtype_bits = qweight.dtype.itemsize * 8
  308
  309                if bits in [2, 4, 8]:
  310                    pack_factor = pack_dtype_bits // bits
  311                    wf = torch.tensor(list(range(0, pack_dtype_bits, bits)), dtype=torch.int32).unsqueeze(0)
  312                    if self.lazy:
  313                        wf = LazyTorchTensor.from_eager(wf)
  314
  315                    zeros = torch.bitwise_right_shift(
  316                        qzeros.unsqueeze(2).expand(-1, -1, pack_factor),
  317                        wf.unsqueeze(0)
  318                    ).to(torch.int16 if bits == 8 else torch.int8)
  319                    zeros = torch.bitwise_and(zeros, maxq).reshape(scales.shape)
  320
  321                    weight = torch.bitwise_and(
  322                        torch.bitwise_right_shift(
  323                            qweight.unsqueeze(1).expand(-1, pack_factor, -1),
  324                            wf.unsqueeze(-1)
  325                        ).to(torch.int16 if bits == 8 else torch.int8),
  326                        maxq
  327                    )
  328                elif bits == 3:
  329                    raise NotImplementedError("3-bit gptq dequantization is not yet implemented")
  330
  331                assert weight is not None
  332                assert zeros is not None
  333
  334                weight = weight.reshape(weight.shape[0] * weight.shape[1], weight.shape[2])
  335
  336                # gptq_v2 doesn't need to offset zeros
  337                if quant_config.get("checkpoint_format", "gptq") == "gptq":
  338                    zeros += 1
  339
  340                return (scales[g_idx].float() * (weight - zeros[g_idx]).float()).T
  341
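                 # compressed-tensors "pack-quantized": num_bits-wide ints packed into int32 words along dim 1 (zero-points, if present, are packed along dim 0); group-wise scales are applied after trimming the padding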
  342            def dequant_packed(w: Tensor, scale: Tensor, shape_tensor: Tensor, zero_point: Tensor | None, num_bits: int, group_size: int):
  343                assert w.dtype == torch.int32
  344                shape = tuple(shape_tensor.tolist())
  345                assert len(shape) == 2
  346                mask = (1 << num_bits) - 1
  347
  348                shifts = torch.arange(0, 32 - (num_bits - 1), num_bits, dtype=torch.int32)
  349                if self.lazy:
  350                    shifts = LazyTorchTensor.from_eager(shifts)
  351
  352                if zero_point is None:
  353                    offset = 1 << (num_bits - 1)
  354                else:
  355                    assert len(zero_point.shape) == 2
  356                    offset = (zero_point.unsqueeze(1) >> shifts.reshape(1, -1, 1)) & mask
  357                    offset = offset.reshape(-1, zero_point.shape[1])
  358                    # trim padding, and prepare for broadcast
  359                    # NOTE: the zero-point is packed along dim 0
  360                    offset = offset[:shape[0], :].unsqueeze(-1)
  361
  362                # extract values
  363                # NOTE: the weights are packed along dim 1
  364                unpacked = (w.unsqueeze(-1) >> shifts.reshape(1, 1, -1)) & mask
  365                unpacked = unpacked.reshape(shape[0], -1)
  366
  367                # trim padding
  368                unpacked = unpacked[:, :shape[1]]
  369
  370                # prepare for broadcast of the scale
  371                unpacked = unpacked.reshape(shape[0], (unpacked.shape[-1] + group_size - 1) // group_size, group_size)
  372                unpacked = unpacked - offset
  373
  374                return (unpacked * scale.unsqueeze(-1).float()).reshape(shape)
  375
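                 # dispatch on quant_method: wrap each quantized weight in a closure that dequantizes it on access, and collect the now-redundant auxiliary tensors for removal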
  376            if quant_method == "bitnet":
  377                for name in self.model_tensors.keys():
  378                    if name.endswith(".weight_scale"):
  379                        weight_name = name.removesuffix("_scale")
  380                        w = self.model_tensors[weight_name]
  381                        s = self.model_tensors[name]
  382                        self.model_tensors[weight_name] = lambda w=w, s=s: dequant_bitnet(w(), s())
  383                        tensors_to_remove.append(name)
  384            elif quant_method == "fp8":
  385                block_size = quant_config.get("weight_block_size")
  386                for name in self.model_tensors.keys():
  387                    if name.endswith(".weight_scale_inv"):
  388                        weight_name = name.removesuffix("_scale_inv")
  389                        w = self.model_tensors[weight_name]
  390                        s = self.model_tensors[name]
  391                        self.model_tensors[weight_name] = lambda w=w, s=s, bs=block_size: dequant_simple(w(), s(), bs)
  392                        tensors_to_remove.append(name)
  393                    if name.endswith(".activation_scale"):  # unused
  394                        tensors_to_remove.append(name)
  395                    # mistral format
  396                    if name.endswith(".qscale_weight"):
  397                        weight_name = name.removesuffix("qscale_weight") + "weight"
  398                        w = self.model_tensors[weight_name]
  399                        s = self.model_tensors[name]
  400                        self.model_tensors[weight_name] = lambda w=w, s=s, bs=block_size: dequant_simple(w(), s(), bs)
  401                        tensors_to_remove.append(name)
  402                    if name.endswith(".qscale_act"):
  403                        tensors_to_remove.append(name)
  404            elif quant_method == "gptq":
  405                for name in self.model_tensors.keys():
  406                    if name.endswith(".qweight"):
  407                        base_name = name.removesuffix(".qweight")
  408                        g_idx = self.model_tensors[base_name + ".g_idx"]
  409                        qweight = self.model_tensors[base_name + ".qweight"]
  410                        qzeros = self.model_tensors[base_name + ".qzeros"]
  411                        scales = self.model_tensors[base_name + ".scales"]
  412                        new_tensors[base_name + ".weight"] = (
  413                            lambda g=g_idx, z=qzeros, w=qweight, s=scales: dequant_gptq(
  414                                g(), w(), z(), s()
  415                            )
  416                        )
  417                        tensors_to_remove += [
  418                            base_name + n
  419                            for n in (
  420                                ".g_idx",
  421                                ".qzeros",
  422                                ".qweight",
  423                                ".scales",
  424                            )
  425                        ]
  426            elif quant_method == "compressed-tensors":
  427                quant_format = quant_config["format"]
  428                groups = quant_config["config_groups"]
  429                if len(groups) > 1:
  430                    raise NotImplementedError("Can't handle multiple config groups for compressed-tensors yet")
  431                weight_config = tuple(groups.values())[0]["weights"]
  432
  433                if quant_format == "float-quantized" or quant_format == "int-quantized" or quant_format == "naive-quantized":
  434                    block_size = weight_config.get("block_structure", None)
  435                    strategy = weight_config.get("strategy")
  436                    assert strategy == "channel" or strategy == "block"
  437                    assert weight_config.get("group_size") is None  # didn't find a model using this yet
  438                    for name in self.model_tensors.keys():
  439                        if name.endswith(".weight_scale"):
  440                            weight_name = name.removesuffix("_scale")
  441                            w = self.model_tensors[weight_name]
  442                            s = self.model_tensors[name]
  443                            self.model_tensors[weight_name] = lambda w=w, s=s: dequant_simple(w(), s(), block_size)
  444                            tensors_to_remove.append(name)
  445                elif quant_format == "pack-quantized":
  446                    assert weight_config.get("strategy") == "group"
  447                    assert weight_config.get("type", "int") == "int"
  448                    num_bits = weight_config.get("num_bits")
  449                    group_size = weight_config.get("group_size")
  450                    assert isinstance(num_bits, int)
  451                    assert isinstance(group_size, int)
  452                    for name in self.model_tensors.keys():
  453                        if name.endswith(".weight_packed"):
  454                            base_name = name.removesuffix("_packed")
  455                            w = self.model_tensors[name]
  456                            scale = self.model_tensors[base_name + "_scale"]
  457                            shape = self.model_tensors[base_name + "_shape"]
  458                            zero_point = self.model_tensors.get(base_name + "_zero_point", lambda: None)
  459                            new_tensors[base_name] = (
  460                                lambda w=w, scale=scale, shape=shape, zero_point=zero_point: dequant_packed(
  461                                    w(), scale(), shape(), zero_point(), num_bits, group_size,
  462                                )
  463                            )
  464                            tensors_to_remove += [base_name + n for n in ("_packed", "_shape", "_scale")]
  465                            if (base_name + "_zero_point") in self.model_tensors:
  466                                tensors_to_remove.append(base_name + "_zero_point")
  467                else:
  468                    raise NotImplementedError(f"Quant format {quant_format!r} for method {quant_method!r} is not yet supported")
  469            else:
  470                raise NotImplementedError(f"Quant method is not yet supported: {quant_method!r}")
  471
  472        for name in tensors_to_remove:
  473            if name in self.model_tensors:
  474                del self.model_tensors[name]
  475
  476        for name, value in new_tensors.items():
  477            self.model_tensors[name] = value
  478
  479    def get_tensors(self) -> Iterator[tuple[str, Tensor]]:
  480        for name, gen in self.model_tensors.items():
  481            yield name, gen()
  482
  483    def format_tensor_name(self, key: gguf.MODEL_TENSOR, bid: int | None = None, suffix: str = ".weight") -> str:
  484        if key not in gguf.MODEL_TENSORS[self.model_arch]:
  485            raise ValueError(f"Missing {key!r} for MODEL_TENSORS of {self.model_arch!r}")
  486        name: str = gguf.TENSOR_NAMES[key]
  487        if "{bid}" in name:
  488            assert bid is not None
  489            name = name.format(bid=bid)
  490        return name + suffix
  491
  492    def match_model_tensor_name(self, name: str, key: gguf.MODEL_TENSOR, bid: int | None, suffix: str = ".weight") -> bool:
  493        if key not in gguf.MODEL_TENSORS[self.model_arch]:
  494            return False
  495        key_name: str = gguf.TENSOR_NAMES[key]
  496        if "{bid}" in key_name:
  497            if bid is None:
  498                return False
  499            key_name = key_name.format(bid=bid)
  500        else:
  501            if bid is not None:
  502                return False
  503        return name == (key_name + suffix)
  504
  505    def map_tensor_name(self, name: str, try_suffixes: Sequence[str] = (".weight", ".bias")) -> str:
  506        new_name = self.tensor_map.get_name(key=name, try_suffixes=try_suffixes)
  507        if new_name is None:
  508            raise ValueError(f"Can not map tensor {name!r}")
  509        return new_name
  510
  511    def set_gguf_parameters(self):
  512        raise NotImplementedError("set_gguf_parameters() must be implemented in subclasses")
  513
  514    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
  515        del bid # unused
  516        return [(self.map_tensor_name(name), data_torch)]
  517
  518    def tensor_force_quant(self, name: str, new_name: str, bid: int | None, n_dims: int) -> gguf.GGMLQuantizationType | bool:
  519        del name, new_name, bid, n_dims  # unused
  520
  521        return False
  522
  523    # some models need extra generated tensors (like rope_freqs)
  524    def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
  525        return ()
  526
  527    def prepare_tensors(self):
  528        self.dequant_model()
  529
  530        # Handle empty tensor_map for models with block_count=0 (like MobileNetV5)
  531        if self.tensor_map.mapping:
  532            max_name_len = max(len(s) for _, s in self.tensor_map.mapping.values()) + len(".weight,")
  533        else:
  534            max_name_len = len("vision_encoder.weight,")  # Default reasonable length
  535
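             # walk all tensors (including generated extras), map their names, pick a per-tensor GGML quantization type, and hand the data to the GGUF writer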
  536        for name, data_torch in chain(self.generate_extra_tensors(), self.get_tensors()):
  537            # we don't need these
  538            if name.endswith((".attention.masked_bias", ".attention.bias", ".rotary_emb.inv_freq")):
  539                continue
  540
  541            old_dtype = data_torch.dtype
  542
  543            # convert any unsupported data types to float32
  544            if data_torch.dtype not in (torch.float16, torch.float32):
  545                data_torch = data_torch.to(torch.float32)
  546
  547            # use the first number-like part of the tensor name as the block id
  548            bid = None
  549            for part in name.split("."):
  550                if part.isdecimal():
  551                    bid = int(part)
  552                    break
  553
  554            for new_name, data_torch in (self.modify_tensors(data_torch, name, bid)):
  555                # TODO: why do we squeeze here?
  556                # data = data_torch.squeeze().numpy()
  557                data = data_torch.numpy()
  558
  559                n_dims = len(data.shape)
  560                data_qtype: gguf.GGMLQuantizationType | bool = self.tensor_force_quant(name, new_name, bid, n_dims)
  561
  562                # Most of the codebase that takes in 1D tensors or norms only handles F32 tensors
  563                if n_dims <= 1 or new_name.endswith("_norm.weight"):
  564                    data_qtype = gguf.GGMLQuantizationType.F32
  565
  566                # Conditions should closely match those in llama_model_quantize_internal in llama.cpp
  567                # Some tensor types are always in float32
  568                if data_qtype is False and (
  569                    any(
  570                        self.match_model_tensor_name(new_name, key, bid)
  571                        for key in (
  572                            gguf.MODEL_TENSOR.FFN_GATE_INP,
  573                            gguf.MODEL_TENSOR.POS_EMBD,
  574                            gguf.MODEL_TENSOR.TOKEN_TYPES,
  575                            gguf.MODEL_TENSOR.SSM_CONV1D,
  576                            gguf.MODEL_TENSOR.SHORTCONV_CONV,
  577                            gguf.MODEL_TENSOR.TIME_MIX_FIRST,
  578                            gguf.MODEL_TENSOR.TIME_MIX_W1,
  579                            gguf.MODEL_TENSOR.TIME_MIX_W2,
  580                            gguf.MODEL_TENSOR.TIME_MIX_DECAY_W1,
  581                            gguf.MODEL_TENSOR.TIME_MIX_DECAY_W2,
  582                            gguf.MODEL_TENSOR.TIME_MIX_LERP_FUSED,
  583                            gguf.MODEL_TENSOR.POSNET_NORM1,
  584                            gguf.MODEL_TENSOR.POSNET_NORM2,
  585                            gguf.MODEL_TENSOR.V_ENC_EMBD_POS,
  586                            gguf.MODEL_TENSOR.A_ENC_EMBD_POS,
  587                            gguf.MODEL_TENSOR.ALTUP_CORRECT_COEF,
  588                            gguf.MODEL_TENSOR.ALTUP_PREDICT_COEF,
  589                            # Kimi KDA conv weights should be F32
  590                            gguf.MODEL_TENSOR.SSM_CONV1D_Q,
  591                            gguf.MODEL_TENSOR.SSM_CONV1D_K,
  592                            gguf.MODEL_TENSOR.SSM_CONV1D_V,
  593                        )
  594                    )
  595                    or new_name[-7:] not in (".weight", ".lora_a", ".lora_b")
  596                ):
  597                    data_qtype = gguf.GGMLQuantizationType.F32
  598
  599                if data_qtype is False and any(
  600                    self.match_model_tensor_name(new_name, key, bid)
  601                    for key in (
  602                        gguf.MODEL_TENSOR.TOKEN_EMBD,
  603                        gguf.MODEL_TENSOR.PER_LAYER_TOKEN_EMBD,
  604                        gguf.MODEL_TENSOR.OUTPUT,
  605                        gguf.MODEL_TENSOR.ALTUP_ROUTER,
  606                        gguf.MODEL_TENSOR.LAUREL_L,
  607                        gguf.MODEL_TENSOR.LAUREL_R,
  608                    )
  609                ):
  610                    if self.ftype in (
  611                        gguf.LlamaFileType.MOSTLY_TQ1_0,
  612                        gguf.LlamaFileType.MOSTLY_TQ2_0,
  613                    ):
  614                        # TODO: use Q4_K and Q6_K
  615                        data_qtype = gguf.GGMLQuantizationType.F16
  616
  617                # No override (data_qtype is False), or wants to be quantized (data_qtype is True)
  618                if isinstance(data_qtype, bool):
  619                    if self.ftype == gguf.LlamaFileType.ALL_F32:
  620                        data_qtype = gguf.GGMLQuantizationType.F32
  621                    elif self.ftype == gguf.LlamaFileType.MOSTLY_F16:
  622                        data_qtype = gguf.GGMLQuantizationType.F16
  623                    elif self.ftype == gguf.LlamaFileType.MOSTLY_BF16:
  624                        data_qtype = gguf.GGMLQuantizationType.BF16
  625                    elif self.ftype == gguf.LlamaFileType.MOSTLY_Q8_0:
  626                        data_qtype = gguf.GGMLQuantizationType.Q8_0
  627                    elif self.ftype == gguf.LlamaFileType.MOSTLY_TQ1_0:
  628                        data_qtype = gguf.GGMLQuantizationType.TQ1_0
  629                    elif self.ftype == gguf.LlamaFileType.MOSTLY_TQ2_0:
  630                        data_qtype = gguf.GGMLQuantizationType.TQ2_0
  631                    else:
  632                        raise ValueError(f"Unknown file type: {self.ftype.name}")
  633
  634                try:
  635                    data = gguf.quants.quantize(data, data_qtype)
  636                except gguf.QuantError as e:
  637                    logger.warning("%s, %s", e, "falling back to F16")
  638                    data_qtype = gguf.GGMLQuantizationType.F16
  639                    data = gguf.quants.quantize(data, data_qtype)
  640
  641                shape = gguf.quant_shape_from_byte_shape(data.shape, data_qtype) if data.dtype == np.uint8 else data.shape
  642
  643                # reverse shape to make it similar to the internal ggml dimension order
  644                shape_str = f"{{{', '.join(str(n) for n in reversed(shape))}}}"
  645
  646                # n_dims is implicit in the shape
  647                logger.info(f"{f'%-{max_name_len}s' % f'{new_name},'} {old_dtype} --> {data_qtype.name}, shape = {shape_str}")
  648
  649                self.gguf_writer.add_tensor(new_name, data, raw_dtype=data_qtype)
  650
  651    def set_type(self):
  652        self.gguf_writer.add_type(gguf.GGUFType.MODEL)
  653
  654    def prepare_metadata(self, vocab_only: bool):
  655
  656        total_params, shared_params, expert_params, expert_count = self.gguf_writer.get_total_parameter_count()
  657
  658        self.metadata = gguf.Metadata.load(self.metadata_override, self.dir_model_card, self.model_name, total_params)
  659
  660        # If we are using HF model id, set the metadata name to the model id
  661        if self.remote_hf_model_id:
  662            self.metadata.name = self.remote_hf_model_id
  663
  664        # Fallback to model directory name if metadata name is still missing
  665        if self.metadata.name is None:
  666            self.metadata.name = self.dir_model.name
  667
  668        # Generate parameter weight class (useful for leaderboards) if not yet determined
  669        if self.metadata.size_label is None and total_params > 0:
  670            self.metadata.size_label = gguf.size_label(total_params, shared_params, expert_params, expert_count)
  671
  672        self.set_type()
  673
  674        logger.info("Set meta model")
  675        self.metadata.set_gguf_meta_model(self.gguf_writer)
  676
  677        logger.info("Set model parameters")
  678        self.set_gguf_parameters()
  679
  680        logger.info("Set model quantization version")
  681        self.gguf_writer.add_quantization_version(gguf.GGML_QUANT_VERSION)
  682
  683    def write_vocab(self):
  684        raise NotImplementedError("write_vocab() must be implemented in subclasses")
  685
  686    def write(self):
  687        self.prepare_tensors()
  688        self.prepare_metadata(vocab_only=False)
  689        self.gguf_writer.write_header_to_file(path=self.fname_out)
  690        self.gguf_writer.write_kv_data_to_file()
  691        self.gguf_writer.write_tensors_to_file(progress=True)
  692        self.gguf_writer.close()
  693
  694    @staticmethod
  695    def get_model_part_names(dir_model: Path, prefix: str, suffix: str) -> list[str]:
  696        part_names: list[str] = []
  697        for filename in os.listdir(dir_model):
  698            if filename.startswith(prefix) and filename.endswith(suffix):
  699                part_names.append(filename)
  700
  701        part_names.sort()
  702
  703        return part_names
  704
  705    @staticmethod
  706    def load_hparams(dir_model: Path, is_mistral_format: bool):
  707        if is_mistral_format:
  708            with open(dir_model / "params.json", "r", encoding="utf-8") as f:
  709                config = json.load(f)
  710            return config
  711
  712        try:
  713            # for security reasons, we don't allow loading remote code by default
  714            # if a model needs remote code, we will fall back to config.json
  715            config = AutoConfig.from_pretrained(dir_model, trust_remote_code=False).to_dict()
  716        except Exception as e:
  717            logger.warning(f"Failed to load model config from {dir_model}: {e}")
  718            logger.warning("Trying to load config.json instead")
  719            with open(dir_model / "config.json", "r", encoding="utf-8") as f:
  720                config = json.load(f)
  721        if "llm_config" in config:
  722            # rename for InternVL
  723            config["text_config"] = config["llm_config"]
  724        if "lm_config" in config:
  725            # rename for GlmASR
  726            config["text_config"] = config["lm_config"]
  727        if "thinker_config" in config:
  728            # rename for Qwen2.5-Omni
  729            config["text_config"] = config["thinker_config"]["text_config"]
  730        if "lfm" in config:
  731            # rename for LFM2-Audio
  732            config["text_config"] = config["lfm"]
  733        return config
  734
  735    @classmethod
  736    def register(cls, *names: str) -> Callable[[AnyModel], AnyModel]:
  737        assert names
  738
  739        def func(modelcls: AnyModel) -> AnyModel:
  740            model_type = ModelType.MMPROJ if modelcls.model_arch == gguf.MODEL_ARCH.MMPROJ else ModelType.TEXT
  741            for name in names:
  742                cls._model_classes[model_type][name] = modelcls
  743            return modelcls
  744        return func
  745
  746    @classmethod
  747    def print_registered_models(cls):
  748        for model_type, model_classes in cls._model_classes.items():
  749            logger.error(f"{model_type.name} models:")
  750            for name in sorted(model_classes.keys()):
  751                logger.error(f"  - {name}")
  752
  753    @classmethod
  754    def from_model_architecture(cls, arch: str, model_type = ModelType.TEXT) -> type[ModelBase]:
  755        try:
  756            return cls._model_classes[model_type][arch]
  757        except KeyError:
  758            raise NotImplementedError(f'Architecture {arch!r} not supported!') from None
  759
  760
  761class TextModel(ModelBase):
  762    model_type = ModelType.TEXT
  763    hf_arch: str
  764
  765    def __init__(self, *args, **kwargs):
  766        super().__init__(*args, **kwargs)
  767        if not self.is_mistral_format:
  768            self.hf_arch = get_model_architecture(self.hparams, self.model_type)
  769        else:
  770            self.hf_arch = ""
  771
  772        if "text_config" in self.hparams:
  773            # move the text_config to the root level
  774            self.hparams = {**self.hparams, **self.hparams["text_config"]}
  775
  776        self.block_count = self.find_hparam(["n_layers", "num_hidden_layers", "n_layer", "num_layers"])
  777        self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)
  778
  779        self.rope_parameters = self.hparams.get("rope_parameters", self.hparams.get("rope_scaling")) or {}
  780
  781        rope_theta = self.find_hparam(["global_rope_theta", "rope_global_theta", "rope_theta_global", "rope_theta", "rotary_emb_base"], optional=True)
  782        local_rope_theta = self.find_hparam(["local_rope_theta", "rope_local_theta", "rope_theta_local", "swa_rope_theta", "rope_local_base_freq"], optional=True)
  783
  784        # Ensure "rope_theta" and "rope_type" are mirrored in rope_parameters
  785        if "full_attention" not in self.rope_parameters and "sliding_attention" not in self.rope_parameters:
  786            if local_rope_theta is not None:
  787                self.rope_parameters["sliding_attention"] = {"rope_theta": local_rope_theta}
  788            if "rope_theta" not in self.rope_parameters and rope_theta is not None:
  789                self.rope_parameters["rope_theta"] = rope_theta
  790            if "rope_type" not in self.rope_parameters and (rope_type := self.rope_parameters.get("type")) is not None:
  791                self.rope_parameters["rope_type"] = rope_type
  792
  793    @classmethod
  794    def __init_subclass__(cls):
  795        # can't use an abstract property, because overriding it without type errors
  796        # would require using decorated functions instead of simply defining the property
  797        if "model_arch" not in cls.__dict__:
  798            raise TypeError(f"Missing property 'model_arch' for {cls.__name__!r}")
  799
  800    def set_vocab(self):
  801        self._set_vocab_gpt2()
  802
  803    def prepare_metadata(self, vocab_only: bool):
  804        super().prepare_metadata(vocab_only=vocab_only)
  805
  806        total_params = self.gguf_writer.get_total_parameter_count()[0]
  807        # Extract the encoding scheme from the file type name. e.g. 'gguf.LlamaFileType.MOSTLY_Q8_0' --> 'Q8_0'
  808        output_type: str = self.ftype.name.partition("_")[2]
  809
  810        # Filename Output
  811        if self.fname_out.is_dir():
  812            # Generate default filename based on model specification and available metadata
  813            if not vocab_only:
  814                fname_default: str = gguf.naming_convention(self.metadata.name, self.metadata.basename, self.metadata.finetune, self.metadata.version, self.metadata.size_label, output_type, model_type="LoRA" if total_params < 0 else None)
  815            else:
  816                fname_default: str = gguf.naming_convention(self.metadata.name, self.metadata.basename, self.metadata.finetune, self.metadata.version, size_label=None, output_type=None, model_type="vocab")
  817
  818            # Use the default filename
  819            self.fname_out = self.fname_out / f"{fname_default}.gguf"
  820        else:
  821            # Output path is a custom defined templated filename
  822            # Output path is a custom-defined templated filename
  823            # Note: `not is_dir()` is used because `.is_file()` will not detect
  824            #       file template strings, as they don't actually exist as files
  825            # Process templated file name with the output ftype, useful with the "auto" ftype
  826            self.fname_out = self.fname_out.parent / gguf.fill_templated_filename(self.fname_out.name, output_type)
  827
  828        logger.info("Set model tokenizer")
  829        self.set_vocab()
  830
  831    def set_gguf_parameters(self):
  832        self.gguf_writer.add_block_count(self.block_count)
  833
  834        if (n_ctx := self.find_hparam(["max_position_embeddings", "n_ctx", "n_positions", "max_length", "max_sequence_length", "model_max_length"], optional=True)) is not None:
  835            self.gguf_writer.add_context_length(n_ctx)
  836            logger.info(f"gguf: context length = {n_ctx}")
  837
  838        if (n_embd := self.find_hparam(["hidden_size", "n_embd", "dim"], optional=True)) is not None:
  839            self.gguf_writer.add_embedding_length(n_embd)
  840            logger.info(f"gguf: embedding length = {n_embd}")
  841
  842        if (n_ff := self.find_hparam(["intermediate_size", "n_inner", "hidden_dim"], optional=True)) is not None:
  843            self.gguf_writer.add_feed_forward_length(n_ff)
  844            logger.info(f"gguf: feed forward length = {n_ff}")
  845
  846        if (n_head := self.find_hparam(["num_attention_heads", "n_head", "n_heads"], optional=True)) is not None:
  847            self.gguf_writer.add_head_count(n_head)
  848            logger.info(f"gguf: head count = {n_head}")
  849
  850        if (n_head_kv := self.find_hparam(["num_key_value_heads", "n_kv_heads"], optional=True)) is not None:
  851            self.gguf_writer.add_head_count_kv(n_head_kv)
  852            logger.info(f"gguf: key-value head count = {n_head_kv}")
  853
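             # translate the HF rope scaling config into GGUF rope-scaling keys (linear, yarn, longrope); "dynamic" and "llama3" are handled elsewhere (model class / generate_extra_tensors)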
  854        # TODO: Handle "sliding_attention" similarly when models start implementing it
  855        rope_params = self.rope_parameters.get("full_attention", self.rope_parameters)
  856        if (rope_type := rope_params.get("rope_type")) is not None:
  857            rope_factor = rope_params.get("factor")
  858            rope_gguf_type = gguf.RopeScalingType.NONE
  859            if rope_type == "linear" and rope_factor is not None:
  860                rope_gguf_type = gguf.RopeScalingType.LINEAR
  861                self.gguf_writer.add_rope_scaling_type(rope_gguf_type)
  862                self.gguf_writer.add_rope_scaling_factor(rope_factor)
  863            elif rope_type == "yarn" and rope_factor is not None:
  864                rope_gguf_type = gguf.RopeScalingType.YARN
  865                self.gguf_writer.add_rope_scaling_type(rope_gguf_type)
  866                self.gguf_writer.add_rope_scaling_factor(rope_factor)
  867                self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_params["original_max_position_embeddings"])
  868                if (yarn_ext_factor := rope_params.get("extrapolation_factor")) is not None:
  869                    self.gguf_writer.add_rope_scaling_yarn_ext_factor(yarn_ext_factor)
  870                if (yarn_attn_factor := rope_params.get("attention_factor", rope_params.get("attn_factor"))) is not None:
  871                    self.gguf_writer.add_rope_scaling_yarn_attn_factor(yarn_attn_factor)
  872                if (yarn_beta_fast := rope_params.get("beta_fast")) is not None:
  873                    self.gguf_writer.add_rope_scaling_yarn_beta_fast(yarn_beta_fast)
  874                if (yarn_beta_slow := rope_params.get("beta_slow")) is not None:
  875                    self.gguf_writer.add_rope_scaling_yarn_beta_slow(yarn_beta_slow)
  876                # self.gguf_writer.add_rope_scaling_yarn_log_mul(rope_params["mscale_all_dim"])
  877            elif rope_type == "su" or rope_type == "longrope":
  878                rope_gguf_type = gguf.RopeScalingType.LONGROPE
  879                self.gguf_writer.add_rope_scaling_type(rope_gguf_type)
  880            elif rope_type == "dynamic":
  881                # HunYuan, handled in model class
  882                pass
  883            elif rope_type.lower() == "llama3":
  884                # Handled in generate_extra_tensors
  885                pass
  886            else:
  887                logger.warning(f"Unknown RoPE type: {rope_type}")
  888            logger.info(f"gguf: rope scaling type = {rope_gguf_type.name}")
  889
  890        if "mrope_section" in self.rope_parameters:
  891            mrope_section = self.rope_parameters["mrope_section"]
  892            # Pad to 4 dimensions [time, height, width, extra]
  893            while len(mrope_section) < 4:
  894                mrope_section.append(0)
  895            self.gguf_writer.add_rope_dimension_sections(mrope_section[:4])
  896            logger.info(f"gguf: mrope sections: {mrope_section[:4]}")
  897
  898        if (rope_theta := rope_params.get("rope_theta")) is not None:
  899            self.gguf_writer.add_rope_freq_base(rope_theta)
  900            logger.info(f"gguf: rope theta = {rope_theta}")
  901        if (local_rope_theta := self.rope_parameters.get("sliding_attention", {}).get("rope_theta")) is not None:
  902            self.gguf_writer.add_rope_freq_base_swa(local_rope_theta)
  903            logger.info(f"gguf: rope theta swa = {local_rope_theta}")
  904        if (f_rms_eps := self.find_hparam(["rms_norm_eps", "norm_eps"], optional=True)) is not None:
  905            self.gguf_writer.add_layer_norm_rms_eps(f_rms_eps)
  906            logger.info(f"gguf: rms norm epsilon = {f_rms_eps}")
  907        if (f_norm_eps := self.find_hparam(["layer_norm_eps", "layer_norm_epsilon", "norm_epsilon"], optional=True)) is not None:
  908            self.gguf_writer.add_layer_norm_eps(f_norm_eps)
  909            logger.info(f"gguf: layer norm epsilon = {f_norm_eps}")
  910        if (n_experts := self.find_hparam(["num_local_experts", "num_experts"], optional=True)) is not None:
  911            self.gguf_writer.add_expert_count(n_experts)
  912            logger.info(f"gguf: expert count = {n_experts}")
  913        if (n_experts_used := self.find_hparam(["num_experts_per_tok", "num_experts_per_token"], optional=True)) is not None:
  914            self.gguf_writer.add_expert_used_count(n_experts_used)
  915            logger.info(f"gguf: experts used count = {n_experts_used}")
  916        if (n_expert_groups := self.hparams.get("n_group")) is not None:
  917            self.gguf_writer.add_expert_group_count(n_expert_groups)
  918            logger.info(f"gguf: expert groups count = {n_expert_groups}")
  919        if (n_group_used := self.hparams.get("topk_group")) is not None:
  920            self.gguf_writer.add_expert_group_used_count(n_group_used)
  921            logger.info(f"gguf: expert groups used count = {n_group_used}")
  922
  923        if (score_func := self.find_hparam(["score_function", "scoring_func", "score_func", "moe_router_activation", "moe_router_activation_func"], optional=True)) is not None:
  924            if score_func == "sigmoid":
  925                self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SIGMOID)
  926            elif score_func == "softmax":
  927                self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SOFTMAX)
  928            else:
  929                raise ValueError(f"Unsupported expert score gating function value: {score_func}")
  930            logger.info(f"gguf: expert score gating function = {score_func}")
  931
  932        if (head_dim := self.hparams.get("head_dim")) is not None:
  933            self.gguf_writer.add_key_length(head_dim)
  934            self.gguf_writer.add_value_length(head_dim)
  935
  936        self.gguf_writer.add_file_type(self.ftype)
  937        logger.info(f"gguf: file type = {self.ftype}")
  938
  939    def write_vocab(self):
  940        if len(self.gguf_writer.tensors) != 1:
  941            raise ValueError('Splitting the vocabulary is not supported')
  942
  943        self.prepare_metadata(vocab_only=True)
  944        self.gguf_writer.write_header_to_file(path=self.fname_out)
  945        self.gguf_writer.write_kv_data_to_file()
  946        self.gguf_writer.close()
  947
  948    def does_token_look_special(self, token: str | bytes) -> bool:
  949        if isinstance(token, (bytes, bytearray)):
  950            token_text = token.decode(encoding="utf-8")
  951        elif isinstance(token, memoryview):
  952            token_text = token.tobytes().decode(encoding="utf-8")
  953        else:
  954            token_text = token
  955
  956        # Some models mark some added tokens which ought to be control tokens as not special.
  957        # (e.g. command-r, command-r-plus, deepseek-coder, gemma{,-2})
  958        seems_special = token_text in (
  959            "<pad>",  # deepseek-coder
  960            "<mask>", "<2mass>", "[@BOS@]",  # gemma{,-2}
  961        )
  962
  963        seems_special = seems_special or (token_text.startswith("<|") and token_text.endswith("|>"))
  964        seems_special = seems_special or (token_text.startswith("<｜") and token_text.endswith("｜>"))  # deepseek-coder
  965
  966        # TODO: should these be marked as UNUSED instead? (maybe not)
  967        seems_special = seems_special or (token_text.startswith("<unused") and token_text.endswith(">"))  # gemma{,-2}
  968
  969        return seems_special
  970
  971    # used for GPT-2 BPE and WordPiece vocabs
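         # builds the token and token-type lists for ids 0..vocab_size-1, padding gaps with [PAD{id}] and classifying added tokens as CONTROL or USER_DEFINED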
  972    def get_vocab_base(self) -> tuple[list[str], list[int], str]:
  973        tokens: list[str] = []
  974        toktypes: list[int] = []
  975
  976        from transformers import AutoTokenizer
  977        tokenizer = AutoTokenizer.from_pretrained(self.dir_model)
  978        vocab_size = self.hparams.get("vocab_size", len(tokenizer.vocab))
  979        assert max(tokenizer.vocab.values()) < vocab_size
  980
  981        tokpre = self.get_vocab_base_pre(tokenizer)
  982
  983        reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in tokenizer.vocab.items()}
  984        added_vocab = tokenizer.get_added_vocab()
  985
  986        added_tokens_decoder = tokenizer.added_tokens_decoder
  987
  988        for i in range(vocab_size):
  989            if i not in reverse_vocab:
  990                tokens.append(f"[PAD{i}]")
  991                toktypes.append(gguf.TokenType.UNUSED)
  992            else:
  993                token: str = reverse_vocab[i]
  994                if token in added_vocab:
  995                    # The tokenizer in llama.cpp assumes the CONTROL and USER_DEFINED tokens are pre-normalized.
  996                    # To avoid unexpected issues, we make sure to normalize non-normalized tokens
  997                    if not added_tokens_decoder[i].normalized:
  998                        previous_token = token
  999                        token = tokenizer.decode(tokenizer.encode(token, add_special_tokens=False))
 1000                        if previous_token != token:
 1001                            logger.info(f"{repr(previous_token)} is encoded and decoded back to {repr(token)} using AutoTokenizer")
 1002
 1003                    if added_tokens_decoder[i].special or self.does_token_look_special(token):
 1004                        toktypes.append(gguf.TokenType.CONTROL)
 1005                    else:
 1006                        # NOTE: this was added for Gemma.
 1007                        # Encoding and decoding the tokens above isn't sufficient for this case.
 1008                        token = token.replace(b"\xe2\x96\x81".decode("utf-8"), " ")  # pre-normalize user-defined spaces
 1009                        toktypes.append(gguf.TokenType.USER_DEFINED)
 1010                else:
 1011                    toktypes.append(gguf.TokenType.NORMAL)
 1012                tokens.append(token)
 1013
 1014        return tokens, toktypes, tokpre
 1015
 1016    # NOTE: this function is generated by convert_hf_to_gguf_update.py
 1017    #       do not modify it manually!
 1018    # ref:  https://github.com/ggml-org/llama.cpp/pull/6920
 1019    # Marker: Start get_vocab_base_pre
 1020    def get_vocab_base_pre(self, tokenizer) -> str:
 1021        # encoding this string and hashing the resulting tokens would (hopefully) give us a unique identifier that
 1022        # is specific for the BPE pre-tokenizer used by the model
 1023        # we will use this unique identifier to write a "tokenizer.ggml.pre" entry in the GGUF file which we can
 1024        # use in llama.cpp to implement the same pre-tokenizer
 1025
 1026        chktxt = '\n \n\n \n\n\n \t \t\t \t\n  \n   \n    \n     \n🚀 (normal) 😶\u200d🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天～ ------======= нещо на Български \'\'\'\'\'\'```````""""......!!!!!!?????? I\'ve been \'told he\'s there, \'RE you sure? \'M not sure I\'ll make it, \'D you like some tea? We\'Ve a\'lL'
 1027
 1028        chktok = tokenizer.encode(chktxt)
 1029        chkhsh = sha256(str(chktok).encode()).hexdigest()
 1030
 1031        logger.debug(f"chktok: {chktok}")
 1032        logger.debug(f"chkhsh: {chkhsh}")
 1033
 1034        res = None
 1035
 1036        # NOTE: if you get an error here, you need to update the convert_hf_to_gguf_update.py script
 1037        #       or pull the latest version of the model from Huggingface
 1038        #       don't edit the hashes manually!
 1039        if chkhsh == "b6e8e1518dc4305be2fe39c313ed643381c4da5db34a98f6a04c093f8afbe99b":
 1040            # ref: https://huggingface.co/THUDM/glm-4-9b-chat
 1041            res = "chatglm-bpe"
 1042        if chkhsh == "81d72c7348a9f0ebe86f23298d37debe0a5e71149e29bd283904c02262b27516":
 1043            # ref: https://huggingface.co/THUDM/glm-4-9b-chat
 1044            res = "chatglm-bpe"
 1045        if chkhsh == "a1336059768a55c99a734006ffb02203cd450fed003e9a71886c88acf24fdbc2":
 1046            # ref: https://huggingface.co/THUDM/glm-4-9b-hf
 1047            res = "glm4"
 1048        if chkhsh == "9ca2dd618e8afaf09731a7cf6e2105b373ba6a1821559f258b272fe83e6eb902":
 1049            # ref: https://huggingface.co/zai-org/GLM-4.5-Air
 1050            res = "glm4"
 1051        if chkhsh == "1431a23e583c97432bc230bff598d103ddb5a1f89960c8f1d1051aaa944d0b35":
 1052            # ref: https://huggingface.co/sapienzanlp/Minerva-7B-base-v1.0
 1053            res = "minerva-7b"
 1054        if chkhsh == "7e57df22b1fe23a7b1e1c7f3dc4e3f96d43a4eb0836d0c6bdc3436d7b2f1c664":
 1055            # ref: https://huggingface.co/tencent/Hunyuan-A13B-Instruct
 1056            res = "hunyuan"
 1057        if chkhsh == "bba3b3366b646dbdded5dbc42d59598b849371afc42f7beafa914afaa5b70aa6":
 1058            # ref: https://huggingface.co/tencent/Hunyuan-4B-Instruct
 1059            res = "hunyuan-dense"
 1060        if chkhsh == "a6b57017d60e6edb4d88ecc2845188e0eb333a70357e45dcc9b53964a73bbae6":
 1061            # ref: https://huggingface.co/tiiuae/Falcon-H1-0.5B-Base
 1062            res = "falcon-h1"
 1063        if chkhsh == "60476e1243776c4fb1b993dbd7a5f15ac22f83c80afdf425fa5ae01c8d44ef86":
 1064            # ref: https://huggingface.co/tiiuae/Falcon-H1-1B-Base
 1065            res = "falcon-h1"
 1066        if chkhsh == "3eda48b4c4dc7de733d1a8b3e3b4a85243dbbf704da2ee9d42c6beced8897896":
 1067            # ref: https://huggingface.co/tiiuae/Falcon-H1-7B-Base
 1068            res = "falcon-h1"
 1069        if chkhsh == "48f8e02c0359c0bbdd82f26909171fac1c18a457bb47573ed1fe3bbb2c1cfd4b":
 1070            # ref: https://huggingface.co/tiiuae/Falcon-H1-34B-Base
 1071            res = "falcon-h1"
 1072        if chkhsh == "81212dc7cdb7e0c1074ca62c5aeab0d43c9f52b8a737be7b12a777c953027890":
 1073            # ref: https://huggingface.co/moonshotai/Kimi-K2-Base
 1074            res = "kimi-k2"
 1075        if chkhsh == "d4540891389ea895b53b399da6ac824becc30f2fba0e9ddbb98f92e55ca0e97c":
 1076            # ref: https://huggingface.co/Qwen/Qwen3-Embedding-0.6B
 1077            res = "qwen2"
 1078        if chkhsh == "66b8d4e19ab16c3bfd89bce5d785fb7e0155e8648708a1f42077cb9fe002c273":
 1079            # ref: https://huggingface.co/alvarobartt/grok-2-tokenizer
 1080            res = "grok-2"
 1081        if chkhsh == "b3d1dd861f1d4c5c0d2569ce36baf3f90fe8a102db3de50dd71ff860d91be3df":
 1082            # ref: https://huggingface.co/aari1995/German_Semantic_V3
 1083            res = "jina-v2-de"
 1084        if chkhsh == "cdf5f35325780597efd76153d4d1c16778f766173908894c04afc20108536267":
 1085            # ref: https://huggingface.co/zai-org/GLM-4.7-Flash
 1086            res = "glm4"
 1087        if chkhsh == "0ef9807a4087ebef797fc749390439009c3b9eda9ad1a097abbe738f486c01e5":
 1088            # ref: https://huggingface.co/meta-llama/Meta-Llama-3-8B
 1089            res = "llama-bpe"
 1090        if chkhsh == "049ecf7629871e3041641907f3de7c733e4dbfdc736f57d882ba0b0845599754":
 1091            # ref: https://huggingface.co/deepseek-ai/deepseek-llm-7b-base
 1092            res = "deepseek-llm"
 1093        if chkhsh == "347715f544604f9118bb75ed199f68779f423cabb20db6de6f31b908d04d7821":
 1094            # ref: https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-base
 1095            res = "deepseek-coder"
 1096        if chkhsh == "8aeee3860c56296a157a1fe2fad249ec40aa59b1bb5709f4ade11c4e6fe652ed":
 1097            # ref: https://huggingface.co/tiiuae/falcon-7b
 1098            res = "falcon"
 1099        if chkhsh == "0876d13b50744004aa9aeae05e7b0647eac9d801b5ba4668afc01e709c15e19f":
 1100            # ref: https://huggingface.co/BAAI/bge-small-en-v1.5
 1101            res = "bert-bge"
 1102        if chkhsh == "9d032fcbd5501f4a38150912590928bfb36091efb5df11b8e2124b0390e3fb1e":
 1103            # ref: https://huggingface.co/tiiuae/Falcon3-7B-Base
 1104            res = "falcon3"
 1105        if chkhsh == "8e62295832751ca1e8f92f2226f403dea30dc5165e448b5bfa05af5340c64ec7":
 1106            # ref: https://huggingface.co/BAAI/bge-large-zh-v1.5
 1107            res = "bert-bge-large"
 1108        if chkhsh == "b6dc8df998e1cfbdc4eac8243701a65afe638679230920b50d6f17d81c098166":
 1109            # ref: https://huggingface.co/mosaicml/mpt-7b
 1110            res = "mpt"
 1111        if chkhsh == "35d91631860c815f952d711435f48d356ebac988362536bed955d43bfa436e34":
 1112            # ref: https://huggingface.co/bigcode/starcoder2-3b
 1113            res = "starcoder"
 1114        if chkhsh == "3ce83efda5659b07b1ad37ca97ca5797ea4285d9b9ab0dc679e4a720c9da7454":
 1115            # ref: https://huggingface.co/openai-community/gpt2
 1116            res = "gpt-2"
 1117        if chkhsh == "32d85c31273f8019248f2559fed492d929ea28b17e51d81d3bb36fff23ca72b3":
 1118            # ref: https://huggingface.co/stabilityai/stablelm-2-zephyr-1_6b
 1119            res = "stablelm2"
 1120        if chkhsh == "6221ad2852e85ce96f791f476e0b390cf9b474c9e3d1362f53a24a06dc8220ff":
 1121            # ref: https://huggingface.co/smallcloudai/Refact-1_6-base
 1122            res = "refact"
 1123        if chkhsh == "9c2227e4dd922002fb81bde4fc02b0483ca4f12911410dee2255e4987644e3f8":
 1124            # ref: https://huggingface.co/CohereForAI/c4ai-command-r-v01
 1125            res = "command-r"
 1126        if chkhsh == "e636dc30a262dcc0d8c323492e32ae2b70728f4df7dfe9737d9f920a282b8aea":
 1127            # ref: https://huggingface.co/Qwen/Qwen1.5-7B
 1128            res = "qwen2"
 1129        if chkhsh == "b6dc8df998e1cfbdc4eac8243701a65afe638679230920b50d6f17d81c098166":
 1130            # ref: https://huggingface.co/allenai/OLMo-1.7-7B-hf
 1131            res = "olmo"
 1132        if chkhsh == "a8594e3edff7c29c003940395316294b2c623e09894deebbc65f33f1515df79e":
 1133            # ref: https://huggingface.co/databricks/dbrx-base
 1134            res = "dbrx"
 1135        if chkhsh == "c7699093ba4255a91e702aa38a596aa81669f3525dae06c2953267dde580f448":
 1136            # ref: https://huggingface.co/jinaai/jina-reranker-v1-tiny-en
 1137            res = "jina-v1-en"
 1138        if chkhsh == "0876d13b50744004aa9aeae05e7b0647eac9d801b5ba4668afc01e709c15e19f":
 1139            # ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-en
 1140            res = "jina-v2-en"
 1141        if chkhsh == "171aeeedd6fb548d418a7461d053f11b6f1f1fc9b387bd66640d28a4b9f5c643":
 1142            # ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-es
 1143            res = "jina-v2-es"
 1144        if chkhsh == "27949a2493fc4a9f53f5b9b029c82689cfbe5d3a1929bb25e043089e28466de6":
 1145            # ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-de
 1146            res = "jina-v2-de"
 1147        if chkhsh == "c136ed14d01c2745d4f60a9596ae66800e2b61fa45643e72436041855ad4089d":
 1148            # ref: https://huggingface.co/abacusai/Smaug-Llama-3-70B-Instruct
 1149            res = "smaug-bpe"
 1150        if chkhsh == "c7ea5862a53e4272c035c8238367063e2b270d51faa48c0f09e9d5b54746c360":
 1151            # ref: https://huggingface.co/LumiOpen/Poro-34B-chat
 1152            res = "poro-chat"
 1153        if chkhsh == "7967bfa498ade6b757b064f31e964dddbb80f8f9a4d68d4ba7998fcf281c531a":
 1154            # ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-code
 1155            res = "jina-v2-code"
 1156        if chkhsh == "7fc505bd3104ca1083b150b17d088b59534ede9bde81f0dd2090967d7fe52cee":
 1157            # ref: https://huggingface.co/LumiOpen/Viking-7B
 1158            res = "viking"
 1159        if chkhsh == "b53802fb28e26d645c3a310b34bfe07da813026ec7c7716883404d5e0f8b1901":
 1160            # ref: https://huggingface.co/core42/jais-13b
 1161            res = "jais"
 1162        if chkhsh == "7b3e7548e4308f52a76e8229e4e6cc831195d0d1df43aed21ac6c93da05fec5f":
 1163            # ref: https://huggingface.co/WisdomShell/CodeShell-7B
 1164            res = "codeshell"
 1165        if chkhsh == "63b97e4253352e6f357cc59ea5b583e3a680eaeaf2632188c2b952de2588485e":
 1166            # ref: https://huggingface.co/mistralai/Mistral-Nemo-Base-2407
 1167            res = "tekken"
 1168        if chkhsh == "855059429035d75a914d1eda9f10a876752e281a054a7a3d421ef0533e5b6249":
 1169            # ref: https://huggingface.co/HuggingFaceTB/SmolLM-135M
 1170            res = "smollm"
 1171        if chkhsh == "3c30d3ad1d6b64202cd222813e7736c2db6e1bd6d67197090fc1211fbc612ae7":
 1172            # ref: https://huggingface.co/bigscience/bloom
 1173            res = "bloom"
 1174        if chkhsh == "bc01ce58980e1db43859146dc51b1758b3b88729b217a74792e9f8d43e479d21":
 1175            # ref: https://huggingface.co/TurkuNLP/gpt3-finnish-small
 1176            res = "gpt3-finnish"
 1177        if chkhsh == "4e2b24cc4770243d65a2c9ec19770a72f08cffc161adbb73fcbb6b7dd45a0aae":
 1178            # ref: https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct
 1179            res = "exaone"
 1180        if chkhsh == "fcace8b9cac38ce847670c970cd5892031a753a1ef381abd1d9af00f713da085":
 1181            # ref: https://huggingface.co/microsoft/phi-2
 1182            res = "phi-2"
 1183        if chkhsh == "60824e3c0d9401f89943cbb2fff727f0e2d4c545ba4df2d6e4f09a6db0f5b450":
 1184            # ref: https://huggingface.co/facebook/chameleon-7b
 1185            res = "chameleon"
 1186        if chkhsh == "8b5a93ed704057481f240da0be7e7dca721d7f8f4755263b6807227a2cbeae65":
 1187            # ref: https://huggingface.co/sentence-transformers/stsb-roberta-base
 1188            res = "roberta-bpe"
 1189        if chkhsh == "ad851be1dba641f2e3711822f816db2c265f788b37c63b4e1aeacb9ee92de8eb":
 1190            # ref: https://huggingface.co/ai-sage/GigaChat-20B-A3B-instruct
 1191            res = "gigachat"
 1192        if chkhsh == "d4c8f286ea6b520b3d495c4455483cfa2302c0cfcd4be05d781b6a8a0a7cdaf1":
 1193            # ref: https://huggingface.co/Infinigence/Megrez-3B-Instruct
 1194            res = "megrez"
 1195        if chkhsh == "877081d19cf6996e2c4ff0e1236341e9b7bde288f5311a56a937f0afbbb3aeb5":
 1196            # ref: https://huggingface.co/deepseek-ai/DeepSeek-V3
 1197            res = "deepseek-v3"
 1198        if chkhsh == "b3f499bb4255f8ca19fccd664443283318f2fd2414d5e0b040fbdd0cc195d6c5":
 1199            # ref: https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
 1200            res = "deepseek-r1-qwen"
 1201        if chkhsh == "ccc2ef013c104be7bae2965776d611e1d7a8a2a9c547dd93a682c9a9fc80352e":
 1202            # ref: https://huggingface.co/Xenova/gpt-4o
 1203            res = "gpt-4o"
 1204        if chkhsh == "7dec86086fcc38b66b7bc1575a160ae21cf705be7718b9d5598190d7c12db76f":
 1205            # ref: https://huggingface.co/UW/OLMo2-8B-SuperBPE-t180k
 1206            res = "superbpe"
 1207        if chkhsh == "1994ffd01900cfb37395608534236ecd63f2bd5995d6cb1004dda1af50240f15":
 1208            # ref: https://huggingface.co/trillionlabs/Trillion-7B-preview
 1209            res = "trillion"
 1210        if chkhsh == "96a5f08be6259352137b512d4157e333e21df7edd3fcd152990608735a65b224":
 1211            # ref: https://huggingface.co/inclusionAI/Ling-lite
 1212            res = "bailingmoe"
 1213        if chkhsh == "d353350c764d8c3b39c763113960e4fb4919bea5fbf208a0e3b22e8469dc7406":
 1214            # ref: https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E-Instruct
 1215            res = "llama4"
 1216        if chkhsh == "0e9433cbbb161f89e264eb32e8e64bfe69e834973ffca5d41d3948a604a3e2a3":
 1217            # ref: https://huggingface.co/mistral-community/pixtral-12b
 1218            res = "pixtral"
 1219        if chkhsh == "d5f1dd6f980fec569fb218a81a7658ac45fc56b38c5a0adeb1c232fbe04ef5ec":
 1220            # ref: https://huggingface.co/ByteDance-Seed/Seed-Coder-8B-Base
 1221            res = "seed-coder"
 1222        if chkhsh == "b0a6b1c0bd5998ebd9df08611efde34a4ff03faed45ae09c43e6b31ebd4b94cf":
 1223            # ref: https://huggingface.co/skt/A.X-4.0
 1224            res = "a.x-4.0"
 1225        if chkhsh == "f6791d196f87ce6b56a7d234be618e0d58f8cda3549416635b2bebcd22cd95c4":
 1226            # ref: https://huggingface.co/K-intelligence/Midm-2.0-Base-Instruct
 1227            res = "midm-2.0"
 1228        if chkhsh == "169bf0296a13c4d9b7672313f749eb36501d931022de052aad6e36f2bf34dd51":
 1229            # ref: https://huggingface.co/LiquidAI/LFM2-Tokenizer
 1230            res = "lfm2"
 1231        if chkhsh == "2085e1638f6c377a0aa4ead21b27bb4cb941bf800df86ed391011769c1758dfb":
 1232            # ref: https://huggingface.co/LGAI-EXAONE/EXAONE-4.0-32B
 1233            res = "exaone4"
 1234        if chkhsh == "a1e163ecab2e718a4c829d1148b6e86824ec36163bb71941c3dca9cd5ac25756":
 1235            # ref: https://huggingface.co/JetBrains/Mellum-4b-base
 1236            res = "mellum"
 1237        if chkhsh == "a0b64b4385f123663873756336c085744376d015ff328bb1d901598f63c44152":
 1238            # ref: https://huggingface.co/answerdotai/ModernBERT-base
 1239            res = "modern-bert"
 1240        if chkhsh == "49fc0303c9e0d2c2c565c510f64b2d9b271276acdcdadff733249eda9f7d59df":
 1241            # ref: https://huggingface.co/arcee-ai/Trinity-Tokenizer
 1242            res = "afmoe"
 1243        if chkhsh == "9b1be57e70d20d9501b2b3186e792d81181ae36ada3903c26f9fea418cf87206":
 1244            # ref: https://huggingface.co/inclusionAI/Ling-mini-base-2.0
 1245            res = "bailingmoe2"
 1246        if chkhsh == "53e325976a6e142379c19b09afcae354f2f496f147afa8f9e189a33fe4e3024e":
 1247            # ref: https://huggingface.co/ibm-granite/granite-docling-258M
 1248            res = "granite-docling"
 1249        if chkhsh == "f4f37b6c8eb9ea29b3eac6bb8c8487c5ab7885f8d8022e67edc1c68ce8403e95":
 1250            # ref: https://huggingface.co/MiniMaxAI/MiniMax-M2
 1251            res = "minimax-m2"
 1252        if chkhsh == "4a2e2abae11ca2b86d570fc5b44be4d5eb5e72cc8f22dd136a94b37da83ab665":
 1253            # ref: https://huggingface.co/KORMo-Team/KORMo-tokenizer
 1254            res = "kormo"
 1255        if chkhsh == "9d70134b369a70e5735009b6de918f7581b5211f7c074d1f89f753aea8248af1":
 1256            # ref: https://huggingface.co/tencent/Youtu-LLM-2B
 1257            res = "youtu"
 1258        if chkhsh == "16389f0a1f51ee53e562ffd51c371dc508639ab0e4261502071836e50e223e91":
 1259            # ref: https://huggingface.co/upstage/Solar-Open-100B
 1260            res = "solar-open"
 1261        if chkhsh == "6c81ce329e0802883b22eabab0d3fa48357337ef1ecb45443828bf1f6254833f":
 1262            # ref: https://huggingface.co/LGAI-EXAONE/K-EXAONE-236B-A23B
 1263            res = "exaone-moe"
 1264        if chkhsh == "d30d75d9059f1aa2c19359de71047b3ae408c70875e8a3ccf8c5fba56c9d8af4":
 1265            # ref: https://huggingface.co/Qwen/Qwen3.5-9B-Instruct
 1266            res = "qwen35"
 1267
 1268        if res is None:
 1269            logger.warning("\n")
 1270            logger.warning("**************************************************************************************")
 1271            logger.warning("** WARNING: The BPE pre-tokenizer was not recognized!")
 1272            logger.warning("**          There are 2 possible reasons for this:")
 1273            logger.warning("**          - the model has not been added to convert_hf_to_gguf_update.py yet")
 1274            logger.warning("**          - the pre-tokenization config has changed upstream")
 1275            logger.warning("**          Check your model files and convert_hf_to_gguf_update.py and update them accordingly.")
 1276            logger.warning("** ref:     https://github.com/ggml-org/llama.cpp/pull/6920")
 1277            logger.warning("**")
 1278            logger.warning(f"** chkhsh:  {chkhsh}")
 1279            logger.warning("**************************************************************************************")
 1280            logger.warning("\n")
 1281            raise NotImplementedError("BPE pre-tokenizer was not recognized - update get_vocab_base_pre()")
 1282
 1283        logger.debug(f"tokenizer.ggml.pre: {repr(res)}")
 1284        logger.debug(f"chkhsh: {chkhsh}")
 1285
 1286        return res
 1287        # Marker: End get_vocab_base_pre
 1288
 1289    def _set_vocab_none(self) -> None:
 1290        self.gguf_writer.add_tokenizer_model("none")
 1291
 1292    def _set_vocab_gpt2(self) -> None:
 1293        tokens, toktypes, tokpre = self.get_vocab_base()
 1294        self.gguf_writer.add_tokenizer_model("gpt2")
 1295        self.gguf_writer.add_tokenizer_pre(tokpre)
 1296        self.gguf_writer.add_token_list(tokens)
 1297        self.gguf_writer.add_token_types(toktypes)
 1298
 1299        special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True)
 1300        special_vocab.add_to_gguf(self.gguf_writer)
 1301
 1302    def _set_vocab_qwen(self):
 1303        dir_model = self.dir_model
 1304        hparams = self.hparams
 1305        tokens: list[str] = []
 1306        toktypes: list[int] = []
 1307
 1308        from transformers import AutoTokenizer
 1309        tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True)
 1310        vocab_size = hparams["vocab_size"]
 1311        assert max(tokenizer.get_vocab().values()) < vocab_size
 1312
 1313        tokpre = self.get_vocab_base_pre(tokenizer)
 1314
 1315        merges = []
 1316        vocab = {}
 1317        mergeable_ranks = tokenizer.mergeable_ranks
 1318        for token, rank in mergeable_ranks.items():
 1319            vocab[QwenModel.token_bytes_to_string(token)] = rank
 1320            if len(token) == 1:
 1321                continue
 1322            merged = QwenModel.bpe(mergeable_ranks, token, max_rank=rank)
 1323            assert len(merged) == 2
 1324            merges.append(' '.join(map(QwenModel.token_bytes_to_string, merged)))
 1325
 1326        # for this kind of tokenizer, added_vocab is not a subset of vocab, so they need to be combined
 1327        added_vocab = tokenizer.special_tokens
 1328        reverse_vocab = {id_ : encoded_tok for encoded_tok, id_ in {**vocab, **added_vocab}.items()}
 1329
 1330        for i in range(vocab_size):
 1331            if i not in reverse_vocab:
 1332                tokens.append(f"[PAD{i}]")
 1333                toktypes.append(gguf.TokenType.UNUSED)
 1334            elif reverse_vocab[i] in added_vocab:
 1335                tokens.append(reverse_vocab[i])
 1336                toktypes.append(gguf.TokenType.CONTROL)
 1337            else:
 1338                tokens.append(reverse_vocab[i])
 1339                toktypes.append(gguf.TokenType.NORMAL)
 1340
 1341        self.gguf_writer.add_tokenizer_model("gpt2")
 1342        self.gguf_writer.add_tokenizer_pre(tokpre)
 1343        self.gguf_writer.add_token_list(tokens)
 1344        self.gguf_writer.add_token_types(toktypes)
 1345
 1346        special_vocab = gguf.SpecialVocab(dir_model, load_merges=False)
 1347        special_vocab.merges = merges
 1348        # only add special tokens when they were not already loaded from config.json
 1349        if len(special_vocab.special_token_ids) == 0:
 1350            special_vocab._set_special_token("bos", tokenizer.special_tokens["<|endoftext|>"])
 1351            special_vocab._set_special_token("eos", tokenizer.special_tokens["<|endoftext|>"])
 1352        # this one is usually not in config.json anyway
 1353        special_vocab._set_special_token("unk", tokenizer.special_tokens["<|endoftext|>"])
 1354        special_vocab.add_to_gguf(self.gguf_writer)
 1355
 1356    def _set_vocab_sentencepiece(self, add_to_gguf=True):
 1357        tokens, scores, toktypes = self._create_vocab_sentencepiece()
 1358
 1359        self.gguf_writer.add_tokenizer_model("llama")
 1360        self.gguf_writer.add_tokenizer_pre("default")
 1361        self.gguf_writer.add_token_list(tokens)
 1362        self.gguf_writer.add_token_scores(scores)
 1363        self.gguf_writer.add_token_types(toktypes)
 1364
 1365        special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
 1366        special_vocab.add_to_gguf(self.gguf_writer)
 1367
 1368    def _create_vocab_sentencepiece(self):
 1369        from sentencepiece import SentencePieceProcessor
 1370
 1371        tokenizer_path = self.dir_model / 'tokenizer.model'
 1372
 1373        if not tokenizer_path.is_file():
 1374            raise FileNotFoundError(f"File not found: {tokenizer_path}")
 1375
 1376        tokenizer = SentencePieceProcessor()
 1377        tokenizer.LoadFromFile(str(tokenizer_path))
 1378
 1379        vocab_size = self.find_hparam([
 1380            "vocab_size_per_layer_input", # gemma3n
 1381            "vocab_size",
 1382        ], optional=True) or tokenizer.vocab_size()
 1383
 1384        tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
 1385        scores: list[float] = [-10000.0] * vocab_size
 1386        toktypes: list[int] = [SentencePieceTokenTypes.UNUSED] * vocab_size
 1387
 1388        for token_id in range(tokenizer.vocab_size()):
 1389            if token_id >= vocab_size:
 1390                logger.warning(f'ignore tokens from {token_id}: id is out of range, max={vocab_size - 1}')
 1391                break
 1392
 1393            piece = tokenizer.IdToPiece(token_id)
 1394            text = piece.encode("utf-8")
 1395            score = tokenizer.GetScore(token_id)
 1396
 1397            toktype = SentencePieceTokenTypes.NORMAL
 1398            if tokenizer.IsUnknown(token_id):
 1399                toktype = SentencePieceTokenTypes.UNKNOWN
 1400            elif tokenizer.IsControl(token_id):
 1401                toktype = SentencePieceTokenTypes.CONTROL
 1402            elif tokenizer.IsUnused(token_id):
 1403                toktype = SentencePieceTokenTypes.UNUSED
 1404            elif tokenizer.IsByte(token_id):
 1405                toktype = SentencePieceTokenTypes.BYTE
 1406
 1407            tokens[token_id] = text
 1408            scores[token_id] = score
 1409            toktypes[token_id] = toktype
 1410
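             # added_tokens.json, when present, maps token text to id; a hypothetical entry: {"<extra_id_0>": 32000}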
 1411        added_tokens_file = self.dir_model / 'added_tokens.json'
 1412        if added_tokens_file.is_file():
 1413            with open(added_tokens_file, "r", encoding="utf-8") as f:
 1414                added_tokens_json = json.load(f)
 1415                for key in added_tokens_json:
 1416                    token_id = added_tokens_json[key]
 1417                    if token_id >= vocab_size:
 1418                        logger.warning(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}')
 1419                        continue
 1420
 1421                    tokens[token_id] = key.encode("utf-8")
 1422                    scores[token_id] = -1000.0
 1423                    toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
 1424
 1425        tokenizer_config_file = self.dir_model / 'tokenizer_config.json'
 1426        if tokenizer_config_file.is_file():
 1427            with open(tokenizer_config_file, "r", encoding="utf-8") as f:
 1428                tokenizer_config_json = json.load(f)
 1429                added_tokens_decoder = tokenizer_config_json.get("added_tokens_decoder", {})
 1430                for token_id, token_data in added_tokens_decoder.items():
 1431                    token_id = int(token_id)
 1432                    token: str = token_data["content"]
 1433                    if token_id >= vocab_size:
 1434                        logger.warning(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}')
 1435                        continue
 1436                    if toktypes[token_id] != SentencePieceTokenTypes.UNUSED:
 1437                        if tokens[token_id] != token.encode("utf-8"):
 1438                            logger.warning(f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} -> {token!r}')
 1439                    if token_data.get("special") or self.does_token_look_special(token):
 1440                        toktypes[token_id] = SentencePieceTokenTypes.CONTROL
 1441                    else:
 1442                        token = token.replace(b"\xe2\x96\x81".decode("utf-8"), " ")  # pre-normalize user-defined spaces
 1443                        toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
 1444
 1445                    scores[token_id] = -1000.0
 1446                    tokens[token_id] = token.encode("utf-8")
 1447
 1448        if vocab_size > len(tokens):
 1449            pad_count = vocab_size - len(tokens)
 1450            logger.debug(f"Padding vocab with {pad_count} token(s) - [PAD1] through [PAD{pad_count}]")
 1451            for i in range(1, pad_count + 1):
 1452                tokens.append(bytes(f"[PAD{i}]", encoding="utf-8"))
 1453                scores.append(-1000.0)
 1454                toktypes.append(SentencePieceTokenTypes.UNUSED)
 1455
 1456        return tokens, scores, toktypes
 1457
 1458    def _set_vocab_llama_hf(self):
 1459        vocab = gguf.LlamaHfVocab(self.dir_model)
 1460        tokens = []
 1461        scores = []
 1462        toktypes = []
 1463
 1464        for text, score, toktype in vocab.all_tokens():
 1465            tokens.append(text)
 1466            scores.append(score)
 1467            toktypes.append(toktype)
 1468
 1469        assert len(tokens) == vocab.vocab_size
 1470
 1471        self.gguf_writer.add_tokenizer_model("llama")
 1472        self.gguf_writer.add_tokenizer_pre("default")
 1473        self.gguf_writer.add_token_list(tokens)
 1474        self.gguf_writer.add_token_scores(scores)
 1475        self.gguf_writer.add_token_types(toktypes)
 1476
 1477        special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
 1478        special_vocab.add_to_gguf(self.gguf_writer)
 1479
 1480    def _set_vocab_rwkv_world(self):
 1481        assert (self.dir_model / "rwkv_vocab_v20230424.txt").is_file()
 1482        vocab_size = self.hparams.get("vocab_size", 65536)
 1483
 1484        tokens: list[bytes] = ['<s>'.encode("utf-8")]
 1485        toktypes: list[int] = [gguf.TokenType.CONTROL]
 1486
 1487        with open(self.dir_model / "rwkv_vocab_v20230424.txt", "r", encoding="utf-8") as f:
 1488            lines = f.readlines()
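                 # each line is "<id> <python-literal token> <byte length>"; a hypothetical example: 65 'A' 1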
 1489            for line in lines:
 1490                parts = line.split(' ')
 1491                assert len(parts) >= 3
 1492                token, token_len = ast.literal_eval(' '.join(parts[1:-1])), int(parts[-1])
 1493                token = token.encode("utf-8") if isinstance(token, str) else token
 1494                assert isinstance(token, bytes)
 1495                assert len(token) == token_len
 1496                token_text: str = repr(token)[2:-1]  # "b'\xff'" -> "\xff"
 1497                tokens.append(token_text.encode("utf-8"))
 1498                toktypes.append(gguf.TokenType.NORMAL)
 1499        remainder = vocab_size - len(tokens)
 1500        assert remainder >= 0
 1501        for i in range(len(tokens), vocab_size):
 1502            tokens.append(f"[PAD{i}]".encode("utf-8"))
 1503            toktypes.append(gguf.TokenType.UNUSED)
 1504
 1505        self.gguf_writer.add_tokenizer_model("rwkv")
 1506        self.gguf_writer.add_token_list(tokens)
 1507        self.gguf_writer.add_token_types(toktypes)
 1508        special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False)
 1509        if special_vocab.chat_template is None:
 1510            template_path = Path(__file__).parent / "models" / "templates" / "llama-cpp-rwkv-world.jinja"
 1511            if template_path.is_file():
 1512                with open(template_path, "r", encoding="utf-8") as f:
 1513                    template = f.read()
 1514            else:
 1515                template = "rwkv-world"
 1516            special_vocab.chat_template = template
 1517        # hack: Add '\n\n' as the EOT token to make it chat normally
 1518        special_vocab._set_special_token("eot", 261)
 1519        # hack: Override these as they have already been set (incorrectly)
 1520        special_vocab.special_token_ids["bos"] = 0
 1521        special_vocab.special_token_ids["eos"] = 0
 1522
 1523        special_vocab.add_to_gguf(self.gguf_writer)
 1524
 1525    def _set_vocab_builtin(self, model_name: Literal["gpt-neox", "llama-spm"], vocab_size: int):
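             # Copies a bundled vocab into the writer; a hypothetical call such as
             # self._set_vocab_builtin("llama-spm", 32000) would load models/ggml-vocab-llama-spm.gguf and
             # take its first 32000 tokens (plus scores, token types and special token ids).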
 1526        tokenizer_path = Path(sys.path[0]) / "models" / f"ggml-vocab-{model_name}.gguf"
 1527        logger.warning(f"Using tokenizer from '{os.path.relpath(tokenizer_path, os.getcwd())}'")
 1528        vocab_reader = gguf.GGUFReader(tokenizer_path, "r")
 1529
 1530        default_pre = "mpt" if model_name == "gpt-neox" else "default"
 1531
 1532        field = vocab_reader.get_field(gguf.Keys.Tokenizer.MODEL)
 1533        assert field  # tokenizer model
 1534        self.gguf_writer.add_tokenizer_model(bytes(field.parts[-1]).decode("utf-8"))
 1535
 1536        field = vocab_reader.get_field(gguf.Keys.Tokenizer.PRE)
 1537        self.gguf_writer.add_tokenizer_pre(bytes(field.parts[-1]).decode("utf-8") if field else default_pre)
 1538
 1539        field = vocab_reader.get_field(gguf.Keys.Tokenizer.LIST)
 1540        assert field  # token list
 1541        self.gguf_writer.add_token_list([bytes(field.parts[i]) for i in field.data][:vocab_size])
 1542
 1543        if model_name == "llama-spm":
 1544            field = vocab_reader.get_field(gguf.Keys.Tokenizer.SCORES)
 1545            assert field  # token scores
 1546            self.gguf_writer.add_token_scores([field.parts[i].tolist()[0] for i in field.data][:vocab_size])
 1547
 1548        field = vocab_reader.get_field(gguf.Keys.Tokenizer.TOKEN_TYPE)
 1549        assert field  # token types
 1550        self.gguf_writer.add_token_types([field.parts[i].tolist()[0] for i in field.data][:vocab_size])
 1551
 1552        if model_name != "llama-spm":
 1553            field = vocab_reader.get_field(gguf.Keys.Tokenizer.MERGES)
 1554            assert field  # token merges
 1555            self.gguf_writer.add_token_merges([bytes(field.parts[i]) for i in field.data])
 1556
 1557        if (field := vocab_reader.get_field(gguf.Keys.Tokenizer.BOS_ID)) is not None:
 1558            self.gguf_writer.add_bos_token_id(field.parts[-1].tolist()[0])
 1559        if (field := vocab_reader.get_field(gguf.Keys.Tokenizer.EOS_ID)) is not None:
 1560            self.gguf_writer.add_eos_token_id(field.parts[-1].tolist()[0])
 1561        if (field := vocab_reader.get_field(gguf.Keys.Tokenizer.UNK_ID)) is not None:
 1562            self.gguf_writer.add_unk_token_id(field.parts[-1].tolist()[0])
 1563        if (field := vocab_reader.get_field(gguf.Keys.Tokenizer.PAD_ID)) is not None:
 1564            self.gguf_writer.add_pad_token_id(field.parts[-1].tolist()[0])
 1565        if (field := vocab_reader.get_field(gguf.Keys.Tokenizer.ADD_BOS)) is not None:
 1566            self.gguf_writer.add_add_bos_token(field.parts[-1].tolist()[0])
 1567        if (field := vocab_reader.get_field(gguf.Keys.Tokenizer.ADD_EOS)) is not None:
 1568            self.gguf_writer.add_add_eos_token(field.parts[-1].tolist()[0])
 1569
 1570    def _try_set_pooling_type(self) -> None:
 1571        # get pooling path
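             # modules.json follows the sentence-transformers layout: a JSON list of module descriptors; a Pooling
             # entry typically looks like (abridged): {"idx": 1, "name": "1", "path": "1_Pooling", "type": "sentence_transformers.models.Pooling"}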
 1572        pooling_path = None
 1573        module_path = self.dir_model / "modules.json"
 1574        if module_path.is_file():
 1575            with open(module_path, encoding="utf-8") as f:
 1576                modules = json.load(f)
 1577            for mod in modules:
 1578                if mod["type"] == "sentence_transformers.models.Pooling":
 1579                    pooling_path = mod["path"]
 1580                    break
 1581
 1582        # get pooling type
 1583        if pooling_path is not None:
 1584            with open(self.dir_model / pooling_path / "config.json", encoding="utf-8") as f:
 1585                pooling = json.load(f)
 1586            if pooling["pooling_mode_mean_tokens"]:
 1587                pooling_type = gguf.PoolingType.MEAN
 1588            elif pooling["pooling_mode_cls_token"]:
 1589                pooling_type = gguf.PoolingType.CLS
 1590            elif pooling["pooling_mode_lasttoken"]:
 1591                pooling_type = gguf.PoolingType.LAST
 1592            else:
 1593                raise NotImplementedError("Only MEAN, CLS, and LAST pooling types supported")
 1594            self.gguf_writer.add_pooling_type(pooling_type)
 1595
 1596    def _set_vocab_glmedge(self):
 1597        from transformers import AutoTokenizer
 1598        tokenizer = AutoTokenizer.from_pretrained(self.dir_model)
 1599        special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True)
 1600        tokens, toktypes, tokpre = self.get_vocab_base()
 1601        self.gguf_writer.add_tokenizer_model("gpt2")
 1602        self.gguf_writer.add_tokenizer_pre(tokpre)
 1603        self.gguf_writer.add_token_list(tokens)
 1604        self.gguf_writer.add_token_types(toktypes)
 1605        special_vocab._set_special_token("eos", tokenizer.get_added_vocab()["<|endoftext|>"])
 1606        special_vocab._set_special_token("eot", tokenizer.get_added_vocab()["<|user|>"])
 1607        special_vocab._set_special_token("unk", tokenizer.get_added_vocab()["<|endoftext|>"])
 1608        special_vocab._set_special_token("bos", tokenizer.get_added_vocab()["<|endoftext|>"])
 1609        special_vocab.add_to_gguf(self.gguf_writer)
 1610
 1611    def _set_vocab_interns1(self):
 1612        tokens: list[str] = []
 1613        toktypes: list[int] = []
 1614
 1615        from transformers import AutoTokenizer
 1616        tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True)
 1617        vocab = getattr(tokenizer, 'vocab', tokenizer.get_vocab())
 1618        vocab_size = self.hparams.get("vocab_size", len(vocab))
 1619        assert max(vocab.values()) < vocab_size
 1620
 1621        tokpre = self.get_vocab_base_pre(tokenizer)
 1622
 1623        reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in vocab.items()}
 1624        added_vocab = tokenizer.get_added_vocab()
 1625
 1626        added_tokens_decoder = tokenizer.added_tokens_decoder
 1627
 1628        for i in range(vocab_size):
 1629            if i not in reverse_vocab:
 1630                tokens.append(f"[PAD{i}]")
 1631                toktypes.append(gguf.TokenType.UNUSED)
 1632            else:
 1633                token: str = reverse_vocab[i]
 1634                if token in added_vocab:
 1635                    # The tokenizer in llama.cpp assumes the CONTROL and USER_DEFINED tokens are pre-normalized.
 1636                    # To avoid unexpected issues, we make sure to normalize non-normalized tokens
 1637                    if not added_tokens_decoder[i].normalized:
 1638                        previous_token = token
 1639                        token = tokenizer.decode(tokenizer.encode(token, add_special_tokens=False))
 1640                        if previous_token != token:
 1641                            logger.info(f"{repr(previous_token)} is encoded and decoded back to {repr(token)} using AutoTokenizer")
 1642
 1643                    if added_tokens_decoder[i].special or self.does_token_look_special(token):
 1644                        toktypes.append(gguf.TokenType.CONTROL)
 1645                    else:
 1646                        toktypes.append(gguf.TokenType.USER_DEFINED)
 1647                else:
 1648                    toktypes.append(gguf.TokenType.NORMAL)
 1649                tokens.append(token)
 1650
 1651        self.gguf_writer.add_tokenizer_model("gpt2")
 1652        self.gguf_writer.add_tokenizer_pre(tokpre)
 1653        self.gguf_writer.add_token_list(tokens)
 1654        self.gguf_writer.add_token_types(toktypes)
 1655
 1656        special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True)
 1657        special_vocab._set_special_token("bos", 151643)
 1658        special_vocab.add_to_gguf(self.gguf_writer)
 1659
 1660    def _set_vocab_mistral(self):
 1661        if not _mistral_common_installed:
 1662            raise ImportError(_mistral_import_error_msg)
 1663
 1664        vocab = MistralVocab(self.dir_model)
 1665        logger.info(
 1666            f"Converting tokenizer {vocab.tokenizer_type} of size {vocab.vocab_size}."
 1667        )
 1668
 1669        self.gguf_writer.add_tokenizer_model(vocab.gguf_tokenizer_model)
 1670
 1671        tokens = []
 1672        scores = []
 1673        toktypes = []
 1674
 1675        for text, score, toktype in vocab.all_tokens():
 1676            tokens.append(text)
 1677            scores.append(score)
 1678            toktypes.append(toktype)
 1679
 1680        assert len(tokens) == vocab.vocab_size, (
 1681            f"token count ({len(tokens)}) != vocab size ({vocab.vocab_size})"
 1682        )
 1683
 1684        if vocab.tokenizer_type == MistralTokenizerType.tekken:
 1685            self.gguf_writer.add_tokenizer_pre("tekken")
 1686            self.gguf_writer.add_token_merges(
 1687                vocab.extract_vocab_merges_from_model()
 1688            )
 1689
 1690        logger.info(
 1691            f"Setting bos, eos, unk and pad token IDs to {vocab.bos_id}, {vocab.eos_id}, {vocab.unk_id}, {vocab.pad_id}."
 1692        )
 1693
 1694        self.gguf_writer.add_bos_token_id(vocab.bos_id)
 1695        self.gguf_writer.add_eos_token_id(vocab.eos_id)
 1696        self.gguf_writer.add_unk_token_id(vocab.unk_id)
 1697        self.gguf_writer.add_pad_token_id(vocab.pad_id)
 1698
 1699        self.gguf_writer.add_token_list(tokens)
 1700        self.gguf_writer.add_token_scores(scores)
 1701        self.gguf_writer.add_token_types(toktypes)
 1702        self.gguf_writer.add_vocab_size(vocab.vocab_size)
 1703
 1704        self.gguf_writer.add_add_bos_token(True)
 1705        self.gguf_writer.add_add_eos_token(False)
 1706
 1707        local_template_file_path = self.dir_model / "chat_template.jinja"
 1708
 1709        if self.is_mistral_format and local_template_file_path.is_file():
 1710            # Ministral-3 and other new Mistral models come with chat templates.
 1711            # ref: https://huggingface.co/mistralai/Ministral-3-14B-Instruct-2512/tree/main
 1712            logger.info("Using an existing Mistral local chat template.")
 1713
 1714            with open(local_template_file_path, "r", encoding="utf-8") as f:
 1715                template = f.read()
 1716        elif not self.is_mistral_format or not self.disable_mistral_community_chat_template:
 1717            template_dir = Path(__file__).parent / "models/templates/"
 1718
 1719            # Only for the Mistral format, log that official tokenization and detokenization go through `mistral-common`.
 1720            if self.is_mistral_format:
 1721                logger.info(
 1722                    "Using a Mistral community chat template. These templates can be subject to errors in the first days or weeks after a release. "
 1723                    "Mistral recommends using `mistral-common` to perform tokenization and detokenization."
 1724                )
 1725            template = MistralModel.get_community_chat_template(vocab, template_dir, self.is_mistral_format)
 1726        else:
 1727            logger.info("Not using a Mistral local or community chat template. Make sure to perform tokenization and detokenization via `mistral-common`.")
 1728            template = None
 1729
 1730        if template is not None:
 1731            self.gguf_writer.add_chat_template(template)
 1732
 1733    def _set_vocab_plamo(self):
 1734        # PLaMo models use a custom tokenizer with a .jsonl file
 1735        tokenizer_jsonl_path = self.dir_model / "tokenizer.jsonl"
 1736        tokenizer_config_path = self.dir_model / "tokenizer_config.json"
 1737
 1738        if not tokenizer_jsonl_path.is_file():
 1739            raise FileNotFoundError(f"PLaMo tokenizer file not found: {tokenizer_jsonl_path}")
 1740
 1741        # Load tokenizer config
 1742        with open(tokenizer_config_path, "r", encoding="utf-8") as f:
 1743            tokenizer_config = json.load(f)
 1744
 1745        # Load tokens from the JSONL file (each line is a JSON list, not an object)
 1746        tokens = []
 1747        scores = []
 1748        toktypes = []
 1749
 1750        with open(tokenizer_jsonl_path, "r", encoding="utf-8") as f:
 1751            for line_num, line in enumerate(f):
 1752                if line.strip():
 1753                    token_data = json.loads(line)
 1754                    # Format: [token, score, type, ?, ?, ?, ?]
 1755                    token = token_data[0].encode("utf-8")
 1756                    score = float(token_data[1])
 1757                    token_type_str = token_data[2] if len(token_data) > 2 else "NORMAL"
 1758
 1759                    tokens.append(token)
 1760                    scores.append(score)
 1761
 1762                    if token_type_str == "UNKNOWN":
 1763                        toktypes.append(gguf.TokenType.UNKNOWN)
 1764                    elif token_type_str == "CONTROL":
 1765                        toktypes.append(gguf.TokenType.CONTROL)
 1766                    elif token_type_str == "BYTE":
 1767                        toktypes.append(gguf.TokenType.BYTE)
 1768                    else:
 1769                        token_str = token_data[0]
 1770                        if token_str.startswith("<|plamo:") and token_str.endswith("|>"):
 1771                            toktypes.append(gguf.TokenType.CONTROL)
 1772                        else:
 1773                            toktypes.append(gguf.TokenType.NORMAL)
 1774
 1775        vocab_size = self.hparams["vocab_size"]
 1776        if vocab_size > len(tokens):
 1777            pad_count = vocab_size - len(tokens)
 1778            logger.debug(f"Padding vocab with {pad_count} token(s) - [PAD1] through [PAD{pad_count}]")
 1779            for i in range(1, pad_count + 1):
 1780                tokens.append(bytes(f"[PAD{i}]", encoding="utf-8"))
 1781                scores.append(-1000.0)
 1782                toktypes.append(gguf.TokenType.UNUSED)
 1783
 1784        self.gguf_writer.add_tokenizer_model("plamo2")
 1785        self.gguf_writer.add_tokenizer_pre("default")
 1786        self.gguf_writer.add_token_list(tokens)
 1787        self.gguf_writer.add_token_scores(scores)
 1788        self.gguf_writer.add_token_types(toktypes)
 1789
 1790        if "bos_token" in tokenizer_config and tokenizer_config["bos_token"] is not None:
 1791            token_id = tokens.index(tokenizer_config["bos_token"].encode("utf-8"))
 1792            self.gguf_writer.add_bos_token_id(token_id)
 1793        if "eos_token" in tokenizer_config and tokenizer_config["eos_token"] is not None:
 1794            token_id = tokens.index(tokenizer_config["eos_token"].encode("utf-8"))
 1795            self.gguf_writer.add_eos_token_id(token_id)
 1796        if "pad_token" in tokenizer_config and tokenizer_config["pad_token"] is not None:
 1797            token_id = tokens.index(tokenizer_config["pad_token"].encode("utf-8"))
 1798            self.gguf_writer.add_pad_token_id(token_id)
 1799        if "sep_token" in tokenizer_config and tokenizer_config["sep_token"] is not None:
 1800            token_id = tokens.index(tokenizer_config["sep_token"].encode("utf-8"))
 1801            self.gguf_writer.add_sep_token_id(token_id)
 1802        if "unk_token" in tokenizer_config and tokenizer_config["unk_token"] is not None:
 1803            token_id = tokens.index(tokenizer_config["unk_token"].encode("utf-8"))
 1804            self.gguf_writer.add_unk_token_id(token_id)
 1805
 1806        # Add <|plamo:op|> as EOT to ensure appropriate end of generation
 1807        self.gguf_writer.add_eot_token_id(4)
 1808
 1809        self.gguf_writer.add_add_space_prefix(False)
 1810
 1811
 1812class MmprojModel(ModelBase):
 1813    model_type = ModelType.MMPROJ
 1814    model_arch = gguf.MODEL_ARCH.MMPROJ
 1815    preprocessor_config: dict[str, Any]
 1816    global_config: dict[str, Any]
 1817
 1818    n_block_keys = ["n_layers", "num_hidden_layers", "n_layer", "num_layers", "depth", "encoder_layers", "vt_num_hidden_layers"]
 1819
 1820    has_vision_encoder: bool = True # by default
 1821    has_audio_encoder: bool = False
 1822
 1823    # for models having multiple encoders, we need to separate their hparams
 1824    hparams_vision: dict[str, Any] | None = None
 1825    hparams_audio: dict[str, Any] | None = None
 1826
 1827    def __init__(self, *args, **kwargs):
 1828        super().__init__(*args, **kwargs)
 1829
 1830        if self.model_arch != gguf.MODEL_ARCH.MMPROJ:
 1831            raise TypeError("MmprojModel must be subclassed with model_arch = gguf.MODEL_ARCH.MMPROJ")
 1832
 1833        # get n_embd of the text model
 1834        if not self.is_mistral_format:
 1835            if "text_config" not in self.hparams:
 1836                self.hparams["text_config"] = {}
 1837            if "audio_config" not in self.hparams:
 1838                self.hparams["audio_config"] = {}
 1839            text_config = {**self.hparams, **self.hparams["text_config"]}
 1840            self.n_embd_text = text_config.get("hidden_size", text_config.get("n_embd", 0))
 1841        else:
 1842            text_config = {
 1843                k: v for k, v in self.hparams.items() if k not in ["vision_encoder", "audio_encoder"]
 1844            }
 1845            self.n_embd_text = text_config.get("hidden_dim", 0)
 1846
 1847        assert self.n_embd_text > 0, "n_embd not found in hparams"
 1848
 1849        # move vision config to the top level, while preserving the original hparams in global_config
 1850        import copy
 1851        self.global_config = copy.deepcopy(self.hparams)
 1852        self.hparams_vision = self.get_vision_config()
 1853        self.hparams_audio = self.get_audio_config()
 1854
 1855        if self.hparams_vision is None and self.hparams_audio is None:
 1856            raise ValueError("vision_config / audio_config not found in hparams")
 1857
 1858        # for compat with vision-only models
 1859        self.hparams = self.hparams_vision or self.hparams_audio or self.hparams
 1860
 1861        # TODO @ngxson : this is a hack to support both vision and audio encoders
 1862        have_multiple_encoders = self.has_audio_encoder and self.has_vision_encoder
 1863        self.block_count = 128 if have_multiple_encoders else self.find_hparam(self.n_block_keys, True)
 1864        self.tensor_map = gguf.get_tensor_name_map(gguf.MODEL_ARCH.MMPROJ, self.block_count)
 1865
 1866        # load preprocessor config
 1867        self.preprocessor_config = {}
 1868
 1869        # prefer preprocessor_config.json if possible
 1870        preprocessor_config_path = self.dir_model / "preprocessor_config.json"
 1871        if preprocessor_config_path.is_file():
 1872            with open(preprocessor_config_path, "r", encoding="utf-8") as f:
 1873                cfg = json.load(f)
 1874                # move media_proc_cfg to root level for compat
 1875                if "media_proc_cfg" in cfg:
 1876                    cfg = {
 1877                        **cfg,
 1878                        **cfg["media_proc_cfg"],
 1879                    }
 1880                # merge configs
 1881                self.preprocessor_config = {**self.preprocessor_config, **cfg}
 1882
 1883        # prefer processor_config.json if possible
 1884        processor_config_path = self.dir_model / "processor_config.json"
 1885        if processor_config_path.is_file():
 1886            with open(processor_config_path, "r", encoding="utf-8") as f:
 1887                cfg = json.load(f)
 1888                # move image_processor to root level for compat
 1889                if "image_processor" in cfg:
 1890                    cfg = {
 1891                        **cfg,
 1892                        **cfg["image_processor"],
 1893                    }
 1894                # merge configs
 1895                self.preprocessor_config = {**self.preprocessor_config, **cfg}
 1896
 1897    def get_vision_config(self) -> dict[str, Any] | None:
 1898        config_name = "vision_config" if not self.is_mistral_format else "vision_encoder"
 1899        return self.global_config.get(config_name)
 1900
 1901    def get_audio_config(self) -> dict[str, Any] | None:
 1902        mm_config_key = "whisper_config" if "whisper_config" in self.hparams else "audio_config"
 1903        return self.global_config.get(mm_config_key)
 1904
 1905    def set_type(self):
 1906        self.gguf_writer.add_type(gguf.GGUFType.MMPROJ)
 1907
 1908    def prepare_metadata(self, vocab_only: bool):
 1909        super().prepare_metadata(vocab_only=vocab_only)
 1910
 1911        output_type: str = self.ftype.name.partition("_")[2]
 1912
 1913        if self.fname_out.is_dir():
 1914            fname_default: str = gguf.naming_convention(self.metadata.name, self.metadata.basename, self.metadata.finetune, self.metadata.version, size_label=None, output_type=output_type, model_type=None)
 1915            self.fname_out = self.fname_out / f"mmproj-{fname_default}.gguf"
 1916        else:
 1917            self.fname_out = self.fname_out.parent / gguf.fill_templated_filename(self.fname_out.name, output_type)
 1918
 1919    def set_gguf_parameters(self):
 1920        self.gguf_writer.add_file_type(self.ftype)
 1921
 1922        if self.has_vision_encoder:
 1923            self.gguf_writer.add_clip_has_vision_encoder(True)
 1924            self.gguf_writer.add_vision_projection_dim(self.n_embd_text)
 1925
 1926            # vision config
 1927            self.image_size = self.find_vparam(["image_size"])
 1928            self.gguf_writer.add_vision_image_size(self.image_size)
 1929            self.gguf_writer.add_vision_patch_size(self.find_vparam(["patch_size"]))
 1930            self.gguf_writer.add_vision_embedding_length(self.find_vparam(["hidden_size", "vt_hidden_size"]))
 1931            self.gguf_writer.add_vision_feed_forward_length(self.find_vparam(["intermediate_size", "vt_intermediate_size"]))
 1932            self.gguf_writer.add_vision_block_count(self.find_vparam(self.n_block_keys))
 1933            self.gguf_writer.add_vision_head_count(self.find_vparam(["num_attention_heads", "num_heads", "vt_num_attention_heads"]))
 1934
 1935            # preprocessor config
 1936            image_mean = _MISTRAL_COMMON_DATASET_MEAN if self.is_mistral_format else self.preprocessor_config["image_mean"]
 1937            image_std = _MISTRAL_COMMON_DATASET_STD if self.is_mistral_format else self.preprocessor_config["image_std"]
 1938
 1939            self.gguf_writer.add_vision_image_mean(image_mean)
 1940            self.gguf_writer.add_vision_image_std(image_std)
 1941
 1942        if self.has_audio_encoder:
 1943            self.gguf_writer.add_clip_has_audio_encoder(True)
 1944            self.gguf_writer.add_audio_projection_dim(self.n_embd_text)
 1945
 1946            # audio config
 1947            self.gguf_writer.add_audio_embedding_length(self.find_aparam(["hidden_size"]))
 1948            self.gguf_writer.add_audio_feed_forward_length(self.find_aparam(["intermediate_size"]))
 1949            self.gguf_writer.add_audio_block_count(self.find_aparam(self.n_block_keys))
 1950            self.gguf_writer.add_audio_head_count(self.find_aparam(["num_attention_heads"]))
 1951
 1952        if not self.has_vision_encoder and not self.has_audio_encoder:
 1953            raise ValueError("MmprojModel must have either vision or audio encoder")
 1954
 1955    def write_vocab(self):
 1956        raise ValueError("MmprojModel does not support vocab writing")
 1957
 1958    def find_vparam(self, keys: Iterable[str], optional: bool = False) -> Any:
 1959        assert self.hparams_vision is not None
 1960        return self._find_param(self.hparams_vision, keys, optional)
 1961
 1962    def find_aparam(self, keys: Iterable[str], optional: bool = False) -> Any:
 1963        assert self.hparams_audio is not None
 1964        return self._find_param(self.hparams_audio, keys, optional)
 1965
 1966    def _find_param(self, obj: dict[str, Any], keys: Iterable[str], optional: bool = False) -> Any:
 1967        key = next((k for k in keys if k in obj), None)
 1968        if key is not None:
 1969            return obj[key]
 1970        if optional:
 1971            return None
 1972        raise KeyError(f"could not find any of: {keys}")
 1973
 1974    def tensor_force_quant(self, name, new_name, bid, n_dims):
 1975        del bid, name, n_dims  # unused
 1976        if ".patch_embd.weight" in new_name or ".patch_merger.weight" in new_name:
 1977            return gguf.GGMLQuantizationType.F16 if self.ftype == gguf.LlamaFileType.MOSTLY_F16 else gguf.GGMLQuantizationType.F32
 1978        return False
 1979
 1980
 1981@ModelBase.register("GPTNeoXForCausalLM")
 1982class GPTNeoXModel(TextModel):
 1983    model_arch = gguf.MODEL_ARCH.GPTNEOX
 1984
 1985    def set_gguf_parameters(self):
 1986        self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"])
 1987        self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
 1988        self.gguf_writer.add_block_count(self.block_count)
 1989        self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])
 1990        self.gguf_writer.add_rope_dimension_count(
 1991            int(self.hparams["rotary_pct"] * (self.hparams["hidden_size"] // self.hparams["num_attention_heads"])),
 1992        )
 1993        self.gguf_writer.add_head_count(self.hparams["num_attention_heads"])
 1994        self.gguf_writer.add_parallel_residual(self.hparams.get("use_parallel_residual", True))
 1995        self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_eps"])
 1996
 1997    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
 1998        n_head = self.hparams.get("n_head", self.hparams.get("num_attention_heads"))
 1999        n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed"))
 2000
 2001        if re.match(r"gpt_neox\.layers\.\d+\.attention\.query_key_value\.weight", name):
 2002            # Map bloom-style qkv_linear to gpt-style qkv_linear
 2003            # bloom: https://github.com/huggingface/transformers/blob/main/src/transformers/models/bloom/modeling_bloom.py#L238-L252  # noqa
 2004            # gpt-2: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt2/modeling_gpt2.py#L312  # noqa
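                 # i.e. the per-head interleaved [q | k | v] slices are regrouped below into one contiguous Q block,
                 # then one K block, then one V block, matching the gpt-style layout referenced above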
 2005            qkv_weights = data_torch.reshape((n_head, 3, n_embed // n_head, n_embed))
 2006            data_torch = torch.cat(
 2007                (
 2008                    qkv_weights[:, 0, :, :].reshape((-1, n_embed)),
 2009                    qkv_weights[:, 1, :, :].reshape((-1, n_embed)),
 2010                    qkv_weights[:, 2, :, :].reshape((-1, n_embed)),
 2011                ),
 2012                dim=0,
 2013            )
 2014            logger.info("re-format attention.linear_qkv.weight")
 2015        elif re.match(r"gpt_neox\.layers\.\d+\.attention\.query_key_value\.bias", name):
 2016            qkv_bias = data_torch.reshape((n_head, 3, n_embed // n_head))
 2017            data_torch = torch.cat(
 2018                (
 2019                    qkv_bias[:, 0, :].reshape((n_embed,)),
 2020                    qkv_bias[:, 1, :].reshape((n_embed,)),
 2021                    qkv_bias[:, 2, :].reshape((n_embed,)),
 2022                ),
 2023                dim=0,
 2024            )
 2025            logger.info("re-format attention.linear_qkv.bias")
 2026
 2027        yield from super().modify_tensors(data_torch, name, bid)
 2028
 2029
 2030@ModelBase.register("BloomForCausalLM", "BloomModel")
 2031class BloomModel(TextModel):
 2032    model_arch = gguf.MODEL_ARCH.BLOOM
 2033
 2034    def set_gguf_parameters(self):
 2035        n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed"))
 2036        n_head = self.hparams.get("n_head", self.hparams.get("num_attention_heads"))
 2037        self.gguf_writer.add_context_length(self.hparams.get("seq_length", n_embed))
 2038        self.gguf_writer.add_embedding_length(n_embed)
 2039        self.gguf_writer.add_feed_forward_length(4 * n_embed)
 2040        self.gguf_writer.add_block_count(self.block_count)
 2041        self.gguf_writer.add_head_count(n_head)
 2042        self.gguf_writer.add_head_count_kv(n_head)
 2043        self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"])
 2044        self.gguf_writer.add_file_type(self.ftype)
 2045
 2046    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
 2047        n_head = self.hparams.get("n_head", self.hparams.get("num_attention_heads"))
 2048        n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed"))
 2049
 2050        name = re.sub(r'transformer\.', '', name)
 2051
 2052        if re.match(r"h\.\d+\.self_attention\.query_key_value\.weight", name):
 2053            # Map bloom-style qkv_linear to gpt-style qkv_linear
 2054            # bloom: https://github.com/huggingface/transformers/blob/main/src/transformers/models/bloom/modeling_bloom.py#L238-L252  # noqa
 2055            # gpt-2: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt2/modeling_gpt2.py#L312  # noqa
 2056            qkv_weights = data_torch.reshape((n_head, 3, n_embed // n_head, n_embed))
 2057            data_torch = torch.cat(
 2058                (
 2059                    qkv_weights[:, 0, :, :].reshape((-1, n_embed)),
 2060                    qkv_weights[:, 1, :, :].reshape((-1, n_embed)),
 2061                    qkv_weights[:, 2, :, :].reshape((-1, n_embed)),
 2062                ),
 2063                dim=0,
 2064            )
 2065            logger.info("re-format attention.linear_qkv.weight")
 2066        elif re.match(r"h\.\d+\.self_attention\.query_key_value\.bias", name):
 2067            qkv_bias = data_torch.reshape((n_head, 3, n_embed // n_head))
 2068            data_torch = torch.cat(
 2069                (
 2070                    qkv_bias[:, 0, :].reshape((n_embed,)),
 2071                    qkv_bias[:, 1, :].reshape((n_embed,)),
 2072                    qkv_bias[:, 2, :].reshape((n_embed,)),
 2073                ),
 2074                dim=0,
 2075            )
 2076            logger.info("re-format attention.linear_qkv.bias")
 2077
 2078        yield from super().modify_tensors(data_torch, name, bid)
 2079
 2080
 2081@ModelBase.register("MPTForCausalLM")
 2082class MPTModel(TextModel):
 2083    model_arch = gguf.MODEL_ARCH.MPT
 2084
 2085    def set_vocab(self):
 2086        try:
 2087            self._set_vocab_gpt2()
 2088        except Exception:
 2089            # Fallback for SEA-LION model
 2090            self._set_vocab_sentencepiece()
 2091            self.gguf_writer.add_add_bos_token(False)
 2092            self.gguf_writer.add_pad_token_id(3)
 2093            self.gguf_writer.add_eos_token_id(1)
 2094            self.gguf_writer.add_unk_token_id(0)
 2095
 2096    def set_gguf_parameters(self):
 2097        self.gguf_writer.add_context_length(self.hparams["max_seq_len"])
 2098        self.gguf_writer.add_embedding_length(self.hparams["d_model"])
 2099        self.gguf_writer.add_block_count(self.block_count)
 2100        self.gguf_writer.add_feed_forward_length(4 * self.hparams["d_model"])
 2101        self.gguf_writer.add_head_count(self.hparams["n_heads"])
 2102        if kv_n_heads := self.hparams["attn_config"].get("kv_n_heads"):
 2103            self.gguf_writer.add_head_count_kv(kv_n_heads)
 2104        self.gguf_writer.add_layer_norm_eps(1e-5)
 2105        if self.hparams["attn_config"]["clip_qkv"] is not None:
 2106            self.gguf_writer.add_clamp_kqv(self.hparams["attn_config"]["clip_qkv"])
 2107        if self.hparams["attn_config"]["alibi"]:
 2108            self.gguf_writer.add_max_alibi_bias(self.hparams["attn_config"]["alibi_bias_max"])
 2109        else:
 2110            self.gguf_writer.add_max_alibi_bias(0.0)
 2111
 2112    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
 2113        if "scales" in name:
 2114            new_name = self.map_tensor_name(name, try_suffixes=(".weight", ".bias", ".scales"))
 2115            new_name = new_name.replace("scales", "act.scales")
 2116        else:
 2117            new_name = self.map_tensor_name(name, try_suffixes=(".weight", ".bias"))
 2118
 2119        yield from super().modify_tensors(data_torch, new_name, bid)
 2120
 2121
 2122@ModelBase.register("OrionForCausalLM")
 2123class OrionModel(TextModel):
 2124    model_arch = gguf.MODEL_ARCH.ORION
 2125
 2126    def set_vocab(self):
 2127        self._set_vocab_sentencepiece()
 2128
 2129    def set_gguf_parameters(self):
 2130        head_count = self.hparams["num_attention_heads"]
 2131        head_count_kv = self.hparams.get("num_key_value_heads", head_count)
 2132
 2133        ctx_length = 0
 2134        if "max_sequence_length" in self.hparams:
 2135            ctx_length = self.hparams["max_sequence_length"]
 2136        elif "max_position_embeddings" in self.hparams:
 2137            ctx_length = self.hparams["max_position_embeddings"]
 2138        elif "model_max_length" in self.hparams:
 2139            ctx_length = self.hparams["model_max_length"]
 2140        else:
 2141            raise ValueError("gguf: can not find ctx length parameter.")
 2142
 2143        self.gguf_writer.add_file_type(self.ftype)
 2144        self.gguf_writer.add_tensor_data_layout("Meta AI original pth")
 2145        self.gguf_writer.add_context_length(ctx_length)
 2146        self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
 2147        self.gguf_writer.add_block_count(self.block_count)
 2148        self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])
 2149        self.gguf_writer.add_head_count(head_count)
 2150        self.gguf_writer.add_head_count_kv(head_count_kv)
 2151        # note: config provides rms norm but it is actually layer norm
 2152        # ref:  https://huggingface.co/OrionStarAI/Orion-14B-Chat/blob/276a17221ce42beb45f66fac657a41540e71f4f5/modeling_orion.py#L570-L571
 2153        self.gguf_writer.add_layer_norm_eps(self.hparams["rms_norm_eps"])
 2154
 2155
 2156@ModelBase.register("BaichuanForCausalLM", "BaiChuanForCausalLM")
 2157class BaichuanModel(TextModel):
 2158    model_arch = gguf.MODEL_ARCH.BAICHUAN
 2159
 2160    def set_vocab(self):
 2161        self._set_vocab_sentencepiece()
 2162
 2163    def set_gguf_parameters(self):
 2164        super().set_gguf_parameters()
 2165
 2166        self.gguf_writer.add_tensor_data_layout("Meta AI original pth")
 2167        self.gguf_writer.add_rope_dimension_count(self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
 2168
 2169    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
 2170        head_count = self.hparams["num_attention_heads"]
 2171        head_count_kv = self.hparams.get("num_key_value_heads", head_count)
 2172
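             # W_pack is Baichuan's fused [Q; K; V] projection: it is split into thirds below, and the Q and K
             # parts are additionally un-permuted from the HF rotary layout via _reverse_hf_permute_part.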
 2173        if bid is not None and name == f"model.layers.{bid}.self_attn.W_pack.weight":
 2174            logger.info(f"Unpacking and permuting layer {bid}")
 2175            yield from [
 2176                (self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_Q, bid),
 2177                    self._reverse_hf_permute_part(data_torch, 0, head_count, head_count)),
 2178                (self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_K, bid),
 2179                    self._reverse_hf_permute_part(data_torch, 1, head_count, head_count_kv)),
 2180                (self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_V, bid),
 2181                    self._reverse_hf_part(data_torch, 2)),
 2182            ]
 2183        else:
 2184            yield from super().modify_tensors(data_torch, name, bid)
 2185
 2186    def _reverse_hf_permute(self, weights: Tensor, n_head: int, n_kv_head: int | None = None) -> Tensor:
 2187        if n_kv_head is not None and n_head != n_kv_head:
 2188            n_head //= n_kv_head
 2189
 2190        return (
 2191            weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
 2192            .swapaxes(1, 2)
 2193            .reshape(weights.shape)
 2194        )
 2195
 2196    def _reverse_hf_permute_part(
 2197        self, weights: Tensor, n_part: int, n_head: int, n_head_kv: int | None = None,
 2198    ) -> Tensor:
 2199        r = weights.shape[0] // 3
 2200        return self._reverse_hf_permute(weights[r * n_part:r * n_part + r, ...], n_head, n_head_kv)
 2201
 2202    def _reverse_hf_part(self, weights: Tensor, n_part: int) -> Tensor:
 2203        r = weights.shape[0] // 3
 2204        return weights[r * n_part:r * n_part + r, ...]
 2205
 2206
 2207@ModelBase.register("XverseForCausalLM")
 2208class XverseModel(TextModel):
 2209    model_arch = gguf.MODEL_ARCH.XVERSE
 2210
 2211    def set_vocab(self):
 2212        assert (self.dir_model / "tokenizer.json").is_file()
 2213        dir_model = self.dir_model
 2214        hparams = self.hparams
 2215
 2216        tokens: list[bytes] = []
 2217        toktypes: list[int] = []
 2218
 2219        from transformers import AutoTokenizer
 2220        tokenizer = AutoTokenizer.from_pretrained(dir_model)
 2221        vocab_size = hparams.get("vocab_size", len(tokenizer.vocab))
 2222        # Since we are checking the maximum index, we need to ensure it's strictly less than vocab_size,
 2223        # because vocab_size is the count of items, and indexes start at 0.
 2224        max_vocab_index = max(tokenizer.get_vocab().values())
 2225        if max_vocab_index >= vocab_size:
 2226            raise ValueError("Vocabulary size exceeds expected maximum size.")
 2227
 2228        reverse_vocab: dict[int, str] = {id_: encoded_tok for encoded_tok, id_ in tokenizer.vocab.items()}
 2229        added_vocab = tokenizer.get_added_vocab()
 2230
 2231        for token_id in range(vocab_size):
 2232            token_text = reverse_vocab[token_id].encode('utf-8')
 2233            # replace "\x00" with a string of length > 0
 2234            if token_text == b"\x00":
 2235                toktype = gguf.TokenType.BYTE  # special
 2236                token_text = f"<{token_text}>".encode('utf-8')
 2237            elif re.fullmatch(br"<0x[0-9A-Fa-f]{2}>", token_text):
 2238                toktype = gguf.TokenType.BYTE  # special
 2239            elif reverse_vocab[token_id] in added_vocab:
 2240                if tokenizer.added_tokens_decoder[token_id].special:
 2241                    toktype = gguf.TokenType.CONTROL
 2242                else:
 2243                    toktype = gguf.TokenType.USER_DEFINED
 2244            else:
 2245                toktype = gguf.TokenType.NORMAL
 2246
 2247            tokens.append(token_text)
 2248            toktypes.append(toktype)
 2249
 2250        self.gguf_writer.add_tokenizer_model("llama")
 2251        self.gguf_writer.add_tokenizer_pre("default")
 2252        self.gguf_writer.add_token_list(tokens)
 2253        self.gguf_writer.add_token_types(toktypes)
 2254
 2255        special_vocab = gguf.SpecialVocab(dir_model, n_vocab=len(tokens))
 2256        special_vocab.add_to_gguf(self.gguf_writer)
 2257
 2258    def set_gguf_parameters(self):
 2259        super().set_gguf_parameters()
 2260
 2261        self.gguf_writer.add_tensor_data_layout("Meta AI original pth")
 2262        self.gguf_writer.add_rope_dimension_count(self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
 2263
 2264    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
 2265        head_count = self.hparams["num_attention_heads"]
 2266        head_count_kv = self.hparams.get("num_key_value_heads", head_count)
 2267
 2268        # HF models permute some of the tensors, so we need to undo that
 2269        if name.endswith("q_proj.weight"):
 2270            data_torch = self._reverse_hf_permute(data_torch, head_count, head_count)
 2271        if name.endswith("k_proj.weight"):
 2272            data_torch = self._reverse_hf_permute(data_torch, head_count, head_count_kv)
 2273
 2274        yield from super().modify_tensors(data_torch, name, bid)
 2275
 2276    def _reverse_hf_permute(self, weights: Tensor, n_head: int, n_kv_head: int | None = None) -> Tensor:
 2277        if n_kv_head is not None and n_head != n_kv_head:
 2278            n_head //= n_kv_head
 2279
 2280        return (
 2281            weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
 2282            .swapaxes(1, 2)
 2283            .reshape(weights.shape)
 2284        )
 2285
 2286
 2287@ModelBase.register("FalconForCausalLM", "RWForCausalLM")
 2288class FalconModel(TextModel):
 2289    model_arch = gguf.MODEL_ARCH.FALCON
 2290
 2291    def set_gguf_parameters(self):
 2292        n_head = self.hparams.get("num_attention_heads")
 2293        if n_head is None:
 2294            n_head = self.hparams["n_head"]  # old name
 2295
 2296        n_head_kv = self.hparams.get("num_kv_heads")
 2297        if n_head_kv is None:
 2298            n_head_kv = self.hparams.get("n_head_kv", 1)  # old name
 2299
 2300        self.gguf_writer.add_context_length(2048)  # not in config.json
 2301        self.gguf_writer.add_tensor_data_layout("jploski")  # qkv tensor transform
 2302        self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
 2303        self.gguf_writer.add_feed_forward_length(4 * self.hparams["hidden_size"])
 2304        self.gguf_writer.add_block_count(self.block_count)
 2305        self.gguf_writer.add_head_count(n_head)
 2306        self.gguf_writer.add_head_count_kv(n_head_kv)
 2307        self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"])
 2308        self.gguf_writer.add_file_type(self.ftype)
 2309
 2310    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
 2311        # QKV tensor transform
 2312        # The original query_key_value tensor contains n_head_kv "kv groups",
 2313        # each consisting of n_head/n_head_kv query weights followed by one key
 2314        # and one value weight (shared by all query heads in the kv group).
 2315        # This layout makes it a big pain to work with in GGML.
 2316        # So we rearrange them here, so that we have n_head query weights
 2317        # followed by n_head_kv key weights followed by n_head_kv value weights,
 2318        # in contiguous fashion.
 2319        # ref: https://github.com/jploski/ggml/blob/falcon40b/examples/falcon/convert-hf-to-ggml.py
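             # As a hypothetical illustration with n_head = 4 and n_head_kv = 2, the per-group layout
             # [q q k v | q q k v] is rearranged into [q q q q | k k | v v].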
 2320
 2321        if "query_key_value" in name:
 2322            n_head = self.find_hparam(["num_attention_heads", "n_head"])
 2323            n_head_kv = self.find_hparam(["num_kv_heads", "n_head_kv"], optional=True) or 1
 2324            head_dim = self.hparams["hidden_size"] // n_head
 2325
 2326            qkv = data_torch.view(n_head_kv, n_head // n_head_kv + 2, head_dim, head_dim * n_head)
 2327            q = qkv[:, :-2].reshape(n_head * head_dim, head_dim * n_head)
 2328            k = qkv[:, [-2]].reshape(n_head_kv * head_dim, head_dim * n_head)
 2329            v = qkv[:, [-1]].reshape(n_head_kv * head_dim, head_dim * n_head)
 2330            data_torch = torch.cat((q, k, v)).reshape_as(data_torch)
 2331
 2332        yield from super().modify_tensors(data_torch, name, bid)
 2333
 2334
 2335@ModelBase.register("GPTBigCodeForCausalLM")
 2336class StarCoderModel(TextModel):
 2337    model_arch = gguf.MODEL_ARCH.STARCODER
 2338
 2339    def set_gguf_parameters(self):
 2340        self.gguf_writer.add_context_length(self.hparams["n_positions"])
 2341        self.gguf_writer.add_embedding_length(self.hparams["n_embd"])
 2342        self.gguf_writer.add_feed_forward_length(4 * self.hparams["n_embd"])
 2343        self.gguf_writer.add_block_count(self.block_count)
 2344        self.gguf_writer.add_head_count(self.hparams["n_head"])
 2345        self.gguf_writer.add_head_count_kv(1)
 2346        self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"])
 2347        self.gguf_writer.add_file_type(self.ftype)
 2348
 2349
 2350@ModelBase.register("GPTRefactForCausalLM")
 2351class RefactModel(TextModel):
 2352    model_arch = gguf.MODEL_ARCH.REFACT
 2353
 2354    def set_vocab(self):
 2355        super().set_vocab()
 2356
 2357        # TODO: how to determine special FIM tokens automatically?
 2358        special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False,
 2359                                          special_token_types = ['prefix', 'suffix', 'middle', 'eot'])
 2360        special_vocab._set_special_token("prefix", 1)
 2361        special_vocab._set_special_token("suffix", 3)
 2362        special_vocab._set_special_token("middle", 2)
 2363        special_vocab.chat_template = None  # do not add it twice
 2364        special_vocab.add_to_gguf(self.gguf_writer)
 2365
 2366    def set_gguf_parameters(self):
 2367        hidden_dim = self.hparams["n_embd"]
 2368        inner_dim = 4 * hidden_dim
 2369        hidden_dim = int(2 * inner_dim / 3)
 2370        multiple_of = 256
 2371        ff_dim = multiple_of * ((hidden_dim + multiple_of - 1) // multiple_of)
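             # e.g. (illustrative numbers only): n_embd=4096 -> inner_dim=16384,
             # hidden_dim=int(2*16384/3)=10922, ff_dim=43*256=11008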
 2372
 2373        # Refact uses ALiBi, so this value comes straight from config.json and may reflect the training context length
 2374        self.gguf_writer.add_context_length(self.hparams["n_positions"])
 2375        self.gguf_writer.add_embedding_length(self.hparams["n_embd"])
 2376
 2377        self.gguf_writer.add_feed_forward_length(ff_dim)
 2378        self.gguf_writer.add_block_count(self.block_count)
 2379        self.gguf_writer.add_head_count(self.hparams["n_head"])
 2380        self.gguf_writer.add_head_count_kv(1)
 2381        self.gguf_writer.add_layer_norm_rms_eps(self.hparams["layer_norm_epsilon"])
 2382        self.gguf_writer.add_file_type(self.ftype)
 2383
 2384    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
 2385        hidden_dim = self.hparams["n_embd"]
 2386        inner_dim = 4 * hidden_dim
 2387        hidden_dim = int(2 * inner_dim / 3)
 2388        multiple_of = 256
 2389        ff_dim = multiple_of * ((hidden_dim + multiple_of - 1) // multiple_of)
 2390        n_head = self.hparams["n_head"]
 2391        n_head_kv = 1
 2392        head_dim = self.hparams["n_embd"] // n_head
 2393
 2394        if bid is not None:
 2395            if name == f"transformer.h.{bid}.attn.kv.weight":
 2396                yield from super().modify_tensors(data_torch[:n_head_kv * head_dim], self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_K, bid), bid)
 2397                yield from super().modify_tensors(data_torch[n_head_kv * head_dim:], self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_V, bid), bid)
 2398                return
 2399            if name == f"transformer.h.{bid}.attn.q.weight":
 2400                yield from super().modify_tensors(data_torch, self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_Q, bid), bid)
 2401                return
 2402            if name == f"transformer.h.{bid}.mlp.gate_up_proj.weight":
 2403                yield from super().modify_tensors(data_torch[:ff_dim], self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE, bid), bid)
 2404                yield from super().modify_tensors(data_torch[ff_dim:], self.format_tensor_name(gguf.MODEL_TENSOR.FFN_UP, bid), bid)
 2405                return
 2406
 2407        yield from super().modify_tensors(data_torch, name, bid)
 2408
 2409
 2410@ModelBase.register("StableLmForCausalLM", "StableLMEpochForCausalLM", "LlavaStableLMEpochForCausalLM")
 2411class StableLMModel(TextModel):
 2412    model_arch = gguf.MODEL_ARCH.STABLELM
 2413
 2414    def set_vocab(self):
 2415        if (self.dir_model / "tokenizer.json").is_file():
 2416            self._set_vocab_gpt2()
 2417        else:
 2418            # StableLM 2 1.6B used to have a vocab in a similar format to Qwen's vocab
 2419            self._set_vocab_qwen()
 2420
 2421    def set_gguf_parameters(self):
 2422        hparams = self.hparams
 2423
 2424        self.gguf_writer.add_context_length(hparams["max_position_embeddings"])
 2425        self.gguf_writer.add_embedding_length(hparams["hidden_size"])
 2426        self.gguf_writer.add_block_count(self.block_count)
 2427        self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
 2428        rotary_factor = self.find_hparam(["partial_rotary_factor", "rope_pct"])
 2429        self.gguf_writer.add_rope_dimension_count(int(rotary_factor * (hparams["hidden_size"] // hparams["num_attention_heads"])))
 2430        self.gguf_writer.add_head_count(hparams["num_attention_heads"])
 2431        self.gguf_writer.add_head_count_kv(hparams["num_key_value_heads"])
 2432        self.gguf_writer.add_parallel_residual(hparams["use_parallel_residual"] if "use_parallel_residual" in hparams else True)
 2433        self.gguf_writer.add_layer_norm_eps(self.find_hparam(["layer_norm_eps", "norm_eps"]))
 2434        self.gguf_writer.add_file_type(self.ftype)
 2435
 2436    _q_norms: list[dict[str, Tensor]] | None = None
 2437    _k_norms: list[dict[str, Tensor]] | None = None
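         # some StableLM variants apply a separate LayerNorm per attention head to q and k;
         # the per-head weights are buffered here and stacked into one tensor per block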
 2438
 2439    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
 2440        n_head = self.hparams["num_attention_heads"]
 2441        n_kv_head = self.hparams["num_key_value_heads"]
 2442
 2443        if name.find("q_layernorm.norms") != -1:
 2444            assert bid is not None
 2445
 2446            if self._q_norms is None:
 2447                self._q_norms = [{} for _ in range(self.block_count)]
 2448
 2449            self._q_norms[bid][name] = data_torch
 2450
 2451            if len(self._q_norms[bid]) >= n_head:
 2452                yield from self._stack_qk_norm(bid, n_head, self._q_norms[bid], "q_layernorm")
 2453            # done with this tensor either way; do not fall through to the generic path
 2454            return
 2455
 2456        if name.find("k_layernorm.norms") != -1:
 2457            assert bid is not None
 2458
 2459            if self._k_norms is None:
 2460                self._k_norms = [{} for _ in range(self.block_count)]
 2461
 2462            self._k_norms[bid][name] = data_torch
 2463
 2464            if len(self._k_norms[bid]) >= n_kv_head:
 2465                yield from self._stack_qk_norm(bid, n_kv_head, self._k_norms[bid], "k_layernorm")
 2466            # done with this tensor either way; do not fall through to the generic path
 2467            return
 2468
 2469        yield from super().modify_tensors(data_torch, name, bid)
 2470
 2471    def _stack_qk_norm(self, bid: int, n_head: int, norms: dict[str, Tensor], layer_name: str = "q_layernorm") -> Iterable[tuple[str, Tensor]]:
 2472        datas: list[Tensor] = []
 2473        # extract the norms in order
 2474        for xid in range(n_head):
 2475            ename = f"model.layers.{bid}.self_attn.{layer_name}.norms.{xid}.weight"
 2476            datas.append(norms[ename])
 2477            del norms[ename]
 2478        data_torch = torch.stack(datas, dim=0)
 2479
 2480        merged_name = f"model.layers.{bid}.self_attn.{layer_name}.weight"
 2481
 2482        yield from super().modify_tensors(data_torch, merged_name, bid)
 2483
 2484    def prepare_tensors(self):
 2485        super().prepare_tensors()
 2486
 2487        if self._q_norms is not None or self._k_norms is not None:
 2488            # flatten two `list[dict[str, Tensor]]` into a single `list[str]`
 2489            norms = (
 2490                [k for d in self._q_norms for k in d.keys()] if self._q_norms is not None else []
 2491            ) + (
 2492                [k for d in self._k_norms for k in d.keys()] if self._k_norms is not None else []
 2493            )
 2494            if len(norms) > 0:
 2495                raise ValueError(f"Unprocessed norms: {norms}")
 2496
 2497
 2498@ModelBase.register(
 2499    "LLaMAForCausalLM",
 2500    "LlamaForCausalLM",
 2501    "MistralForCausalLM",
 2502    "MixtralForCausalLM",
 2503    "VLlama3ForCausalLM",
 2504    "LlavaForConditionalGeneration",
 2505    "VoxtralForConditionalGeneration",
 2506    "IQuestCoderForCausalLM",
 2507    "LlamaModel")
 2508class LlamaModel(TextModel):
 2509    model_arch = gguf.MODEL_ARCH.LLAMA
 2510    undo_permute = True
 2511
 2512    def __init__(self, *args, **kwargs):
 2513        super().__init__(*args, **kwargs)
 2514        # fix for SmolVLM2, missing `num_attention_heads` in config.json
 2515        if self.hf_arch == "VLlama3ForCausalLM":
 2516            self.hparams["num_attention_heads"] = self.hparams.get("num_attention_heads", 32)
 2517        hparams = ModelBase.load_hparams(self.dir_model, is_mistral_format=False)
 2518        self.origin_hf_arch = hparams.get('architectures', [None])[0]
 2519
 2520    def set_vocab(self):
 2521        if self.origin_hf_arch == "GlmasrModel":
 2522            return self._set_vocab_glmedge()
 2523
 2524        if self.is_mistral_format:
 2525            return self._set_vocab_mistral()
 2526
 2527        path_tekken_json = self.dir_model / "tekken.json"
 2528        path_tokenizer_json = self.dir_model / "tokenizer.json"
 2529        if path_tekken_json.is_file() and not path_tokenizer_json.is_file():
 2530            return self._set_vocab_mistral()
 2531
 2532        try:
 2533            self._set_vocab_sentencepiece()
 2534        except FileNotFoundError:
 2535            try:
 2536                self._set_vocab_llama_hf()
 2537            except (FileNotFoundError, TypeError):
 2538                # Llama 3
 2539                self._set_vocab_gpt2()
 2540
 2541        # Apply to CodeLlama only (and ignore for Llama 3 with a vocab size of 128256)
 2542        if self.hparams.get("vocab_size", 32000) == 32016:
 2543            special_vocab = gguf.SpecialVocab(
 2544                self.dir_model, load_merges=False,
 2545                special_token_types = ['prefix', 'suffix', 'middle', 'eot']
 2546            )
 2547            special_vocab._set_special_token("prefix", 32007)
 2548            special_vocab._set_special_token("suffix", 32008)
 2549            special_vocab._set_special_token("middle", 32009)
 2550            special_vocab._set_special_token("eot",    32010)
 2551            special_vocab.add_to_gguf(self.gguf_writer)
 2552
 2553        tokenizer_config_file = self.dir_model / 'tokenizer_config.json'
 2554        if tokenizer_config_file.is_file():
 2555            with open(tokenizer_config_file, "r", encoding="utf-8") as f:
 2556                tokenizer_config_json = json.load(f)
 2557                if "add_prefix_space" in tokenizer_config_json:
 2558                    self.gguf_writer.add_add_space_prefix(tokenizer_config_json["add_prefix_space"])
 2559
 2560        # Apply to granite small models only
 2561        if self.hparams.get("vocab_size", 32000) == 49152:
 2562            self.gguf_writer.add_add_bos_token(False)
 2563
 2564    def set_gguf_parameters(self):
 2565        super().set_gguf_parameters()
 2566        hparams = self.hparams
 2567
 2568        if not self.is_mistral_format:
 2569            self.gguf_writer.add_vocab_size(hparams["vocab_size"])
 2570
 2571        if (rope_dim := hparams.get("head_dim")) is None:
 2572            rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]
 2573        self.gguf_writer.add_rope_dimension_count(rope_dim)
 2574
 2575    @staticmethod
 2576    def permute(weights: Tensor, n_head: int, n_head_kv: int | None):
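             # HF checkpoints store q/k with the rotary dimensions split into two contiguous
             # halves (rotate_half-style RoPE); llama.cpp applies RoPE to interleaved pairs,
             # so the per-head rows are re-interleaved here via reshape/swapaxes.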
 2577        if n_head_kv is not None and n_head != n_head_kv:
 2578            n_head = n_head_kv
 2579        return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
 2580                .swapaxes(1, 2)
 2581                .reshape(weights.shape))
 2582
 2583    _experts: list[dict[str, Tensor]] | None = None
 2584
 2585    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
 2586        n_head = self.find_hparam(["n_heads", "num_attention_heads"])
 2587        n_kv_head = self.find_hparam(["n_kv_heads", "num_key_value_heads"])
 2588
 2589        vision_prefixes = [
 2590            "vision_encoder.",
 2591            "vision_language_adapter.",
 2592            "patch_merger.",
 2593            "pre_mm_projector_norm",
 2594            "audio_encoder.",
 2595        ]
 2596
 2597        is_multimodal_tensor = "vision_tower" in name \
 2598            or "vision_model" in name \
 2599            or "audio_tower" in name \
 2600            or "model.connector" in name \
 2601            or "multi_modal_projector" in name \
 2602            or any(
 2603                name.startswith(prefix)
 2604                for prefix in vision_prefixes
 2605            )
 2606
 2607        if is_multimodal_tensor:
 2608            return  # skip vision tensors
 2609        elif self.hf_arch == "LlamaModel":
 2610            name = "model." + name
 2611        elif name.startswith("model.text_model"):
 2612            name = name.replace("text_model.", "") # for SmolVLM
 2613        elif name.startswith("language_model."):
 2614            name = name.replace("language_model.", "") # for the rest
 2615
 2616        if self.undo_permute:
 2617            if name.endswith(("q_proj.weight", "q_proj.bias")):
 2618                data_torch = LlamaModel.permute(data_torch, n_head, n_head)
 2619            if name.endswith(("k_proj.weight", "k_proj.bias")):
 2620                data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head)
 2621
 2622        # process the experts separately
 2623        if name.find("block_sparse_moe.experts") != -1:
 2624            n_experts = self.hparams["num_local_experts"]
 2625
 2626            assert bid is not None
 2627
 2628            if self._experts is None:
 2629                self._experts = [{} for _ in range(self.block_count)]
 2630
 2631            self._experts[bid][name] = data_torch
 2632
 2633            if len(self._experts[bid]) >= n_experts * 3:
 2634                # merge the experts into a single 3d tensor
 2635                for wid in ["w1", "w2", "w3"]:
 2636                    datas: list[Tensor] = []
 2637
 2638                    for xid in range(n_experts):
 2639                        ename = f"model.layers.{bid}.block_sparse_moe.experts.{xid}.{wid}.weight"
 2640                        datas.append(self._experts[bid][ename])
 2641                        del self._experts[bid][ename]
 2642
 2643                    data_torch = torch.stack(datas, dim=0)
 2644
 2645                    merged_name = f"layers.{bid}.feed_forward.experts.{wid}.weight"
 2646
 2647                    yield from super().modify_tensors(data_torch, merged_name, bid)
 2648                return
 2649            else:
 2650                return
 2651
 2652        yield from super().modify_tensors(data_torch, name, bid)
 2653
 2654    def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
 2655        if rope_params := self.rope_parameters.get("full_attention", self.rope_parameters):
 2656            if rope_params.get("rope_type", '').lower() == "llama3":
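                     # llama3-style rope scaling: emit one scaling factor per rotary frequency;
                     # wavelengths below high_freq_wavelen keep factor 1, wavelengths above
                     # low_freq_wavelen get the full `factor`, and the band in between is
                     # smoothly interpolated (see the loop below)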
 2657                base = rope_params.get("rope_theta", 10000.0)
 2658                if (dim := self.hparams.get("head_dim")) is None:
 2659                    dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
 2660                freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))
 2661
 2662                factor = rope_params.get("factor", 8.0)
 2663                low_freq_factor = rope_params.get("low_freq_factor", 1.0)
 2664                high_freq_factor = rope_params.get("high_freq_factor", 4.0)
 2665                old_context_len = self.hparams.get("original_max_position_embeddings", 8192)
 2666
 2667                low_freq_wavelen = old_context_len / low_freq_factor
 2668                high_freq_wavelen = old_context_len / high_freq_factor
 2669                # assert low_freq_wavelen != high_freq_wavelen # Errors for Llama4
 2670
 2671                rope_factors = []
 2672                for freq in freqs:
 2673                    wavelen = 2 * math.pi / freq
 2674                    if wavelen < high_freq_wavelen:
 2675                        rope_factors.append(1)
 2676                    elif wavelen > low_freq_wavelen:
 2677                        rope_factors.append(factor)
 2678                    else:
 2679                        smooth = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor)
 2680                        rope_factors.append(1 / ((1 - smooth) / factor + smooth))
 2681
 2682                yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), torch.tensor(rope_factors, dtype=torch.float32))
 2683
 2684    def prepare_tensors(self):
 2685        super().prepare_tensors()
 2686
 2687        if self._experts is not None:
 2688            # flatten `list[dict[str, Tensor]]` into `list[str]`
 2689            experts = [k for d in self._experts for k in d.keys()]
 2690            if len(experts) > 0:
 2691                raise ValueError(f"Unprocessed experts: {experts}")
 2692
 2693
 2694@ModelBase.register("ArceeForCausalLM")
 2695class ArceeModel(LlamaModel):
 2696    model_arch = gguf.MODEL_ARCH.ARCEE
 2697
 2698    def set_gguf_parameters(self):
 2699        super().set_gguf_parameters()
 2700        self._try_set_pooling_type()
 2701
 2702
 2703@ModelBase.register("AfmoeForCausalLM")
 2704class AfmoeModel(LlamaModel):
 2705    model_arch = gguf.MODEL_ARCH.AFMOE
 2706
 2707    def set_gguf_parameters(self):
 2708        super().set_gguf_parameters()
 2709
 2710        # MoE parameters
 2711        if (n_experts := self.hparams.get("num_experts")) is not None:
 2712            self.gguf_writer.add_expert_count(n_experts)
 2713        if (n_shared_experts := self.hparams.get("num_shared_experts")) is not None:
 2714            self.gguf_writer.add_expert_shared_count(n_shared_experts)
 2715        if (moe_intermediate_size := self.hparams.get("moe_intermediate_size")) is not None:
 2716            self.gguf_writer.add_expert_feed_forward_length(moe_intermediate_size)
 2717        if (n_dense_layers := self.hparams.get("num_dense_layers")) is not None:
 2718            self.gguf_writer.add_leading_dense_block_count(n_dense_layers)
 2719
 2720        # Route normalization and scaling
 2721        if (route_norm := self.hparams.get("route_norm")) is not None:
 2722            self.gguf_writer.add_expert_weights_norm(route_norm)
 2723        if (route_scale := self.hparams.get("route_scale")) is not None:
 2724            self.gguf_writer.add_expert_weights_scale(route_scale)
 2725
 2726        # Sliding window attention
 2727        if (sliding_window := self.hparams.get("sliding_window")) is not None:
 2728            self.gguf_writer.add_sliding_window(sliding_window)
 2729
 2730    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
 2731        # expert weights are stored per-expert in the HF checkpoint;
 2732        # accumulate them here and merge each projection into a single 3d tensor
 2733        if name.find("mlp.experts") != -1:
 2734            n_experts = self.hparams["num_experts"]
 2735            assert bid is not None
 2736
 2737            if self._experts is None:
 2738                self._experts = [{} for _ in range(self.block_count)]
 2739
 2740            self._experts[bid][name] = data_torch
 2741
 2742            if len(self._experts[bid]) >= n_experts * 3:
 2743                # merge the experts into a single 3d tensor
 2744                for w_name in ["gate_proj", "up_proj", "down_proj"]:
 2745                    datas: list[Tensor] = []
 2746
 2747                    for xid in range(n_experts):
 2748                        ename_to_retrieve = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight"
 2749                        datas.append(self._experts[bid][ename_to_retrieve])
 2750                        del self._experts[bid][ename_to_retrieve]
 2751
 2752                    data_torch = torch.stack(datas, dim=0)
 2753                    merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight"
 2754                    yield from ModelBase.modify_tensors(self, data_torch, merged_name, bid)
 2755
 2756                return
 2757            else:
 2758                return
 2759
 2760        if name.endswith(".expert_bias"):
 2761            name = name.replace(".expert_bias", ".expert_bias.bias")
 2762
 2763        yield from ModelBase.modify_tensors(self, data_torch, name, bid)
 2764
 2765
 2766@ModelBase.register(
 2767    "LlavaForConditionalGeneration", # pixtral
 2768    "Mistral3ForConditionalGeneration", # mistral small 3.1
 2769)
 2770class LlavaVisionModel(MmprojModel):
 2771    img_break_tok_id = -1
 2772    use_break_tok = True
 2773
 2774    def __init__(self, *args, **kwargs):
 2775        super().__init__(*args, **kwargs)
 2776        if self.hparams.get("model_type") == "pixtral":
 2777            # layer_norm_eps is not in config.json, it is hard-coded in modeling_pixtral.py
 2778            self.hparams["layer_norm_eps"] = self.hparams.get("layer_norm_eps", 1e-5)
 2779            if self.use_break_tok:
 2780                self.img_break_tok_id = self.get_token_id("[IMG_BREAK]")
 2781        elif self.is_mistral_format:
 2782            # hparams is already vision config here so norm_eps is only defined in global_config.
 2783            self.hparams["norm_eps"] = self.global_config.get("norm_eps", None)
 2784            assert self.hparams["norm_eps"] is not None, "norm_eps not found in params.json"
 2785            if self.use_break_tok:
 2786                self.img_break_tok_id = self.find_vparam(["image_break_token_id"])
 2787        else:
 2788            raise ValueError(f"Unsupported model type: {self.hparams['model_type']}")
 2789        logger.info(f"Image break token id: {self.img_break_tok_id}")
 2790
 2791    def get_token_id(self, token: str) -> int:
 2792        tokenizer_config_file = self.dir_model / 'tokenizer_config.json'
 2793        with open(tokenizer_config_file, "r", encoding="utf-8") as f:
 2794            added_tokens_decoder = json.load(f)['added_tokens_decoder']
 2795            for id_, token_data in added_tokens_decoder.items():
 2796                if token_data["content"] == token:
 2797                    return int(id_)
 2798        raise ValueError(f"Token '{token}' not found in tokenizer config.")
 2799
 2800    def set_gguf_parameters(self):
 2801        super().set_gguf_parameters()
 2802        hparams = self.hparams
 2803        if hparams.get("model_type") == "pixtral":
 2804            self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.PIXTRAL)
 2805            self.gguf_writer.add_vision_attention_layernorm_eps(hparams["layer_norm_eps"])
 2806
 2807            # hidden_act
 2808            if hparams["hidden_act"] == "silu":
 2809                self.gguf_writer.add_vision_use_silu(True)
 2810            elif hparams["hidden_act"] == "gelu":
 2811                self.gguf_writer.add_vision_use_gelu(True)
 2812            else:
 2813                raise ValueError(f"Unsupported hidden_act: {hparams['hidden_act']}")
 2814
 2815            # spatial_merge_size
 2816            if "spatial_merge_size" in self.global_config:
 2817                self.gguf_writer.add_vision_spatial_merge_size(self.global_config["spatial_merge_size"])
 2818
 2819    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
 2820        n_head = (
 2821            self.hparams["num_attention_heads"] if not self.is_mistral_format else self.find_vparam(["num_attention_heads"])
 2822        )
 2823        n_kv_head = n_head
 2824
 2825        valid_prefixes = (
 2826            "multi_modal_projector.",
 2827            "vision_tower.",
 2828            "vision_encoder.",
 2829            "vision_language_adapter.",
 2830            "patch_merger.",
 2831            "pre_mm_projector_norm",
 2832        )
 2833
 2834        if any(name.startswith(prefix) for prefix in valid_prefixes):
 2835            # process vision tensors
 2836            if name.endswith(("q_proj.weight", "q_proj.bias")) and not self.is_mistral_format:
 2837                data_torch = LlamaModel.permute(data_torch, n_head, n_head)
 2838            if name.endswith(("k_proj.weight", "k_proj.bias")) and not self.is_mistral_format:
 2839                data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head)
 2840            yield from super().modify_tensors(data_torch, name, bid)
 2841            return
 2842
 2843        embed_key = "embed_tokens.weight" if not self.is_mistral_format else "tok_embeddings.weight"
 2844        if self.img_break_tok_id > 0 and embed_key in name:
 2845            logger.info(f"Extracting [IMG_BREAK] token embedding from {name}")
 2846            # for pixtral model, we need to extract the [IMG_BREAK] token embedding
 2847            img_break_embd = data_torch[self.img_break_tok_id]
 2848            name = gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_TOK_EMBD_IMG_BREAK]
 2849            yield from super().modify_tensors(img_break_embd, name, bid)
 2850
 2851        return # skip other tensors
 2852
 2853
 2854@ModelBase.register("Idefics3ForConditionalGeneration", "SmolVLMForConditionalGeneration")
 2855class SmolVLMModel(MmprojModel):
 2856    def __init__(self, *args, **kwargs):
 2857        super().__init__(*args, **kwargs)
 2858        if self.hparams["model_type"] == "smolvlm_vision":
 2859            # fix for SmolVLM2, missing some keys in config.json
 2860            # default values are taken from transformers code
 2861            self.hparams["hidden_size"] = self.hparams.get("hidden_size", 1152)
 2862            self.hparams["num_attention_heads"] = self.hparams.get("num_attention_heads", 16)
 2863            self.hparams["intermediate_size"] = self.hparams.get("intermediate_size", 3072)
 2864
 2865    def set_gguf_parameters(self):
 2866        super().set_gguf_parameters()
 2867        self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.IDEFICS3)
 2868        self.gguf_writer.add_vision_attention_layernorm_eps(self.hparams.get("layer_norm_eps", 1e-5))
 2869        self.gguf_writer.add_vision_projector_scale_factor(self.global_config.get("scale_factor", 2))
 2870        self.gguf_writer.add_vision_use_gelu(True)
 2871
 2872        # Add the preprocessor longest edge size
 2873        preproc_image_size = self.preprocessor_config.get("size", {}).get("longest_edge", self.image_size)
 2874        self.gguf_writer.add_vision_preproc_image_size(preproc_image_size)
 2875
 2876    def tensor_force_quant(self, name, new_name, bid, n_dims):
 2877        if ".embeddings." in name:
 2878            return gguf.GGMLQuantizationType.F32
 2879        return super().tensor_force_quant(name, new_name, bid, n_dims)
 2880
 2881    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
 2882        is_vision_tensor = "vision_tower" in name or "vision_model" in name or "model.connector" in name
 2883
 2884        if is_vision_tensor:
 2885            yield from super().modify_tensors(data_torch, name, bid)
 2886
 2887        return # skip other tensors
 2888
 2889
 2890@ModelBase.register(
 2891    "Llama4ForConditionalGeneration",
 2892    "Llama4ForCausalLM",
 2893)
 2894class Llama4Model(LlamaModel):
 2895    model_arch = gguf.MODEL_ARCH.LLAMA4
 2896    undo_permute = False
 2897
 2898    def __init__(self, *args, **kwargs):
 2899        super().__init__(*args, **kwargs)
 2900        # IMPORTANT: the normal "intermediate_size" is renamed to "intermediate_size_mlp"; we need to undo this
 2901        self.hparams["intermediate_size_moe"] = self.hparams["intermediate_size"]
 2902        self.hparams["intermediate_size"] = self.hparams["intermediate_size_mlp"]
 2903
 2904    def set_vocab(self):
 2905        self._set_vocab_gpt2()
 2906
 2907    def set_gguf_parameters(self):
 2908        super().set_gguf_parameters()
 2909        self.gguf_writer.add_interleave_moe_layer_step(self.hparams["interleave_moe_layer_step"])
 2910        self.gguf_writer.add_expert_feed_forward_length(self.hparams["intermediate_size_moe"])
 2911        if "layer_types" in self.hparams:
 2912            if all(lt == "full_attention" for lt in self.hparams["layer_types"]):
 2913                # all layers are full attention (for MobileLLM), disable swa
 2914                self.gguf_writer.add_sliding_window(0)
 2915
 2916    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None):
 2917        if name.startswith("language_model."):
 2918            name = name.replace("language_model.", "")
 2919
 2920        # split the gate_up into gate and up
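             # note: the fused HF tensor packs [gate | up] along its last dimension; after the
             # transpose below, the first dim_half rows are gate_proj and the rest are up_proj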
 2921        if "gate_up_proj" in name:
 2922            name_up = name.replace("gate_up_proj", "up_proj.weight")
 2923            name_gate = name.replace("gate_up_proj", "gate_proj.weight")
 2924            dim_half = data_torch.shape[-1] // 2
 2925            gate_proj_weight, up_proj_weight = data_torch.transpose(-1, -2).split(dim_half, dim=-2)
 2926            yield from super().modify_tensors(gate_proj_weight, name_gate, bid)
 2927            yield from super().modify_tensors(up_proj_weight, name_up, bid)
 2928            return
 2929
 2930        if name.endswith("down_proj"):
 2931            name += ".weight"
 2932            data_torch = data_torch.transpose(-1, -2)
 2933
 2934        if "multi_modal_projector" in name or "vision_model" in name:
 2935            return
 2936        yield from super().modify_tensors(data_torch, name, bid)
 2937
 2938
 2939@ModelBase.register("Llama4ForConditionalGeneration")
 2940class Llama4VisionModel(MmprojModel):
 2941    def set_gguf_parameters(self):
 2942        super().set_gguf_parameters()
 2943        self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.LLAMA4)
 2944        self.gguf_writer.add_vision_attention_layernorm_eps(self.hparams["norm_eps"])
 2945        self.gguf_writer.add_vision_projector_scale_factor(int(1.0 / self.hparams["pixel_shuffle_ratio"]))
 2946        assert self.hparams["hidden_act"] == "gelu"
 2947        self.gguf_writer.add_vision_use_gelu(True)
 2948
 2949    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
 2950        if "multi_modal_projector" in name or "vision_model" in name:
 2951            # process vision tensors
 2952            if "positional_embedding_vlm" in name and ".weight" not in name:
 2953                name += ".weight"
 2954            if "multi_modal_projector.linear_1" in name:
 2955                # despite the numeric suffix in the name, this is a single fully connected layer
 2956                yield (gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_MMPROJ_FC] + '.weight', data_torch)
 2957            else:
 2958                yield from super().modify_tensors(data_torch, name, bid)
 2959
 2960
 2961@ModelBase.register(
 2962    "Mistral3ForConditionalGeneration",
 2963    "Ministral3ForCausalLM",
 2964)
 2965class Mistral3Model(LlamaModel):
 2966    model_arch = gguf.MODEL_ARCH.MISTRAL3
 2967
 2968    def __init__(self, *args, **kwargs):
 2969        super().__init__(*args, **kwargs)
 2970        # for compatibility, we use LLAMA arch for older models
 2971        # TODO: remove this once everyone has migrated to a newer version of llama.cpp
 2972        if self.hparams.get("model_type") != "ministral3":
 2973            self.model_arch = gguf.MODEL_ARCH.LLAMA
 2974            self.gguf_writer.arch = gguf.MODEL_ARCH_NAMES[self.model_arch]
 2975            self.gguf_writer.add_architecture()
 2976            self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)
 2977
 2978    def set_gguf_parameters(self):
 2979        super().set_gguf_parameters()
 2980        rope_params = self.rope_parameters
 2981        if self.hparams.get("model_type") == "ministral3":
 2982            assert rope_params, "ministral3 must have 'rope_parameters' config"
 2983            assert rope_params["rope_type"] == "yarn", "ministral3 rope_type must be 'yarn'"
 2984            self.gguf_writer.add_rope_scaling_yarn_log_mul(rope_params["mscale_all_dim"])
 2985            self.gguf_writer.add_attn_temperature_scale(rope_params["llama_4_scaling_beta"])
 2986
 2987    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None):
 2988        name = name.replace("language_model.", "")
 2989        if "multi_modal_projector" in name or "vision_tower" in name:
 2990            return
 2991
 2992        yield from super().modify_tensors(data_torch, name, bid)
 2993
 2994
 2995@ModelBase.register("DeciLMForCausalLM")
 2996class DeciModel(TextModel):
 2997    model_arch = gguf.MODEL_ARCH.DECI
 2998
 2999    @staticmethod
 3000    def _ffn_mult_to_intermediate_size(ffn_mult: float, n_embd: int) -> int:
 3001        # DeciLM-specific code
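             # e.g. (illustrative only): ffn_mult=2.0, n_embd=4096 -> int(2*2.0*4096/3)=5461,
             # rounded up to the next multiple of 256 -> 5632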
 3002        intermediate_size = int(2 * ffn_mult * n_embd / 3)
 3003        return DeciModel._find_multiple(intermediate_size, 256)
 3004
 3005    @staticmethod
 3006    def _find_multiple(n: int, k: int) -> int:
 3007        # DeciLM-specific code
 3008        if n % k == 0:
 3009            return n
 3010        return n + k - (n % k)
 3011
 3012    def __init__(self, *args, **kwargs):
 3013        super().__init__(*args, **kwargs)
 3014
 3015        if "block_configs" in self.hparams: # Llama-3_1-Nemotron-51B
 3016            _block_configs: list[dict[str,Any]] = self.hparams["block_configs"]
 3017            assert self.block_count == len(_block_configs)
 3018            self._num_kv_heads = list()
 3019            self._num_heads = list()
 3020            _ffn_multipliers = list()
 3021            # ***linear attention layer***
 3022            # if n_heads_in_group is None and replace_with_linear is True
 3023            # then _num_kv_heads[il] is 0 and _num_heads[il] is num_attention_heads
 3024            # ***attention-free layer***
 3025            # if n_heads_in_group is None and replace_with_linear is False
 3026            # then _num_kv_heads[il] is 0 and _num_heads[il] is 0
 3027            # ***normal attention-layer***
 3028            # if n_heads_in_group is not None, then
 3029            # _num_kv_heads[il] is num_attention_head // n_heads_in_group and
 3030            # _num_heads[il] is num_attention_head
 3031            # ***dummy layer*** for nemotron 253B
 3032            # if n_heads_in_group is None and ffn_mult is None
 3033            # then _num_kv_heads[il] is 0 and _num_heads[il] is 0 and _ffn_dims is 0
 3034            for il in range(len(_block_configs)):
 3035                if _block_configs[il]["attention"]["n_heads_in_group"] is None:
 3036                    if _block_configs[il]["attention"]["replace_with_linear"] is True:
 3037                        self._num_kv_heads.append(0)
 3038                        self._num_heads.append(self.hparams["num_attention_heads"])
 3039                    else:
 3040                        self._num_kv_heads.append(0)
 3041                        self._num_heads.append(0)
 3042                else:
 3043                    self._num_kv_heads.append(self.hparams["num_attention_heads"] // _block_configs[il]["attention"]["n_heads_in_group"])
 3044                    self._num_heads.append(self.hparams["num_attention_heads"])
 3045                if _block_configs[il]["ffn"]["ffn_mult"] is None: # dummy layer
 3046                    _ffn_multipliers.append(0.0)
 3047                else:
 3048                    _ffn_multipliers.append(_block_configs[il]["ffn"]["ffn_mult"])
 3049            assert self.block_count == len(self._num_kv_heads)
 3050            assert self.block_count == len(self._num_heads)
 3051            assert self.block_count == len(_ffn_multipliers)
 3052            assert isinstance(self._num_kv_heads, list) and isinstance(self._num_kv_heads[0], int)
 3053            assert isinstance(self._num_heads, list) and isinstance(self._num_heads[0], int)
 3054            assert isinstance(_ffn_multipliers, list) and isinstance(_ffn_multipliers[0], float)
 3055            self._ffn_dims: list[int] = [
 3056                DeciModel._ffn_mult_to_intermediate_size(multiplier, self.hparams["hidden_size"])
 3057                for multiplier in _ffn_multipliers
 3058            ]
 3059
 3060    def set_vocab(self):
 3061        # Please change the eos_token in Llama-3_1-Nemotron-51B's tokenizer_config.json
 3062        # from '|eot_id|' to '|end_of_text|'
 3063        if self.hparams.get("vocab_size", 128256) == 128256:
 3064            tokens, toktypes, tokpre = self.get_vocab_base()
 3065            self.gguf_writer.add_tokenizer_model("gpt2")
 3066            self.gguf_writer.add_tokenizer_pre(tokpre)
 3067            self.gguf_writer.add_token_list(tokens)
 3068            self.gguf_writer.add_token_types(toktypes)
 3069
 3070            special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True)
 3071            special_vocab.add_to_gguf(self.gguf_writer)
 3072        else:
 3073            # DeciLM-7B
 3074            self._set_vocab_llama_hf()
 3075
 3076    def set_gguf_parameters(self):
 3077        if "block_configs" in self.hparams: # Llama-3_1-Nemotron-51B
 3078            assert self.block_count == len(self._num_kv_heads)
 3079            assert self.block_count == len(self._num_heads)
 3080            assert self.block_count == len(self._ffn_dims)
 3081            if (rope_theta := self.rope_parameters.get("rope_theta")) is not None:
 3082                self.gguf_writer.add_rope_freq_base(rope_theta)
 3083            self.gguf_writer.add_head_count_kv(self._num_kv_heads)
 3084            self.gguf_writer.add_head_count(self._num_heads)
 3085            self.gguf_writer.add_feed_forward_length(self._ffn_dims)
 3086            self.gguf_writer.add_block_count(self.block_count)
 3087            self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"])
 3088            self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
 3089            self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])
 3090            self.gguf_writer.add_key_length(self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
 3091            self.gguf_writer.add_value_length(self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
 3092            self.gguf_writer.add_file_type(self.ftype)
 3093        else: # DeciLM-7B
 3094            super().set_gguf_parameters()
 3095            if "num_key_value_heads_per_layer" in self.hparams: # DeciLM-7B
 3096                self._num_kv_heads: list[int] = self.hparams["num_key_value_heads_per_layer"]
 3097                assert self.block_count == len(self._num_kv_heads)
 3098                self.gguf_writer.add_head_count_kv(self._num_kv_heads)
 3099        hparams = self.hparams
 3100        self.gguf_writer.add_vocab_size(hparams["vocab_size"])
 3101
 3102        if (rope_dim := hparams.get("head_dim")) is None:
 3103            rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]
 3104        self.gguf_writer.add_rope_dimension_count(rope_dim)
 3105
 3106    @staticmethod
 3107    def permute(weights: Tensor, n_head: int, n_head_kv: int | None):
 3108        if n_head_kv is not None and n_head != n_head_kv:
 3109            n_head = n_head_kv
 3110        return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
 3111                .swapaxes(1, 2)
 3112                .reshape(weights.shape))
 3113
 3114    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
 3115        n_head = self.hparams["num_attention_heads"]
 3116        if bid is not None:
 3117            if "num_key_value_heads_per_layer" in self.hparams:
 3118                n_kv_head = self.hparams["num_key_value_heads_per_layer"][bid]
 3119            elif "block_configs" in self.hparams:
 3120                n_kv_head = self._num_kv_heads[bid]
 3121                n_head = self._num_heads[bid]
 3122            else:
 3123                n_kv_head = self.hparams.get("num_key_value_heads")
 3124        else:
 3125            n_kv_head = self.hparams.get("num_key_value_heads")
 3126
 3127        if name.endswith(("q_proj.weight", "q_proj.bias")):
 3128            data_torch = DeciModel.permute(data_torch, n_head, n_head)
 3129        if name.endswith(("k_proj.weight", "k_proj.bias")):
 3130            data_torch = DeciModel.permute(data_torch, n_head, n_kv_head)
 3131        yield from super().modify_tensors(data_torch, name, bid)
 3132
 3133    def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
 3134        if rope_params := self.rope_parameters.get("full_attention", self.rope_parameters):
 3135            if rope_params.get("rope_type", '').lower() == "llama3":
 3136                base = rope_params.get("rope_theta", 10000.0)
 3137                if (dim := self.hparams.get("head_dim")) is None:
 3138                    dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
 3139                freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))
 3140
 3141                factor = rope_params.get("factor", 8.0)
 3142                low_freq_factor = rope_params.get("low_freq_factor", 1.0)
 3143                high_freq_factor = rope_params.get("high_freq_factor", 4.0)
 3144                old_context_len = self.hparams.get("original_max_position_embeddings", 8192)
 3145
 3146                low_freq_wavelen = old_context_len / low_freq_factor
 3147                high_freq_wavelen = old_context_len / high_freq_factor
 3148                assert low_freq_wavelen != high_freq_wavelen
 3149
 3150                rope_factors = []
 3151                for freq in freqs:
 3152                    wavelen = 2 * math.pi / freq
 3153                    if wavelen < high_freq_wavelen:
 3154                        rope_factors.append(1)
 3155                    elif wavelen > low_freq_wavelen:
 3156                        rope_factors.append(factor)
 3157                    else:
 3158                        smooth = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor)
 3159                        rope_factors.append(1 / ((1 - smooth) / factor + smooth))
 3160
 3161                yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), torch.tensor(rope_factors, dtype=torch.float32))
 3162
 3163    def prepare_tensors(self):
 3164        super().prepare_tensors()
 3165
 3166
 3167@ModelBase.register("BitnetForCausalLM")
 3168class BitnetModel(TextModel):
 3169    model_arch = gguf.MODEL_ARCH.BITNET
 3170
 3171    def set_vocab(self):
 3172        self._set_vocab_sentencepiece()
 3173
 3174    def set_gguf_parameters(self):
 3175        super().set_gguf_parameters()
 3176        self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
 3177        self.gguf_writer.add_rope_scaling_factor(1.0)
 3178
 3179    def weight_quant(self, weight: Tensor) -> Tensor:
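             # per-tensor absmean quantization: scale = mean(|W|); each element is rounded to
             # {-1, 0, +1} and rescaled by `scale`, still stored in the original float dtype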
 3180        dtype = weight.dtype
 3181        weight = weight.float()
 3182        scale = weight.abs().mean().clamp(min=1e-5)
 3183        iscale = 1 / scale
 3184        # TODO: multiply by the scale directly instead of inverting it twice
 3185        # (this is also unnecessarily doubly inverted upstream)
 3186        # ref: https://huggingface.co/1bitLLM/bitnet_b1_58-3B/blob/af89e318d78a70802061246bf037199d2fb97020/utils_quant.py#L10
 3187        result = (weight * iscale).round().clamp(-1, 1) / iscale
 3188        return result.type(dtype)
 3189
 3190    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
 3191        new_name = self.map_tensor_name(name)
 3192
 3193        if any(self.match_model_tensor_name(new_name, key, bid) for key in [
 3194            gguf.MODEL_TENSOR.ATTN_Q,
 3195            gguf.MODEL_TENSOR.ATTN_K,
 3196            gguf.MODEL_TENSOR.ATTN_V,
 3197            gguf.MODEL_TENSOR.ATTN_OUT,
 3198            gguf.MODEL_TENSOR.FFN_UP,
 3199            gguf.MODEL_TENSOR.FFN_DOWN,
 3200            gguf.MODEL_TENSOR.FFN_GATE,
 3201        ]):
 3202            # transform weight into 1/0/-1 (in fp32)
 3203            data_torch = self.weight_quant(data_torch)
 3204
 3205        yield from super().modify_tensors(data_torch, name, bid)
 3206
 3207
 3208@ModelBase.register("GrokForCausalLM", "Grok1ForCausalLM")
 3209class GrokModel(TextModel):
 3210    model_arch = gguf.MODEL_ARCH.GROK
 3211
 3212    def set_vocab(self):
 3213        if (self.dir_model / 'tokenizer.model').is_file():
 3214            self._set_vocab_sentencepiece()
 3215            return
 3216
 3217        if not (self.dir_model / 'tokenizer.json').is_file() or not (self.dir_model / 'chat_template.jinja').is_file():
 3218            logger.error('Error: Missing vocab and chat template, download files from https://huggingface.co/alvarobartt/grok-2-tokenizer')
 3219            sys.exit(1)
 3220
 3221        self._set_vocab_gpt2()
 3222
 3223    def __init__(self, *args, **kwargs):
 3224        super().__init__(*args, **kwargs)
 3225
 3226    def set_gguf_parameters(self):
 3227        super().set_gguf_parameters()
 3228
 3229        self.gguf_writer.add_attn_logit_softcapping(self.hparams.get("attn_logit_softcapping", 30.0))
 3230        self.gguf_writer.add_router_logit_softcapping(self.hparams.get("router_logit_softcapping", 30.0))
 3231        if (final_logit_softcap := self.hparams.get("final_logit_softcapping")):
 3232            self.gguf_writer.add_final_logit_softcapping(final_logit_softcap)
 3233
 3234        if (rope_dim := self.hparams.get("head_dim")) is None:
 3235            rope_dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
 3236
 3237        if (moe_intermediate_size := self.hparams.get("moe_intermediate_size")) is not None:
 3238            self.gguf_writer.add_expert_feed_forward_length(moe_intermediate_size)
 3239
 3240        # Treat "original" as "yarn"; it seems to have been a mistake
 3241        if self.hparams.get("rope_type") in ("yarn", "original"):
 3242            self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
 3243            self.gguf_writer.add_rope_scaling_factor(self.hparams["scaling_factor"])
 3244            self.gguf_writer.add_rope_scaling_orig_ctx_len(self.hparams["original_max_position_embeddings"])
 3245            self.gguf_writer.add_rope_scaling_yarn_ext_factor(self.hparams["extrapolation_factor"])
 3246            self.gguf_writer.add_rope_scaling_yarn_attn_factor(self.hparams["attn_factor"])
 3247            self.gguf_writer.add_rope_scaling_yarn_beta_fast(self.hparams["beta_fast"])
 3248            self.gguf_writer.add_rope_scaling_yarn_beta_slow(self.hparams["beta_slow"])
 3249
 3250        if temp_len := self.hparams.get("attn_temperature_len"):
 3251            self.gguf_writer.add_attn_temperature_length(temp_len)
 3252
 3253        self.gguf_writer.add_attn_output_scale(self.hparams.get("attn_output_multiplier", rope_dim**-0.5))
 3254        self.gguf_writer.add_embedding_scale(self.hparams["embedding_multiplier_scale"])
 3255        self.gguf_writer.add_logit_scale(self.hparams["output_multiplier_scale"])
 3256
 3257    _experts: list[dict[str, list[Tensor]]] | None = None
 3258    _cur_expert = ""
 3259
 3260    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
 3261        deferred: list[tuple[Tensor, str, int | None]] = []
 3262        is_expert = ".moe." in name or ".block_sparse_moe.experts." in name
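             # expert tensors may arrive split across shards and out of order, so they are
             # buffered in self._experts; a non-expert tensor is deferred until any pending
             # expert group has been flushed, so the merged expert tensors are emitted first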
 3263
 3264        if not is_expert:
 3265            deferred.append((data_torch, name, bid))
 3266
 3267        # process the experts separately
 3268        if is_expert or self._cur_expert:
 3269            n_experts = self.hparams["num_local_experts"]
 3270
 3271            assert bid is not None
 3272
 3273            if self._experts is None:
 3274                self._experts = [{} for _ in range(self.block_count)]
 3275
 3276            # concatenate split tensors
 3277            if name in self._experts[bid]:
 3278                self._cur_expert = name
 3279                self._experts[bid][name].append(data_torch)
 3280                return
 3281            elif is_expert:
 3282                self._cur_expert = name
 3283                self._experts[bid][name] = [data_torch]
 3284                return
 3285            else:
 3286                self._cur_expert = ""
 3287
 3288            for bid in range(self.block_count):
 3289                if len(self._experts[bid]) >= n_experts * 3:
 3290                    # merge the experts into a single 3d tensor
 3291                    for wid in [("linear", "w1", 0), ("linear_1", "w2", 1), ("linear_v", "w3", 0)]:
 3292                        datas: list[Tensor] = []
 3293
 3294                        for xid in range(n_experts):
 3295                            ename = f"transformer.decoder_layer.{bid}.moe.{xid}.{wid[0]}.weight"
 3296                            if ename not in self._experts[bid]:
 3297                                ename = f"model.layers.{bid}.block_sparse_moe.experts.{xid}.{wid[1]}.weight"
 3298                            tensor_list = self._experts[bid][ename]
 3299                            datas.append(torch.cat(tensor_list, dim=wid[2]) if len(tensor_list) > 1 else tensor_list[0])
 3300                            del self._experts[bid][ename]
 3301
 3302                        data_torch = torch.stack(datas, dim=0)
 3303
 3304                        merged_name = f"transformer.decoder_layer.{bid}.moe.{wid[0]}.weight"
 3305
 3306                        yield from super().modify_tensors(data_torch, merged_name, bid)
 3307
 3308        for t in deferred:
 3309            yield from super().modify_tensors(*t)
 3310
 3311
 3312@ModelBase.register("DbrxForCausalLM")
 3313class DbrxModel(TextModel):
 3314    model_arch = gguf.MODEL_ARCH.DBRX
 3315
 3316    def set_gguf_parameters(self):
 3317        ffn_config = self.hparams["ffn_config"]
 3318        attn_config = self.hparams["attn_config"]
 3319        self.gguf_writer.add_block_count(self.block_count)
 3320
 3321        self.gguf_writer.add_context_length(self.hparams["max_seq_len"])
 3322        self.gguf_writer.add_embedding_length(self.hparams["d_model"])
 3323        self.gguf_writer.add_feed_forward_length(ffn_config["ffn_hidden_size"])
 3324
 3325        self.gguf_writer.add_head_count(self.hparams["n_heads"])
 3326        self.gguf_writer.add_head_count_kv(attn_config["kv_n_heads"])
 3327
 3328        self.gguf_writer.add_rope_freq_base(attn_config["rope_theta"])
 3329
 3330        self.gguf_writer.add_clamp_kqv(attn_config["clip_qkv"])
 3331
 3332        self.gguf_writer.add_expert_count(ffn_config["moe_num_experts"])
 3333        self.gguf_writer.add_expert_used_count(ffn_config["moe_top_k"])
 3334
 3335        self.gguf_writer.add_layer_norm_eps(1e-5)
 3336
 3337        self.gguf_writer.add_file_type(self.ftype)
 3338        logger.info(f"gguf: file type = {self.ftype}")
 3339
 3340    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
 3341        n_expert = self.hparams["ffn_config"]["moe_num_experts"]
 3342        n_ff = self.hparams["ffn_config"]["ffn_hidden_size"]
 3343        n_embd = self.hparams["d_model"]
 3344
 3345        # Specific behavior for experts tensors: suffix .weight, view as 3D and transpose
 3346        # original implementation expects (n_expert, n_ff, n_embd) for all experts weights
 3347        # But llama.cpp moe graph works differently
 3348        # AND the dimensions in ggml are typically in the reverse order of the pytorch dimensions
 3349        # so (n_expert, n_ff, n_embd) in pytorch is {n_embd, n_ff, n_expert} in ggml_tensor
 3350        exp_tensor_names = {"ffn.experts.mlp.w1": None,       # LLM_TENSOR_FFN_GATE_EXPS ggml_tensor->ne{n_embd, n_ff,   n_expert}
 3351                            "ffn.experts.mlp.w2": (0, 2, 1),  # LLM_TENSOR_FFN_DOWN_EXPS ggml_tensor->ne{n_ff,   n_embd, n_expert}
 3352                            "ffn.experts.mlp.v1": None}       # LLM_TENSOR_FFN_UP_EXPS   ggml_tensor->ne{n_embd, n_ff,   n_expert}
 3353        experts = False
 3354
 3355        for exp_tensor_name in exp_tensor_names.keys():
 3356            if name.find(exp_tensor_name) != -1 and name.find(".weight") == -1:
 3357                experts = True
 3358                data_torch = data_torch.view(n_expert, n_ff, n_embd)
 3359                if (permute_tensor := exp_tensor_names[exp_tensor_name]) is not None:
 3360                    data_torch = data_torch.permute(*permute_tensor)
 3361                break
 3362
 3363        # map tensor names
 3364        # In MoE models the ffn tensors typically hold most of the model weights,
 3365        # so they need to be quantizable. Quantization expects tensor names suffixed with .weight,
 3366        # and every other model follows that convention,
 3367        # but dbrx does not:
 3368        # https://huggingface.co/databricks/dbrx-instruct/blob/main/model.safetensors.index.json#L15
 3369        new_name = self.map_tensor_name(name if not experts else name + ".weight", try_suffixes=(".weight",))
 3370
 3371        yield from super().modify_tensors(data_torch, new_name, bid)
 3372
 3373    def tensor_force_quant(self, name: str, new_name: str, bid: int | None, n_dims: int) -> gguf.GGMLQuantizationType | bool:
 3374        del name, new_name, bid  # unused
 3375
 3376        return n_dims > 1
 3377
 3378
 3379@ModelBase.register("MiniCPMForCausalLM")
 3380class MiniCPMModel(TextModel):
 3381    model_arch = gguf.MODEL_ARCH.MINICPM
 3382
 3383    def set_gguf_parameters(self):
 3384        super().set_gguf_parameters()
 3385        embedding_scale = float(self.hparams["scale_emb"])
 3386        self.gguf_writer.add_embedding_scale(embedding_scale)
 3387        logger.info(f"gguf: (minicpm) embedding_scale = {embedding_scale}")
 3388        residual_scale = self.hparams["scale_depth"] / self.hparams["num_hidden_layers"] ** 0.5
 3389        self.gguf_writer.add_residual_scale(residual_scale)
 3390        logger.info(f"gguf: (minicpm) residual_scale = {residual_scale}")
 3391        logit_scale = self.hparams["hidden_size"] / self.hparams["dim_model_base"]
 3392        self.gguf_writer.add_logit_scale(logit_scale)
 3393        logger.info(f"gguf: (minicpm) logit_scale = {logit_scale}")
 3394
 3395    def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
 3396        rope_dims = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
 3397
 3398        rope_scaling = self.find_hparam(['rope_scaling'], True)
 3399        if rope_scaling is not None:
 3400            long_factors = rope_scaling.get('long_factor', None)
 3401            short_factors = rope_scaling.get('short_factor', None)
 3402
 3403            if long_factors is None or short_factors is None:
 3404                raise KeyError('Missing the required key rope_scaling.long_factor or rope_scaling.short_factor')
 3405
 3406            if len(long_factors) != len(short_factors) or len(long_factors) != rope_dims / 2:
 3407                raise ValueError(f'The length of rope long and short factors must be {rope_dims / 2}')
 3408
 3409            yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_LONG), torch.tensor(long_factors, dtype=torch.float32))
 3410            yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_SHORT), torch.tensor(short_factors, dtype=torch.float32))
 3411
 3412    def set_vocab(self):
 3413        self._set_vocab_sentencepiece()
 3414
 3415    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
 3416        n_head = self.hparams["num_attention_heads"]
 3417        n_kv_head = self.hparams.get("num_key_value_heads")
 3418
 3419        # HF models permute some of the tensors, so we need to undo that
 3420        if name.endswith(("q_proj.weight")):
 3421            data_torch = LlamaModel.permute(data_torch, n_head, n_head)
 3422        if name.endswith(("k_proj.weight")):
 3423            data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head)
 3424
 3425        yield from super().modify_tensors(data_torch, name, bid)
 3426
 3427
 3428@ModelBase.register("MiniCPM3ForCausalLM")
 3429class MiniCPM3Model(TextModel):
 3430    model_arch = gguf.MODEL_ARCH.MINICPM3
 3431
 3432    def set_gguf_parameters(self):
 3433        hparams = self.hparams
 3434
 3435        self.gguf_writer.add_file_type(self.ftype)
 3436        self.gguf_writer.add_context_length(hparams["max_position_embeddings"])
 3437        self.gguf_writer.add_embedding_length(hparams["hidden_size"])
 3438        self.gguf_writer.add_block_count(self.block_count)
 3439        self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
 3440        self.gguf_writer.add_head_count(hparams["num_attention_heads"])
 3441        self.gguf_writer.add_head_count_kv(hparams["num_key_value_heads"])
 3442        self.gguf_writer.add_layer_norm_rms_eps(hparams["rms_norm_eps"])
 3443        self.gguf_writer.add_vocab_size(hparams["vocab_size"])
 3444        if "q_lora_rank" in hparams and hparams["q_lora_rank"] is not None:
 3445            self.gguf_writer.add_q_lora_rank(hparams["q_lora_rank"])
 3446        self.gguf_writer.add_kv_lora_rank(hparams["kv_lora_rank"])
 3447        self.gguf_writer.add_key_length(hparams["qk_nope_head_dim"] + hparams["qk_rope_head_dim"])
 3448        self.gguf_writer.add_rope_dimension_count(hparams["qk_rope_head_dim"])
 3449
 3450    def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
 3451        rope_scaling = self.find_hparam(['rope_scaling'], True)
 3452        if rope_scaling is not None:
 3453            rope_dims = self.hparams["qk_rope_head_dim"]
 3454
 3455            long_factors = rope_scaling.get('long_factor', None)
 3456            short_factors = rope_scaling.get('short_factor', None)
 3457
 3458            if long_factors is None or short_factors is None:
 3459                raise KeyError('Missing the required key rope_scaling.long_factor or rope_scaling.short_factor')
 3460
 3461            if len(long_factors) != len(short_factors) or len(long_factors) != rope_dims / 2:
 3462                raise ValueError(f'The length of rope long and short factors must be {rope_dims / 2}')
 3463
 3464            yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_LONG), torch.tensor(long_factors, dtype=torch.float32))
 3465            yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_SHORT), torch.tensor(short_factors, dtype=torch.float32))
 3466
 3467    def set_vocab(self):
 3468        self._set_vocab_sentencepiece()
 3469
 3470    def _reverse_hf_permute(self, weights: Tensor, n_head: int, n_kv_head: int | None = None) -> Tensor:
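             # regroup each head's rows from (2, d/2) to (d/2, 2) to undo the HF rotary permutation (cf. LlamaModel.permute)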
 3471        if n_kv_head is not None and n_head != n_kv_head:
 3472            n_head //= n_kv_head
 3473
 3474        return (
 3475            weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
 3476            .swapaxes(1, 2)
 3477            .reshape(weights.shape)
 3478        )
 3479
 3480
 3481@ModelBase.register("QWenLMHeadModel")
 3482class QwenModel(TextModel):
 3483    model_arch = gguf.MODEL_ARCH.QWEN
 3484
 3485    @staticmethod
 3486    def token_bytes_to_string(b):
 3487        from transformers.models.gpt2.tokenization_gpt2 import bytes_to_unicode
 3488        byte_encoder = bytes_to_unicode()
 3489        return ''.join([byte_encoder[ord(char)] for char in b.decode('latin-1')])
 3490
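         # minimal greedy BPE: repeatedly merge the adjacent pair with the lowest rank
         # until no mergeable pair remains (or its rank reaches max_rank)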
 3491    @staticmethod
 3492    def bpe(mergeable_ranks: dict[bytes, int], token: bytes, max_rank: int | None = None) -> list[bytes]:
 3493        parts = [bytes([b]) for b in token]
 3494        while True:
 3495            min_idx = None
 3496            min_rank = None
 3497            for i, pair in enumerate(zip(parts[:-1], parts[1:])):
 3498                rank = mergeable_ranks.get(pair[0] + pair[1])
 3499                if rank is not None and (min_rank is None or rank < min_rank):
 3500                    min_idx = i
 3501                    min_rank = rank
 3502            if min_rank is None or (max_rank is not None and min_rank >= max_rank):
 3503                break
 3504            assert min_idx is not None
 3505            parts = parts[:min_idx] + [parts[min_idx] + parts[min_idx + 1]] + parts[min_idx + 2:]
 3506        return parts
 3507
 3508    def set_vocab(self):
 3509        self._set_vocab_qwen()
 3510
 3511
 3512@ModelBase.register("Qwen2Model", "Qwen2ForCausalLM", "Qwen2AudioForConditionalGeneration", "KORMoForCausalLM", "AudioFlamingo3ForConditionalGeneration")
 3513class Qwen2Model(TextModel):
 3514    model_arch = gguf.MODEL_ARCH.QWEN2
 3515
 3516    def set_vocab(self):
 3517        try:
 3518            self._set_vocab_sentencepiece()
 3519        except FileNotFoundError:
 3520            self._set_vocab_gpt2()
 3521
 3522    def set_gguf_parameters(self):
 3523        super().set_gguf_parameters()
 3524        self._try_set_pooling_type()
 3525
 3526    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
 3527        if self.hf_arch == "Qwen2Model":
 3528            name = f"model.{name}"  # map to Qwen2ForCausalLM tensors
 3529        if "language_model." in name:
 3530            name = name.replace("language_model.", "") # for InternVL
 3531        if name.startswith("mlp") or name.startswith("multi_modal_projector") \
 3532                or name.startswith("vision_model") or name.startswith("audio_tower") \
 3533                or name.startswith("model.vision_tower") or name.startswith("model.multi_modal_projector"):
 3534            # skip vision and audio tensors
 3535            return
 3536        yield from super().modify_tensors(data_torch, name, bid)
 3537
 3538
 3539@ModelBase.register("DreamModel")
 3540class DreamModel(TextModel):
 3541    model_arch = gguf.MODEL_ARCH.DREAM
 3542
 3543    def get_vocab_base(self) -> tuple[list[str], list[int], str]:
 3544        tokens: list[str] = []
 3545        toktypes: list[int] = []
 3546
 3547        from transformers import AutoTokenizer
 3548        tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True)
 3549
 3550        vocab_dict = tokenizer.get_vocab()
 3551        vocab_size = self.hparams.get("vocab_size", len(vocab_dict))
 3552        assert max(vocab_dict.values()) < vocab_size
 3553
 3554        tokpre = self.get_vocab_base_pre(tokenizer)
 3555
 3556        reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in vocab_dict.items()}
 3557        added_vocab = tokenizer.get_added_vocab()
 3558
 3559        for i in range(vocab_size):
 3560            if i not in reverse_vocab:
 3561                tokens.append(f"[PAD{i}]")
 3562                toktypes.append(gguf.TokenType.UNUSED)
 3563            elif reverse_vocab[i] in added_vocab:
 3564                tokens.append(reverse_vocab[i])
 3565                # Check if it's a special token - treat special tokens as CONTROL tokens
 3566                if hasattr(tokenizer, 'added_tokens_decoder') and i in tokenizer.added_tokens_decoder:
 3567                    if tokenizer.added_tokens_decoder[i].special:
 3568                        toktypes.append(gguf.TokenType.CONTROL)
 3569                    else:
 3570                        toktypes.append(gguf.TokenType.USER_DEFINED)
 3571                else:
 3572                    # Fallback: treat all added vocab as control tokens for special tokens like <|im_start|>
 3573                    toktypes.append(gguf.TokenType.CONTROL)
 3574            else:
 3575                tokens.append(reverse_vocab[i])
 3576                toktypes.append(gguf.TokenType.NORMAL)
 3577
 3578        return tokens, toktypes, tokpre
 3579
 3580    def set_vocab(self):
 3581        try:
 3582            self._set_vocab_sentencepiece()
 3583        except FileNotFoundError:
 3584            self._set_vocab_gpt2()
 3585
 3586    def set_gguf_parameters(self):
 3587        super().set_gguf_parameters()
 3588        self._try_set_pooling_type()
 3589
 3590        # Dream models use non-causal attention for diffusion
 3591        self.gguf_writer.add_causal_attention(False)
 3592
 3593        # Add Dream-specific parameters
 3594        mask_token_id = self.hparams.get("mask_token_id")
 3595        if mask_token_id is not None:
 3596            self.gguf_writer.add_mask_token_id(mask_token_id)
 3597
 3598    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
 3599        # Dream model tensors should be mapped directly since it's the base model
 3600        yield from super().modify_tensors(data_torch, name, bid)
 3601
 3602
 3603@ModelBase.register("LLaDAModelLM")
 3604class LLaDAModel(TextModel):
 3605    model_arch = gguf.MODEL_ARCH.LLADA
 3606    undo_permute = True
 3607
 3608    def get_vocab_base(self) -> tuple[list[str], list[int], str]:
 3609        tokens: list[str] = []
 3610        toktypes: list[int] = []
 3611
 3612        from transformers import AutoTokenizer
 3613        tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True)
 3614
 3615        vocab_dict = tokenizer.get_vocab()
 3616        vocab_size = self.hparams.get("vocab_size", len(vocab_dict))
 3617        assert max(vocab_dict.values()) < vocab_size
 3618
 3619        tokpre = self.get_vocab_base_pre(tokenizer)
 3620
 3621        reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in vocab_dict.items()}
 3622        added_vocab = tokenizer.get_added_vocab()
 3623
 3624        for i in range(vocab_size):
 3625            if i not in reverse_vocab:
 3626                tokens.append(f"[PAD{i}]")
 3627                toktypes.append(gguf.TokenType.UNUSED)
 3628            elif reverse_vocab[i] in added_vocab:
 3629                tokens.append(reverse_vocab[i])
 3630                # Check if it's a special token - treat special tokens as CONTROL tokens
 3631                if hasattr(tokenizer, 'added_tokens_decoder') and i in tokenizer.added_tokens_decoder:
 3632                    if tokenizer.added_tokens_decoder[i].special:
 3633                        toktypes.append(gguf.TokenType.CONTROL)
 3634                    else:
 3635                        toktypes.append(gguf.TokenType.USER_DEFINED)
 3636                else:
 3637                    # Fallback: treat all added vocab as control tokens for special tokens like <|im_start|>
 3638                    toktypes.append(gguf.TokenType.CONTROL)
 3639            else:
 3640                tokens.append(reverse_vocab[i])
 3641                toktypes.append(gguf.TokenType.NORMAL)
 3642
 3643        return tokens, toktypes, tokpre
 3644
 3645    def set_vocab(self):
 3646        self._set_vocab_gpt2()
 3647
 3648        # LLaDA specific parameters
 3649        self.gguf_writer.add_add_bos_token(True)
 3650
 3651    def set_gguf_parameters(self):
 3652        super().set_gguf_parameters()
 3653        self._try_set_pooling_type()
 3654
 3655        # Add parameters similar to LlamaModel
 3656        hparams = self.hparams
 3657        self.gguf_writer.add_vocab_size(hparams["vocab_size"])
 3658
 3659        if (rope_dim := hparams.get("head_dim")) is None:
 3660            n_heads = hparams.get("num_attention_heads", hparams.get("n_heads"))
 3661            rope_dim = hparams.get("hidden_size", hparams.get("d_model")) // n_heads
 3662        self.gguf_writer.add_rope_dimension_count(rope_dim)
 3663
 3664        # Set context length for LLaDA
 3665        context_length = self.hparams.get("max_sequence_length", 4096)
 3666        self.gguf_writer.add_context_length(context_length)
 3667
 3668        # Set embedding length (dimension size)
 3669        embedding_length = self.hparams.get("d_model", 4096)
 3670        self.gguf_writer.add_embedding_length(embedding_length)
 3671
 3672        # Set feed forward length (MLP hidden size)
 3673        feed_forward_length = self.hparams.get("mlp_hidden_size", 12288)
 3674        self.gguf_writer.add_feed_forward_length(feed_forward_length)
 3675
 3676        # LLaDA models use non-causal attention for diffusion, similar to Dream
 3677        self.gguf_writer.add_causal_attention(False)
 3678
 3679        # LLaDA models don't shift their logits
 3680        self.gguf_writer.add_diffusion_shift_logits(False)
 3681
 3682    @staticmethod
 3683    def permute(weights: Tensor, n_head: int, n_head_kv: int | None):
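             # view each head's rows as (2, d/2) and swap to (d/2, 2), undoing the HF q/k rotary permutation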
 3684        if n_head_kv is not None and n_head != n_head_kv:
 3685            n_head = n_head_kv
 3686        return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
 3687                .swapaxes(1, 2)
 3688                .reshape(weights.shape))
 3689
 3690    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
 3691        n_head = self.hparams.get("num_attention_heads", self.hparams.get("n_heads"))
 3692        n_kv_head = self.hparams.get("num_key_value_heads", self.hparams.get("n_kv_heads"))
 3693
 3694        if self.undo_permute:
 3695            if name.endswith(("q_proj.weight", "q_proj.bias")):
 3696                data_torch = LLaDAModel.permute(data_torch, n_head, n_head)
 3697            if name.endswith(("k_proj.weight", "k_proj.bias")):
 3698                data_torch = LLaDAModel.permute(data_torch, n_head, n_kv_head)
 3699
 3700        # LLaDA model tensors should be mapped directly since it's the base model
 3701        yield from super().modify_tensors(data_torch, name, bid)
 3702
 3703
 3704@ModelBase.register("Ernie4_5_ForCausalLM", "Ernie4_5ForCausalLM")
 3705class Ernie4_5Model(TextModel):
 3706    model_arch = gguf.MODEL_ARCH.ERNIE4_5
 3707
 3708    def set_vocab(self):
 3709        self._set_vocab_sentencepiece()
 3710
 3711    def set_gguf_parameters(self):
 3712        super().set_gguf_parameters()
 3713
 3714    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
 3715        num_heads = self.hparams["num_attention_heads"]
 3716        num_kv_heads = self.hparams["num_key_value_heads"]
 3717        if (head_dim := self.hparams.get("head_dim")) is None:
 3718            head_dim = self.hparams["hidden_size"] // num_heads
 3719
 3720        if "ernie." in name:
 3721            name = name.replace("ernie.", "model.")
 3722        # split the qkv weights
 3723        # qkv_proj shape: [(num_heads + 2 * num_kv_heads) * head_dim, hidden_size]
 3724        if "qkv_proj" in name:
 3725            name_q = name.replace("qkv_proj.weight", "q_proj.weight")
 3726            name_k = name.replace("qkv_proj.weight", "k_proj.weight")
 3727            name_v = name.replace("qkv_proj.weight", "v_proj.weight")
 3728            total_q_dim = num_heads * head_dim
 3729            total_k_dim = num_kv_heads * head_dim
 3730            total_v_dim = num_kv_heads * head_dim
 3731            q_proj_weight, k_proj_weight, v_proj_weight = data_torch.split([total_q_dim, total_k_dim, total_v_dim], dim=0)
 3732            yield from super().modify_tensors(q_proj_weight, name_q, bid)
 3733            yield from super().modify_tensors(k_proj_weight, name_k, bid)
 3734            yield from super().modify_tensors(v_proj_weight, name_v, bid)
 3735        # split the up_gate_proj into gate and up
 3736        # up_gate_proj shape: [2 * intermediate_size, hidden_size]
 3737        elif "up_gate_proj" in name:
 3738            name_up = name.replace("up_gate_proj.weight", "up_proj.weight")
 3739            name_gate = name.replace("up_gate_proj.weight", "gate_proj.weight")
 3740            dim_half = data_torch.shape[0] // 2
 3741            gate_proj_weight, up_proj_weight = data_torch.split(dim_half, dim=0)
 3742            yield from super().modify_tensors(gate_proj_weight, name_gate, bid)
 3743            yield from super().modify_tensors(up_proj_weight, name_up, bid)
 3744        else:
 3745            yield from super().modify_tensors(data_torch, name, bid)
 3746
 3747
 3748@ModelBase.register("Ernie4_5_MoeForCausalLM")
 3749class Ernie4_5MoeModel(Ernie4_5Model):
 3750    model_arch = gguf.MODEL_ARCH.ERNIE4_5_MOE
 3751    _experts: list[dict[str, Tensor]] | None = None
 3752
 3753    def __init__(self, *args, **kwargs):
 3754        super().__init__(*args, **kwargs)
 3755        self._experts = [{} for _ in range(self.block_count)]
 3756
 3757    def set_gguf_parameters(self):
 3758        super().set_gguf_parameters()
 3759        self.gguf_writer.add_expert_count(self.hparams["moe_num_experts"])
 3760        self.gguf_writer.add_expert_used_count(self.hparams["moe_k"])
 3761        self.gguf_writer.add_interleave_moe_layer_step(self.hparams["moe_layer_interval"])
 3762        self.gguf_writer.add_leading_dense_block_count(self.hparams["moe_layer_start_index"])
 3763        if (moe_intermediate_size := self.hparams.get("moe_intermediate_size")) is not None:
 3764            self.gguf_writer.add_expert_feed_forward_length(moe_intermediate_size)
 3765        if (shared_expert_count := self.hparams.get('moe_num_shared_experts')) is not None:
 3766            self.gguf_writer.add_expert_shared_count(shared_expert_count)
 3767            if shared_expert_count > 0 and (shared_expert_intermediate_size := self.hparams.get('intermediate_size')) is not None and (num_key_value_heads := self.hparams.get('num_key_value_heads')) is not None:
 3768                self.gguf_writer.add_expert_shared_feed_forward_length(shared_expert_intermediate_size // num_key_value_heads)
 3769
 3770    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
 3771        # Modify correction bias name as in DeepseekV2
 3772        if name.endswith("e_score_correction_bias"):
 3773            name = name.replace("e_score_correction_bias", "e_score_correction.bias")
 3774
 3775        # skip Multi-Token Prediction (MTP) layers (again, same as DeepseekV2)
 3776        match = re.match(r"model.mtp_block.(\d+)", name)
 3777        if match:
 3778            return
 3779
 3780        # skip all other MTP tensors for now
 3781        match = re.match(r"model.mtp_emb_norm.(\d+)", name)
 3782        if match:
 3783            return
 3784
 3785        match = re.match(r"model.mtp_hidden_norm.(\d+)", name)
 3786        if match:
 3787            return
 3788
 3789        match = re.match(r"model.mtp_linear_proj.(\d+)", name)
 3790        if match:
 3791            return
 3792
 3793        # process the experts separately
 3794        if name.find("mlp.experts") != -1:
 3795            n_experts = self.hparams["moe_num_experts"]
 3796            assert bid is not None
 3797
 3798            if self._experts is None:
 3799                self._experts = [{} for _ in range(self.block_count)]
 3800
 3801            self._experts[bid][name] = data_torch
 3802
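                 # each expert contributes gate_proj, up_proj and down_proj, so a complete block holds n_experts * 3 tensors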
 3803            if len(self._experts[bid]) >= n_experts * 3:
 3804                # merge the experts into a single 3d tensor
 3805                for w_name in ["gate_proj", "up_proj", "down_proj"]:
 3806                    datas: list[Tensor] = []
 3807
 3808                    for xid in range(n_experts):
 3809                        ename_to_retrieve = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight"
 3810                        datas.append(self._experts[bid][ename_to_retrieve])
 3811                        del self._experts[bid][ename_to_retrieve]
 3812
 3813                    data_torch = torch.stack(datas, dim=0)
 3814                    merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight"
 3815                    yield from super().modify_tensors(data_torch, merged_name, bid)
 3816        else:
 3817            yield from ModelBase.modify_tensors(self, data_torch, name, bid)
 3818
 3819    def prepare_tensors(self):
 3820        super().prepare_tensors()
 3821
 3822        if self._experts is not None:
 3823            # flatten `list[dict[str, Tensor]]` into `list[str]`
 3824            experts = [k for d in self._experts for k in d.keys()]
 3825            if len(experts) > 0:
 3826                raise ValueError(f"Unprocessed experts: {experts}")
 3827
 3828
 3829@ModelBase.register(
 3830    "Qwen2VLModel",
 3831    "Qwen2VLForConditionalGeneration",
 3832    "Qwen2_5_VLForConditionalGeneration",
 3833    "Qwen2_5OmniModel",
 3834)
 3835class Qwen2VLModel(TextModel):
 3836    model_arch = gguf.MODEL_ARCH.QWEN2VL
 3837
 3838    def set_gguf_parameters(self):
 3839        super().set_gguf_parameters()
 3840
 3841    def set_vocab(self):
 3842        try:
 3843            self._set_vocab_sentencepiece()
 3844        except FileNotFoundError:
 3845            self._set_vocab_gpt2()
 3846
 3847    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
 3848        if name.startswith("thinker."):
 3849            name = name.replace("thinker.", "")
 3850        if name.startswith("visual") or name.startswith("audio") or \
 3851                name.startswith("talker") or name.startswith("token2wav"):
 3852            # skip multimodal tensors
 3853            return
 3854        yield from super().modify_tensors(data_torch, name, bid)
 3855
 3856
 3857@ModelBase.register("Qwen2VLModel", "Qwen2VLForConditionalGeneration", "Qwen2_5_VLForConditionalGeneration")
 3858class Qwen2VLVisionModel(MmprojModel):
 3859    def __init__(self, *args, **kwargs):
 3860        super().__init__(*args, **kwargs)
 3861        assert self.hparams_vision is not None
 3862        self.hparams_vision["image_size"] = self.hparams_vision.get("image_size", 560)
 3863        # rename config.json values
 3864        self.hparams_vision["num_attention_heads"] = self.hparams_vision.get("num_heads")
 3865        self.hparams_vision["num_hidden_layers"] = self.hparams_vision.get("depth")
 3866        if "embed_dim" in self.hparams_vision: # qwen2vl
 3867            self.hparams_vision["intermediate_size"] = self.hparams_vision.get("hidden_size")
 3868            self.hparams_vision["hidden_size"] = self.hparams_vision.get("embed_dim")
 3869
 3870    def set_gguf_parameters(self):
 3871        super().set_gguf_parameters()
 3872        assert self.hparams_vision is not None
 3873        hparams = self.hparams_vision
 3874        model_type = self.global_config['model_type']
 3875        if model_type == 'qwen2_vl':
 3876            self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.QWEN2VL)
 3877        elif model_type == 'qwen2_5_vl' or model_type == 'qwen2_5_omni':
 3878            if model_type == 'qwen2_5_omni':
 3879                self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.QWEN25O)
 3880            else:
 3881                self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.QWEN25VL)
 3882            self.gguf_writer.add_vision_use_silu(True)
 3883            # find n_wa_pattern (window attention pattern)
 3884            fullatt_block_indexes = hparams.get("fullatt_block_indexes")
 3885            assert fullatt_block_indexes is not None, "fullatt_block_indexes is required for qwen2_5_vl"
 3886            n_wa_pattern = fullatt_block_indexes[0] + 1
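                 # full-attention blocks must be evenly spaced every n_wa_pattern layers; all other blocks use window attention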
 3887            # validate n_wa_pattern
 3888            for i in range(1, len(fullatt_block_indexes)):
 3889                if fullatt_block_indexes[i] - fullatt_block_indexes[i - 1] != n_wa_pattern:
 3890                    raise ValueError(f"Invalid fullatt_block_indexes: {fullatt_block_indexes}")
 3891            self.gguf_writer.add_vision_n_wa_pattern(n_wa_pattern)
 3892        else:
 3893            raise ValueError(f"Unknown QwenVL model type: {self.global_config['model_type']}")
 3894        # default values below are taken from HF transformers code
 3895        self.gguf_writer.add_vision_attention_layernorm_eps(self.global_config.get("rms_norm_eps", 1e-6))
 3896
 3897    def tensor_force_quant(self, name, new_name, bid, n_dims):
 3898        if ".position_embd." in new_name:
 3899            return gguf.GGMLQuantizationType.F32
 3900        return super().tensor_force_quant(name, new_name, bid, n_dims)
 3901
 3902    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
 3903        if name.startswith("visual."):
 3904            # process visual tensors
 3905            # split QKV tensors if needed
 3906            if ".qkv." in name:
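                     # fused qkv is stacked along dim 0 in three equal chunks: [q | k | v]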
 3907                if data_torch.ndim == 2: # weight
 3908                    c3, _ = data_torch.shape
 3909                else: # bias
 3910                    c3 = data_torch.shape[0]
 3911                assert c3 % 3 == 0
 3912                c = c3 // 3
 3913                wq = data_torch[:c]
 3914                wk = data_torch[c: c * 2]
 3915                wv = data_torch[c * 2:]
 3916                yield from super().modify_tensors(wq, name.replace("qkv", "q"), bid)
 3917                yield from super().modify_tensors(wk, name.replace("qkv", "k"), bid)
 3918                yield from super().modify_tensors(wv, name.replace("qkv", "v"), bid)
 3919            elif 'patch_embed.proj.weight' in name:
 3920                # split Conv3D into Conv2Ds
 3921                c1, c2, kt, kh, kw = data_torch.shape
 3922                del c1, c2, kh, kw  # unused
 3923                assert kt == 2, "Current implementation only supports temporal_patch_size of 2"
 3924                yield (gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_ENC_EMBD_PATCH] + ".weight"  , data_torch[:, :, 0, ...])
 3925                yield (gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_ENC_EMBD_PATCH] + ".weight.1", data_torch[:, :, 1, ...])
 3926            else:
 3927                yield from super().modify_tensors(data_torch, name, bid)
 3928
 3929
 3930@ModelBase.register("Qwen2_5OmniModel")
 3931class Qwen25OmniModel(Qwen2VLVisionModel):
 3932    has_vision_encoder = True
 3933    has_audio_encoder = True
 3934
 3935    def __init__(self, *args, **kwargs):
 3936        super().__init__(*args, **kwargs)
 3937        assert self.hparams_audio is not None
 3938        self.hparams_audio["hidden_size"] = self.hparams_audio["d_model"]
 3939        self.hparams_audio["intermediate_size"] = self.hparams_audio["encoder_ffn_dim"]
 3940        self.hparams_audio["num_attention_heads"] = self.hparams_audio["encoder_attention_heads"]
 3941
 3942    def set_gguf_parameters(self):
 3943        super().set_gguf_parameters()
 3944        assert self.hparams_audio is not None
 3945        self.gguf_writer.add_audio_num_mel_bins(self.hparams_audio["num_mel_bins"])
 3946        self.gguf_writer.add_audio_attention_layernorm_eps(self.hparams_audio.get("layer_norm_eps", 1e-5))
 3947
 3948    def get_vision_config(self) -> dict[str, Any] | None:
 3949        return self.global_config["thinker_config"].get("vision_config")
 3950
 3951    def get_audio_config(self) -> dict[str, Any] | None:
 3952        return self.global_config["thinker_config"].get("audio_config")
 3953
 3954    def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
 3955        # SinusoidsPositionEmbedding
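             # fixed (non-learned) position table: sin over the first channels // 2 dims, cos over the rest,
             # built from log-spaced inverse timescales up to max_timescale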
 3956        assert self.hparams_audio is not None
 3957        max_timescale = 10000
 3958        length = 1500
 3959        channels = self.hparams_audio["hidden_size"]
 3960        log_timescale_increment = np.log(max_timescale) / (channels // 2 - 1)
 3961        inv_timescales = torch.exp(-log_timescale_increment * torch.arange(channels // 2).float())
 3962        scaled_time = torch.arange(length)[:, np.newaxis] * inv_timescales[np.newaxis, :]
 3963        pos_embd = torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], dim=1).to(dtype=torch.float32)
 3964        yield ("audio_tower.embed_positions.weight", pos_embd)
 3965
 3966    def tensor_force_quant(self, name, new_name, bid, n_dims):
 3967        if ".conv" in name and ".weight" in name:
 3968            return gguf.GGMLQuantizationType.F16
 3969        return super().tensor_force_quant(name, new_name, bid, n_dims)
 3970
 3971    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
 3972        if name.startswith("thinker."):
 3973            name = name.replace("thinker.", "")
 3974
 3975        if name.startswith("audio_tower"):
 3976            # process audio tensors
 3977            if "conv1.bias" in name or "conv2.bias" in name:
 3978                # conv1/conv2 bias is 1D in HF; add a trailing dim so it is stored as a column vector
 3979                data_torch = data_torch.unsqueeze(-1)
 3980            if "audio_bos_eos_token" in name:
 3981                # this tensor is left unused in transformers code
 3982                # https://github.com/huggingface/transformers/blob/6e3063422c4b1c014aa60c32b9254fd2902f0f28/src/transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py#L1809
 3983                return
 3984        yield from super().modify_tensors(data_torch, name, bid)
 3985
 3986
 3987@ModelBase.register("InternVisionModel")
 3988class InternVisionModel(MmprojModel):
 3989    def set_gguf_parameters(self):
 3990        assert self.hparams_vision is not None
 3991        if isinstance(self.hparams_vision['image_size'], list):
 3992            self.hparams_vision['image_size'] = self.hparams_vision['image_size'][0]
 3993        if isinstance(self.hparams_vision['patch_size'], list):
 3994            self.hparams_vision['patch_size'] = self.hparams_vision['patch_size'][0]
 3995        super().set_gguf_parameters()
 3996
 3997        hparams = self.hparams
 3998        self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.INTERNVL)
 3999        self.gguf_writer.add_vision_attention_layernorm_eps(hparams["layer_norm_eps"])
 4000        # hidden_act
 4001        if hparams["hidden_act"] == "silu":
 4002            self.gguf_writer.add_vision_use_silu(True)
 4003        elif hparams["hidden_act"] == "gelu":
 4004            self.gguf_writer.add_vision_use_gelu(True)
 4005        else:
 4006            raise ValueError(f"Unsupported hidden_act: {hparams['hidden_act']}")
 4007        # downsample_ratio
 4008        downsample_ratio = self.global_config.get("downsample_ratio")
 4009        assert downsample_ratio is not None
 4010        self.gguf_writer.add_vision_projector_scale_factor(int(1.0 / downsample_ratio))
 4011
 4012    def tensor_force_quant(self, name, new_name, bid, n_dims):
 4013        if ".position_embd." in new_name:
 4014            return gguf.GGMLQuantizationType.F32
 4015        return super().tensor_force_quant(name, new_name, bid, n_dims)
 4016
 4017    def _mapping_interns1_name(self, name):
 4018        names_map = {
 4019            "model.multi_modal_projector.layer_norm.bias": "mlp1.0.bias",
 4020            "model.multi_modal_projector.layer_norm.weight": "mlp1.0.weight",
 4021            "model.multi_modal_projector.linear_1.bias": "mlp1.1.bias",
 4022            "model.multi_modal_projector.linear_1.weight": "mlp1.1.weight",
 4023            "model.multi_modal_projector.linear_2.bias": "mlp1.3.bias",
 4024            "model.multi_modal_projector.linear_2.weight": "mlp1.3.weight",
 4025        }
 4026        if name in names_map:
 4027            name = names_map[name]
 4028        return name
 4029
 4030    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
 4031        vision_prefix = ['vision_model', 'mlp', 'model.vision_tower', 'model.multi_modal_projector']
 4032        # deal with intern-s1 special case
 4033        name = self._mapping_interns1_name(name)
 4034        if any([name.startswith(prefix) for prefix in vision_prefix]):
 4035            # process visual tensors
 4036            # correct name
 4037            if name.startswith("vision_model"):
 4038                name = "vision_tower." + name
 4039            if (".ls" in name or ".lambda_" in name or "position_embedding" in name) and not name.endswith(".weight"):
 4040                name += ".weight"
 4041            # split QKV tensors if needed
 4042            if ".qkv." in name:
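                     # fused qkv: three equal chunks along dim 0, remapped to self_attn.{q,k,v}_proj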
 4043                if data_torch.ndim == 2: # weight
 4044                    c3, _ = data_torch.shape
 4045                else: # bias
 4046                    c3 = data_torch.shape[0]
 4047                assert c3 % 3 == 0
 4048                c = c3 // 3
 4049                wq = data_torch[:c]
 4050                wk = data_torch[c: c * 2]
 4051                wv = data_torch[c * 2:]
 4052                yield from super().modify_tensors(wq, name.replace("attn.qkv", "self_attn.q_proj"), bid)
 4053                yield from super().modify_tensors(wk, name.replace("attn.qkv", "self_attn.k_proj"), bid)
 4054                yield from super().modify_tensors(wv, name.replace("attn.qkv", "self_attn.v_proj"), bid)
 4055            else:
 4056                yield from super().modify_tensors(data_torch, name, bid)
 4057
 4058
 4059@ModelBase.register("WavTokenizerDec")
 4060class WavTokenizerDecModel(TextModel):
 4061    model_arch = gguf.MODEL_ARCH.WAVTOKENIZER_DEC
 4062
 4063    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
 4064        if \
 4065                name.endswith("codebook.cluster_size") or \
 4066                name.endswith("codebook.embed_avg") or \
 4067                name.endswith("codebook.inited"):
 4068            logger.debug(f"Skipping {name!r}")
 4069            return
 4070
 4071        logger.info(f"{self.map_tensor_name(name)} -> {data_torch.shape}")
 4072
 4073        yield from super().modify_tensors(data_torch, name, bid)
 4074
 4075    def set_vocab(self):
 4076        self._set_vocab_none()
 4077
 4078    def set_gguf_parameters(self):
 4079        super().set_gguf_parameters()
 4080        self.gguf_writer.add_vocab_size         (self.hparams["vocab_size"])
 4081        self.gguf_writer.add_features_length    (self.hparams["n_embd_features"])
 4082        self.gguf_writer.add_feed_forward_length(self.hparams["n_ff"])
 4083        self.gguf_writer.add_group_norm_eps     (self.hparams["group_norm_epsilon"])
 4084        self.gguf_writer.add_group_norm_groups  (self.hparams["group_norm_groups"])
 4085
 4086        self.gguf_writer.add_posnet_embedding_length(self.hparams["posnet"]["n_embd"])
 4087        self.gguf_writer.add_posnet_block_count     (self.hparams["posnet"]["n_layer"])
 4088
 4089        self.gguf_writer.add_convnext_embedding_length(self.hparams["convnext"]["n_embd"])
 4090        self.gguf_writer.add_convnext_block_count     (self.hparams["convnext"]["n_layer"])
 4091
 4092        self.gguf_writer.add_causal_attention(False)
 4093
 4094
 4095@ModelBase.register("Qwen2MoeForCausalLM")
 4096class Qwen2MoeModel(TextModel):
 4097    model_arch = gguf.MODEL_ARCH.QWEN2MOE
 4098
 4099    def set_gguf_parameters(self):
 4100        super().set_gguf_parameters()
 4101        if (n_experts := self.hparams.get("num_experts")) is not None:
 4102            self.gguf_writer.add_expert_count(n_experts)
 4103        if (moe_intermediate_size := self.hparams.get("moe_intermediate_size")) is not None:
 4104            self.gguf_writer.add_expert_feed_forward_length(moe_intermediate_size)
 4105            logger.info(f"gguf: expert feed forward length = {moe_intermediate_size}")
 4106        if (shared_expert_intermediate_size := self.hparams.get('shared_expert_intermediate_size')) is not None:
 4107            self.gguf_writer.add_expert_shared_feed_forward_length(shared_expert_intermediate_size)
 4108            logger.info(f"gguf: expert shared feed forward length = {shared_expert_intermediate_size}")
 4109
 4110    _experts: list[dict[str, Tensor]] | None = None
 4111
 4112    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
 4113        # process the experts separately
 4114        name = name.replace("language_model.", "") # InternVL
 4115
 4116        # handle aggregated expert tensors
 4117        # GGUF stores dimensions reversed from PyTorch, so:
 4118        # PyTorch (A,B,C) -> GGUF writes [C,B,A] -> GGML reads ne={C,B,A}
 4119        # Input shapes from HF: (n_expert, n_ff_exp, n_embd) or (n_expert, n_embd, n_ff_exp)
 4120        # Expected GGML ne: {n_embd, n_ff_exp, n_expert} for gate/up, {n_ff_exp, n_embd, n_expert} for down
 4121        if name.endswith("mlp.experts.down_proj") or name.endswith("mlp.experts.down_proj.weight"):
 4122            mapped = f"{name}.weight" if not name.endswith(".weight") else name
 4123            # HF: [n_expert, n_embd, n_ff] -> GGML: {n_ff, n_embd, n_expert}
 4124            yield from super().modify_tensors(data_torch, mapped, bid)
 4125            return
 4126
 4127        if name.endswith("mlp.experts.gate_up_proj") or name.endswith("mlp.experts.gate_up_proj.weight"):
 4128            if data_torch.ndim < 3 or data_torch.shape[-2] % 2 != 0:
 4129                raise ValueError(f"Unexpected gate_up_proj shape for {name}: {tuple(data_torch.shape)}")
 4130            # HF: [n_expert, 2*n_ff, n_embd] -> split on dim=-2
 4131            n_ff = data_torch.shape[-2] // 2
 4132            gate = data_torch[..., :n_ff, :].contiguous()
 4133            up = data_torch[..., n_ff:, :].contiguous()
 4134            # gate/up: [n_expert, n_ff, n_embd] -> GGML: {n_embd, n_ff, n_expert}
 4135            base_name = name.removesuffix(".weight").removesuffix(".gate_up_proj")
 4136            mapped_gate = f"{base_name}.gate_proj.weight"
 4137            mapped_up = f"{base_name}.up_proj.weight"
 4138            yield from super().modify_tensors(gate, mapped_gate, bid)
 4139            yield from super().modify_tensors(up, mapped_up, bid)
 4140            return
 4141
 4142        if name.startswith("mlp") or name.startswith("vision_model") or name.startswith("model.vision_tower") or name.startswith("model.multi_modal_projector") or name.startswith("model.visual"):
 4143            # skip visual tensors
 4144            return
 4145
 4146        if name.find("experts") != -1:
 4147            n_experts = self.hparams["num_experts"]
 4148            assert bid is not None
 4149
 4150            if self._experts is None:
 4151                self._experts = [{} for _ in range(self.block_count)]
 4152
 4153            self._experts[bid][name] = data_torch
 4154
 4155            if len(self._experts[bid]) >= n_experts * 3:
 4156                # merge the experts into a single 3d tensor
 4157                for w_name in ["down_proj", "gate_proj", "up_proj"]:
 4158                    datas: list[Tensor] = []
 4159
 4160                    for xid in range(n_experts):
 4161                        ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight"
 4162                        datas.append(self._experts[bid][ename])
 4163                        del self._experts[bid][ename]
 4164
 4165                    data_torch = torch.stack(datas, dim=0)
 4166
 4167                    merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight"
 4168
 4169                    yield from super().modify_tensors(data_torch, merged_name, bid)
 4170                return
 4171            else:
 4172                return
 4173
 4174        yield from super().modify_tensors(data_torch, name, bid)
 4175
 4176    def prepare_tensors(self):
 4177        super().prepare_tensors()
 4178
 4179        if self._experts is not None:
 4180            # flatten `list[dict[str, Tensor]]` into `list[str]`
 4181            experts = [k for d in self._experts for k in d.keys()]
 4182            if len(experts) > 0:
 4183                raise ValueError(f"Unprocessed experts: {experts}")
 4184
 4185
 4186@ModelBase.register("Qwen3ForCausalLM")
 4187class Qwen3Model(Qwen2Model):
 4188    model_arch = gguf.MODEL_ARCH.QWEN3
 4189
 4190    # extra logic for rerank models
 4191    is_rerank: bool = False
 4192    is_tied_embeddings: bool = False
 4193    token_false_id: int | None = None
 4194    token_true_id: int | None = None
 4195
 4196    def __init__(self, *args, **kwargs):
 4197        super().__init__(*args, **kwargs)
 4198
 4199        # track for intern-s1-mini
 4200        hparams = ModelBase.load_hparams(self.dir_model, is_mistral_format=False)
 4201        self.origin_hf_arch = hparams.get('architectures', [None])[0]
 4202
 4203        # a bit hacky, but currently the only way to detect if this is a rerank model
 4204        # ref: https://huggingface.co/Qwen/Qwen3-Reranker-0.6B
 4205        readme_path = self.dir_model / "README.md"
 4206        readme_text = ""
 4207        if readme_path.exists():
 4208            with readme_path.open("r", encoding="utf-8") as f:
 4209                readme_text = f.read()
 4210        if "# Qwen3-Reranker" in readme_text:
 4211            self._find_rerank_config()
 4212
 4213    def set_vocab(self):
 4214        # deal with intern-s1-mini
 4215        if self.origin_hf_arch == 'InternS1ForConditionalGeneration':
 4216            self._set_vocab_interns1()
 4217            return
 4218
 4219        super().set_vocab()
 4220
 4221    def _find_rerank_config(self):
 4222        from transformers import AutoTokenizer
 4223        tokenizer = AutoTokenizer.from_pretrained(self.dir_model)
 4224
 4225        self.is_rerank = True
 4226        self.is_tied_embeddings = self.hparams.get("tie_word_embeddings", False)
 4227        self.token_false_id = tokenizer.convert_tokens_to_ids("no")
 4228        self.token_true_id = tokenizer.convert_tokens_to_ids("yes")
 4229        self.sep_token_id = tokenizer.convert_tokens_to_ids("|")
 4230
 4231        assert self.token_false_id is not None and self.token_true_id is not None
 4232
 4233    def set_gguf_parameters(self):
 4234        super().set_gguf_parameters()
 4235        if self.is_rerank:
 4236            self.gguf_writer.add_pooling_type(gguf.PoolingType.RANK)
 4237            self.gguf_writer.add_classifier_output_labels(["yes", "no"])
 4238            self.gguf_writer.add_chat_template([{
 4239                "name": "rerank",
 4240                "template": "<|im_start|>system\nJudge whether the Document meets the requirements based on the Query and the Instruct provided. Note that the answer can only be \"yes\" or \"no\".<|im_end|>\n"
 4241                            "<|im_start|>user\n<Instruct>: Given a web search query, retrieve relevant passages that answer the query\n<Query>: {query}\n<Document>: {document}<|im_end|>\n"
 4242                            "<|im_start|>assistant\n<think>\n\n</think>\n\n"
 4243            }])
 4244
 4245    def _get_cls_out_tensor(self, data_torch: Tensor) -> Tensor:
 4246        # extract "yes" and "no" tokens from the output lm_head tensor
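             # row order [true, false] mirrors the classifier labels ["yes", "no"] added in set_gguf_parameters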
 4247        false_row = data_torch[self.token_false_id]
 4248        true_row = data_torch[self.token_true_id]
 4249        return torch.stack([true_row, false_row], dim=0)
 4250
 4251    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
 4252        if "model.vision_" in name:
 4253            # skip multimodal tensors
 4254            return
 4255
 4256        if self.is_rerank:
 4257            is_tied_head = self.is_tied_embeddings and "embed_tokens" in name
 4258            is_real_head = not self.is_tied_embeddings and "lm_head" in name
 4259            if is_tied_head or is_real_head:
 4260                cls_out_head = (
 4261                    gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.CLS_OUT] + ".weight",
 4262                    self._get_cls_out_tensor(data_torch),
 4263                )
 4264                yield cls_out_head
 4265                if is_tied_head:
 4266                    yield from super().modify_tensors(data_torch, name, bid)
 4267                return
 4268
 4269        yield from super().modify_tensors(data_torch, name, bid)
 4270
 4271
 4272@ModelBase.register("Qwen3MoeForCausalLM")
 4273class Qwen3MoeModel(Qwen2MoeModel):
 4274    model_arch = gguf.MODEL_ARCH.QWEN3MOE
 4275
 4276    def __init__(self, *args, **kwargs):
 4277        super().__init__(*args, **kwargs)
 4278        hparams = ModelBase.load_hparams(self.dir_model, False)
 4279        self.origin_hf_arch = hparams.get('architectures', [None])[0]
 4280
 4281    def set_vocab(self):
 4282        # deal with intern-s1
 4283        if self.origin_hf_arch == 'InternS1ForConditionalGeneration':
 4284            self._set_vocab_interns1()
 4285            return
 4286
 4287        super().set_vocab()
 4288
 4289
 4290@ModelBase.register("Qwen3NextForCausalLM")
 4291class Qwen3NextModel(Qwen2MoeModel):
 4292    model_arch = gguf.MODEL_ARCH.QWEN3NEXT
 4293
 4294    def set_gguf_parameters(self):
 4295        super().set_gguf_parameters()
 4296        self.gguf_writer.add_ssm_conv_kernel(self.hparams["linear_conv_kernel_dim"])
 4297        self.gguf_writer.add_ssm_state_size(self.hparams["linear_key_head_dim"])
 4298        self.gguf_writer.add_ssm_group_count(self.hparams["linear_num_key_heads"])
 4299        self.gguf_writer.add_ssm_time_step_rank(self.hparams["linear_num_value_heads"])
 4300        self.gguf_writer.add_ssm_inner_size(self.hparams["linear_value_head_dim"] * self.hparams["linear_num_value_heads"])
 4301        self.gguf_writer.add_full_attention_interval(self.hparams.get("full_attention_interval", 4))
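             # partial RoPE: only a partial_rotary_factor fraction of each head dimension is rotated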
 4302        if (rope_dim := self.hparams.get("head_dim")) is None:
 4303            rope_dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
 4304        self.gguf_writer.add_rope_dimension_count(int(rope_dim * self.hparams.get("partial_rotary_factor", 0.25)))
 4305
 4306    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
 4307        if name.startswith("mtp"):
 4308            return  # ignore MTP layers for now
 4309        if name.endswith(".A_log"):
 4310            data_torch = -torch.exp(data_torch)
 4311        elif name.endswith(".dt_bias"):
 4312            name = name.rpartition(".dt_bias")[0] + ".dt_proj.bias"
 4313        elif "conv1d" in name:
 4314            data_torch = data_torch.squeeze()
 4315        elif name.endswith("norm.weight") and not name.endswith("linear_attn.norm.weight"):
 4316            data_torch = data_torch + 1
 4317
 4318        if "in_proj_qkvz.weight" in name:
 4319            # original order:  [q, k, v, z] * head_count
 4320            # corrected order: [q * head_count, k * head_count, v * head_count, z * head_count]
 4321            head_k_dim = self.hparams["linear_key_head_dim"]
 4322            head_v_dim = self.hparams["linear_value_head_dim"]
 4323            num_v_heads = self.hparams["linear_num_value_heads"]
 4324            num_k_heads = self.hparams["linear_num_key_heads"]
 4325            hidden_size = self.hparams["hidden_size"]
 4326            split_arg_list_qkvz = [
 4327                head_k_dim, # q partition
 4328                head_k_dim, # k partition
 4329                (num_v_heads // num_k_heads * head_v_dim), # v partition
 4330                (num_v_heads // num_k_heads * head_v_dim), # z partition
 4331            ]
 4332            # view as (n_embd, head_count, [q+k+v+z])
 4333            data_torch = data_torch.permute(1, 0).contiguous()
 4334            data_torch = data_torch.view(-1, num_k_heads, sum(split_arg_list_qkvz))
 4335            # split into q, k, v, z
 4336            q, k, v, z = torch.split(data_torch, split_arg_list_qkvz, dim=-1)
 4337            # flatten dim + head_count
 4338            q = q.contiguous().view(hidden_size, -1)
 4339            k = k.contiguous().view(hidden_size, -1)
 4340            v = v.contiguous().view(hidden_size, -1)
 4341            z = z.contiguous().view(hidden_size, -1)
 4342            # stack back
 4343            qkv = torch.cat([q, k, v], dim=-1).permute(1, 0).contiguous()
 4344            z = z.permute(1, 0).contiguous()
 4345            yield (self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_QKV,  bid, ".weight"), qkv)
 4346            yield (self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_GATE, bid, ".weight"), z)
 4347        else:
 4348            yield from super().modify_tensors(data_torch, name, bid)
 4349
 4350
 4351@ModelBase.register("RND1")
 4352class RND1Model(Qwen2MoeModel):
 4353    model_arch = gguf.MODEL_ARCH.RND1
 4354
 4355    def set_gguf_parameters(self):
 4356        super().set_gguf_parameters()
 4357
 4358        # RND1 specific parameters
 4359        # RND1 uses bidirectional attention
 4360        self.gguf_writer.add_causal_attention(False)
 4361
 4362        if (mask_token_id := self.hparams.get("mask_token_id")) is not None:
 4363            self.gguf_writer.add_mask_token_id(mask_token_id)
 4364
 4365
 4366@ModelBase.register("Qwen3VLForConditionalGeneration", "Qwen3VLMoeForConditionalGeneration", "Qwen3_5ForConditionalGeneration", "Qwen3_5MoeForConditionalGeneration")
 4367class Qwen3VLVisionModel(MmprojModel):
 4368    def __init__(self, *args, **kwargs):
 4369        super().__init__(*args, **kwargs)
 4370        assert self.hparams_vision is not None
 4371        # Compute image_size if not present
 4372        if "image_size" not in self.hparams_vision:
 4373            # For Qwen3VL/Qwen3VLMoe, compute from num_position_embeddings
 4374            num_pos = self.hparams_vision.get("num_position_embeddings", 2304)
 4375            patch_size = self.hparams_vision.get("patch_size", 16)
 4376            # num_position_embeddings = (image_size / patch_size) ** 2
 4377            # So image_size = sqrt(num_position_embeddings) * patch_size
 4378            image_size = int(num_pos**0.5 * patch_size)
 4379            self.hparams_vision["image_size"] = image_size
 4380
 4381        # Rename config values for compatibility
 4382        self.hparams_vision["num_attention_heads"] = self.hparams_vision.get("num_heads")
 4383        self.hparams_vision["num_hidden_layers"] = self.hparams_vision.get("depth")
 4384
 4385        self.is_deepstack_layers = [False] * int(self.hparams_vision["num_hidden_layers"] or 0)
 4386        for idx in self.hparams_vision.get("deepstack_visual_indexes", []):
 4387            self.is_deepstack_layers[idx] = True
 4388
 4389    def set_gguf_parameters(self):
 4390        super().set_gguf_parameters()
 4391        self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.QWEN3VL)
 4392        self.gguf_writer.add_vision_use_gelu(True)
 4393
 4394        if self.hparams_vision is not None:
 4395            merge_size = self.hparams_vision.get("spatial_merge_size")
 4396            if merge_size is not None:
 4397                self.gguf_writer.add_vision_spatial_merge_size(int(merge_size))
 4398
 4399        # Use text config's rms_norm_eps for vision attention layernorm eps
 4400        rms_norm_eps = self.global_config.get("text_config", {}).get("rms_norm_eps", 1e-6)
 4401        self.gguf_writer.add_vision_attention_layernorm_eps(rms_norm_eps)
 4402
 4403        if self.is_deepstack_layers:
 4404            self.gguf_writer.add_vision_is_deepstack_layers(self.is_deepstack_layers)
 4405
 4406    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
 4407        assert self.hparams_vision is not None
 4408        # Skip text model tensors - they go in the text model file
 4409        if name.startswith("model.language_model.") or name.startswith("lm_head."):
 4410            return
 4411
 4412        # Skip MTP tensors
 4413        if name.startswith("mtp."):
 4414            return
 4415
 4416        if name.startswith("model.visual."):
 4417            name = name.replace("model.visual.", "visual.", 1)
 4418
 4419        if name.startswith("visual.deepstack_merger_list."):
 4420            prefix, rest = name.split(".", maxsplit=3)[2:]
 4421            # prefix is the layer index, convert to absolute clip layer index!
 4422            idx = self.hparams_vision.get("deepstack_visual_indexes", [])[int(prefix)]
 4423            target = rest
 4424
 4425            tensor_type: gguf.MODEL_TENSOR
 4426            if target.startswith("norm."):
 4427                tensor_type = gguf.MODEL_TENSOR.V_DS_NORM
 4428                suffix = target.split(".", 1)[1]
 4429            elif target.startswith("linear_fc1."):
 4430                tensor_type = gguf.MODEL_TENSOR.V_DS_FC1
 4431                suffix = target.split(".", 1)[1]
 4432            elif target.startswith("linear_fc2."):
 4433                tensor_type = gguf.MODEL_TENSOR.V_DS_FC2
 4434                suffix = target.split(".", 1)[1]
 4435            else:
 4436                raise ValueError(f"Unexpected deepstack tensor: {name}")
 4437
 4438            new_name = self.format_tensor_name(tensor_type, idx, suffix=f".{suffix}")
 4439            yield from super().modify_tensors(data_torch, new_name, bid)
 4440            return
 4441
 4442        if name.startswith("visual.merger."):
 4443            suffix = name.split(".", 2)[2]
 4444            if suffix.startswith("linear_fc"):
 4445                fc_idx_str, tail = suffix.split(".", 1)
 4446                fc_num = int(fc_idx_str.replace("linear_fc", ""))
 4447                # Qwen3VL has linear_fc1 and linear_fc2
 4448                # map them to mmproj indices 0 and 2, matching the layout used by Qwen2VL
 4449                if fc_num == 1:
 4450                    fc_idx = 0
 4451                elif fc_num == 2:
 4452                    fc_idx = 2
 4453                else:
 4454                    raise ValueError(f"unexpected fc index {fc_num} in {name}")
 4455                new_name = self.format_tensor_name(gguf.MODEL_TENSOR.V_MMPROJ, fc_idx, suffix=f".{tail}")
 4456            elif suffix.startswith("norm."):
 4457                new_name = self.format_tensor_name(gguf.MODEL_TENSOR.V_POST_NORM, suffix=f".{suffix.split('.', 1)[1]}")
 4458            else:
 4459                raise ValueError(f"Unexpected merger tensor: {name}")
 4460            yield (new_name, data_torch)
 4461            return
 4462
 4463        if name == "visual.patch_embed.proj.weight":
 4464            # split Conv3D into Conv2Ds along temporal dimension
 4465            c1, c2, kt, _, _ = data_torch.shape
 4466            del c1, c2
 4467            if kt != 2:
 4468                raise ValueError("Current implementation only supports temporal_patch_size of 2")
 4469            yield (gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_ENC_EMBD_PATCH] + ".weight", data_torch[:, :, 0, ...])
 4470            yield (gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_ENC_EMBD_PATCH] + ".weight.1", data_torch[:, :, 1, ...])
 4471            return
 4472
 4473        if name == "visual.patch_embed.proj.bias":
 4474            # Include the bias - it's used by the C++ code
 4475            yield (gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_ENC_EMBD_PATCH] + ".bias", data_torch)
 4476            return
 4477
 4478        if name.startswith("visual."):
 4479            yield from super().modify_tensors(data_torch, name, bid)
 4480            return
 4481
 4482        # Fall back to parent class for other tensors
 4483        yield from super().modify_tensors(data_torch, name, bid)
 4484
 4485
 4486@ModelBase.register("Glm4vForConditionalGeneration", "Glm4vMoeForConditionalGeneration")
 4487class Glm4VVisionModel(Qwen3VLVisionModel):
 4488    def set_gguf_parameters(self):
 4489        MmprojModel.set_gguf_parameters(self) # skip Qwen3VLVisionModel parameters
 4490        assert self.hparams_vision is not None
 4491        self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.GLM4V)
 4492
 4493        hidden_act = str(self.hparams_vision.get("hidden_act", "")).lower()
 4494        if hidden_act == "gelu":
 4495            self.gguf_writer.add_vision_use_gelu(True)
 4496        elif hidden_act == "silu":
 4497            self.gguf_writer.add_vision_use_silu(True)
 4498
 4499        rms_norm_eps = self.hparams_vision.get("rms_norm_eps", 1e-5)
 4500        self.gguf_writer.add_vision_attention_layernorm_eps(rms_norm_eps)
 4501
 4502    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
 4503        if name.startswith("model.visual."):
 4504            name = name.replace("model.visual.", "visual.")
 4505        if name.startswith("visual.merger."):
 4506            yield from ModelBase.modify_tensors(self, data_torch, name, bid)
 4507            return
 4508        yield from super().modify_tensors(data_torch, name, bid)
 4509
 4510
 4511@ModelBase.register("Qwen3VLForConditionalGeneration")
 4512class Qwen3VLTextModel(Qwen3Model):
 4513    model_arch = gguf.MODEL_ARCH.QWEN3VL
 4514
 4515    def set_gguf_parameters(self):
 4516        super().set_gguf_parameters()
 4517
 4518        # Handle MRoPE (Multi-axis Rotary Position Embedding) for Qwen3-VL
 4519        vision_config = self.hparams.get("vision_config", {})
 4520        deepstack_layer_num = len(vision_config.get("deepstack_visual_indexes", []))
 4521        self.gguf_writer.add_num_deepstack_layers(deepstack_layer_num)
 4522
 4523    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
 4524        # Skip vision tensors - they go in the mmproj file
 4525        if name.startswith("model.visual."):
 4526            return
 4527
 4528        yield from super().modify_tensors(data_torch, name, bid)
 4529
 4530
 4531@ModelBase.register("Qwen3VLMoeForConditionalGeneration")
 4532class Qwen3VLMoeTextModel(Qwen3MoeModel):
 4533    model_arch = gguf.MODEL_ARCH.QWEN3VLMOE
 4534
 4535    def set_gguf_parameters(self):
 4536        super().set_gguf_parameters()
 4537        vision_config = self.hparams.get("vision_config", {})
 4538        deepstack_layer_num = len(vision_config.get("deepstack_visual_indexes", []))
 4539        self.gguf_writer.add_num_deepstack_layers(deepstack_layer_num)
 4540
 4541    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
 4542        # Skip vision tensors - they go in the mmproj file
 4543        if name.startswith("model.visual."):
 4544            return
 4545
 4546        # Qwen3VL has transposed packed tensors, so we treat it differently from general Qwen2MoE packed tensors
 4547        if name.endswith("mlp.experts.down_proj") or name.endswith("mlp.experts.down_proj.weight"):
 4548            name = name.replace("language_model.", "")
 4549            mapped = f"{name}.weight" if not name.endswith(".weight") else name
 4550            permuted = data_torch.permute(0, 2, 1).contiguous()
 4551            yield from ModelBase.modify_tensors(self, permuted, mapped, bid)
 4552            return
 4553
 4554        if name.endswith("mlp.experts.gate_up_proj") or name.endswith("mlp.experts.gate_up_proj.weight"):
 4555            name = name.replace("language_model.", "")
 4556            if data_torch.ndim < 3 or data_torch.shape[-1] % 2 != 0:
 4557                raise ValueError(f"Unexpected gate_up_proj shape for {name}: {tuple(data_torch.shape)}")
 4558            split_dim = data_torch.shape[-1] // 2
 4559            gate = data_torch[..., :split_dim].contiguous()
 4560            up = data_torch[..., split_dim:].contiguous()
 4561            # Input gate/up: (n_expert=128, n_embd=2048, n_ff_exp=768)
 4562            # Want GGML ne: {n_embd, n_ff_exp, n_expert} = {2048, 768, 128}
 4563            # Need PyTorch: (128, 768, 2048) [reversed of GGML]
 4564            # So: permute(0, 2, 1): (128, 2048, 768) -> (128, 768, 2048)
 4565            base_name = name.removesuffix(".weight")
 4566            base = base_name.rsplit('.', 1)[0]
 4567            mapped_gate = f"{base}.gate_proj.weight"
 4568            mapped_up = f"{base}.up_proj.weight"
 4569            perm_gate = gate.permute(0, 2, 1).contiguous()
 4570            perm_up = up.permute(0, 2, 1).contiguous()
 4571            yield from ModelBase.modify_tensors(self, perm_gate, mapped_gate, bid)
 4572            yield from ModelBase.modify_tensors(self, perm_up, mapped_up, bid)
 4573            return
 4574
 4575        yield from super().modify_tensors(data_torch, name, bid)
 4576
 4577
 4578class _LinearAttentionVReorderBase(Qwen3NextModel):
 4579    """Reorders V heads from grouped to tiled order for ggml broadcast.
 4580
 4581    See https://github.com/ggml-org/llama.cpp/pull/19468#discussion_r2786394306
 4582
 4583    Linear attention may have num_k_heads < num_v_heads. The HF weights store
 4584    V heads grouped by K head: [G0_v0..v{r-1}, G1_v0..v{r-1}, ...].
 4585    ggml binary ops use tiled broadcast: [K0, K1, ..., K0, K1, ...].
 4586    We reorder V heads to tiled order so ggml_repeat can replace the expensive
 4587    interleaved repeat: [G0_v0, G1_v0, ..., G0_v1, G1_v1, ...].
 4588    """
 4589    model_arch = gguf.MODEL_ARCH.QWEN3NEXT  # overridden by subclasses
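         # Illustrative example (not in the original source): with num_k_heads=2 and
         # num_v_per_k=2, the grouped HF order [G0_v0, G0_v1, G1_v0, G1_v1] becomes
         # the tiled order [G0_v0, G1_v0, G0_v1, G1_v1] after _reorder_v_heads,
         # matching the order in which ggml_repeat tiles the K heads.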
 4590
 4591    @staticmethod
 4592    def _reorder_v_heads(tensor: Tensor, dim: int, num_k_heads: int, num_v_per_k: int, head_dim: int) -> Tensor:
 4593        """Reorder V heads from grouped (by K head) to tiled order along the given dimension."""
 4594        shape = list(tensor.shape)
 4595        if dim < 0:
 4596            dim += len(shape)
 4597        new_shape = shape[:dim] + [num_k_heads, num_v_per_k, head_dim] + shape[dim + 1:]
 4598        tensor = tensor.reshape(*new_shape)
 4599        perm = list(range(len(new_shape)))
 4600        perm[dim], perm[dim + 1] = perm[dim + 1], perm[dim]
 4601        return tensor.permute(*perm).contiguous().reshape(*shape)
 4602
 4603    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
 4604        num_k_heads = self.hparams.get("linear_num_key_heads", 0)
 4605        num_v_heads = self.hparams.get("linear_num_value_heads", 0)
 4606
 4607        if num_k_heads > 0 and num_v_heads > 0 and num_k_heads != num_v_heads and "linear_attn." in name:
 4608            head_k_dim = self.hparams["linear_key_head_dim"]
 4609            head_v_dim = self.hparams["linear_value_head_dim"]
 4610            num_v_per_k = num_v_heads // num_k_heads
 4611
 4612            if ".in_proj_qkv." in name:
 4613                # QKV weight: reorder only the V rows
 4614                q_dim = head_k_dim * num_k_heads
 4615                k_dim = head_k_dim * num_k_heads
 4616                q = data_torch[:q_dim]
 4617                k = data_torch[q_dim:q_dim + k_dim]
 4618                v = data_torch[q_dim + k_dim:]
 4619                v = self._reorder_v_heads(v, 0, num_k_heads, num_v_per_k, head_v_dim)
 4620                data_torch = torch.cat([q, k, v], dim=0)
 4621
 4622            elif ".in_proj_z." in name:
 4623                # Z gate weight: reorder rows (num_v_heads * head_v_dim)
 4624                data_torch = self._reorder_v_heads(data_torch, 0, num_k_heads, num_v_per_k, head_v_dim)
 4625
 4626            elif ".in_proj_b." in name or ".in_proj_a." in name:
 4627                # Beta/Alpha weight: reorder rows (num_v_heads, head_dim=1)
 4628                data_torch = self._reorder_v_heads(data_torch, 0, num_k_heads, num_v_per_k, 1)
 4629
 4630            elif ".A_log" in name or ".dt_bias" in name or ".dt_proj" in name:
 4631                # A_log / dt_bias: 1D parameters with num_v_heads elements; dt_proj: num_v_heads along its last dim
 4632                if data_torch.ndim == 1:
 4633                    data_torch = self._reorder_v_heads(
 4634                        data_torch.unsqueeze(-1), 0, num_k_heads, num_v_per_k, 1
 4635                    ).squeeze(-1)
 4636                else:
 4637                    data_torch = self._reorder_v_heads(data_torch, -1, num_k_heads, num_v_per_k, 1)
 4638
 4639            elif ".conv1d" in name:
 4640                # Conv1d kernel: reorder only the V channel portion
 4641                data = data_torch.squeeze()
 4642                qk_channels = head_k_dim * num_k_heads * 2
 4643                qk_part = data[:qk_channels]
 4644                v_part = data[qk_channels:]
 4645                v_part = self._reorder_v_heads(v_part, 0, num_k_heads, num_v_per_k, head_v_dim)
 4646                data_torch = torch.cat([qk_part, v_part], dim=0)
 4647
 4648            elif ".out_proj." in name:
 4649                # Out projection weight: reorder columns (input dimension)
 4650                data_torch = self._reorder_v_heads(data_torch, 1, num_k_heads, num_v_per_k, head_v_dim)
 4651
 4652        yield from super().modify_tensors(data_torch, name, bid)
 4653
 4654
 4655@ModelBase.register("Qwen3_5ForConditionalGeneration")
 4656class Qwen3_5TextModel(_LinearAttentionVReorderBase):
 4657    model_arch = gguf.MODEL_ARCH.QWEN35
 4658
 4659
 4660@ModelBase.register("Qwen3_5MoeForConditionalGeneration")
 4661class Qwen3_5MoeTextModel(_LinearAttentionVReorderBase):
 4662    model_arch = gguf.MODEL_ARCH.QWEN35MOE
 4663
 4664
 4665@ModelBase.register("GPT2LMHeadModel")
 4666class GPT2Model(TextModel):
 4667    model_arch = gguf.MODEL_ARCH.GPT2
 4668
 4669    def set_gguf_parameters(self):
 4670        self.gguf_writer.add_block_count(self.block_count)
 4671        self.gguf_writer.add_context_length(self.hparams["n_ctx"])
 4672        self.gguf_writer.add_embedding_length(self.hparams["n_embd"])
 4673        self.gguf_writer.add_feed_forward_length(4 * self.hparams["n_embd"])
 4674        self.gguf_writer.add_head_count(self.hparams["n_head"])
 4675        self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"])
 4676        self.gguf_writer.add_file_type(self.ftype)
 4677
 4678    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
 4679        # we don't need these buffers, so skip them entirely
 4680        if name.endswith((".attn.bias", ".attn.masked_bias")):
 4681            return
 4682
 4683        # HF GPT-2 uses Conv1D modules, whose weights are transposed relative to nn.Linear
 4684        if name.endswith((".c_attn.weight", ".c_proj.weight", ".c_fc.weight")):
 4685            data_torch = data_torch.transpose(1, 0)
 4686
 4687        new_name = self.map_tensor_name(name)
 4688
 4689        yield from super().modify_tensors(data_torch, new_name, bid)
 4690
 4691
 4692@ModelBase.register("PhiForCausalLM")
 4693class Phi2Model(TextModel):
 4694    model_arch = gguf.MODEL_ARCH.PHI2
 4695
 4696    def set_gguf_parameters(self):
 4697        rot_pct = self.find_hparam(["partial_rotary_factor"])
 4698        n_embd = self.find_hparam(["hidden_size", "n_embd"])
 4699        n_head = self.find_hparam(["num_attention_heads", "n_head"])
 4700
 4701        self.gguf_writer.add_context_length(self.find_hparam(["n_positions", "max_position_embeddings"]))
 4702
 4703        self.gguf_writer.add_embedding_length(n_embd)
 4704        self.gguf_writer.add_feed_forward_length(4 * n_embd)
 4705        self.gguf_writer.add_block_count(self.block_count)
 4706        self.gguf_writer.add_head_count(n_head)
 4707        self.gguf_writer.add_head_count_kv(n_head)
 4708        self.gguf_writer.add_layer_norm_eps(self.find_hparam(["layer_norm_epsilon", "layer_norm_eps"]))
 4709        self.gguf_writer.add_rope_dimension_count(int(rot_pct * n_embd) // n_head)
 4710        self.gguf_writer.add_file_type(self.ftype)
 4711        self.gguf_writer.add_add_bos_token(False)
 4712
 4713
 4714@ModelBase.register("Phi3ForCausalLM")
 4715class Phi3MiniModel(TextModel):
 4716    model_arch = gguf.MODEL_ARCH.PHI3
 4717
 4718    def set_vocab(self):
 4719        # Phi-4 model uses GPT2Tokenizer
 4720        tokenizer_config_file = self.dir_model / 'tokenizer_config.json'
 4721        if tokenizer_config_file.is_file():
 4722            with open(tokenizer_config_file, "r", encoding="utf-8") as f:
 4723                tokenizer_config_json = json.load(f)
 4724                tokenizer_class = tokenizer_config_json['tokenizer_class']
 4725                if tokenizer_class == 'GPT2Tokenizer':
 4726                    return self._set_vocab_gpt2()
 4727
 4728        from sentencepiece import SentencePieceProcessor
 4729
 4730        tokenizer_path = self.dir_model / 'tokenizer.model'
 4731
 4732        if not tokenizer_path.is_file():
 4733            raise ValueError(f'Error: Missing {tokenizer_path}')
 4734
 4735        tokenizer = SentencePieceProcessor()
 4736        tokenizer.LoadFromFile(str(tokenizer_path))
 4737
 4738        vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size())
 4739
 4740        tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
 4741        scores: list[float] = [-10000.0] * vocab_size
 4742        toktypes: list[int] = [SentencePieceTokenTypes.UNUSED] * vocab_size
 4743
 4744        for token_id in range(tokenizer.vocab_size()):
 4745
 4746            piece = tokenizer.IdToPiece(token_id)
 4747            text = piece.encode("utf-8")
 4748            score = tokenizer.GetScore(token_id)
 4749
 4750            toktype = SentencePieceTokenTypes.NORMAL
 4751            if tokenizer.IsUnknown(token_id):
 4752                toktype = SentencePieceTokenTypes.UNKNOWN
 4753            elif tokenizer.IsControl(token_id):
 4754                toktype = SentencePieceTokenTypes.CONTROL
 4755            elif tokenizer.IsUnused(token_id):
 4756                toktype = SentencePieceTokenTypes.UNUSED
 4757            elif tokenizer.IsByte(token_id):
 4758                toktype = SentencePieceTokenTypes.BYTE
 4759
 4760            tokens[token_id] = text
 4761            scores[token_id] = score
 4762            toktypes[token_id] = toktype
 4763
 4764        added_tokens_file = self.dir_model / 'added_tokens.json'
 4765        if added_tokens_file.is_file():
 4766            with open(added_tokens_file, "r", encoding="utf-8") as f:
 4767                added_tokens_json = json.load(f)
 4768
 4769                for key in added_tokens_json:
 4770                    token_id = added_tokens_json[key]
 4771                    if token_id >= vocab_size:
 4772                        logger.debug(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}')
 4773                        continue
 4774
 4775                    tokens[token_id] = key.encode("utf-8")
 4776                    scores[token_id] = -1000.0
 4777                    toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
 4778
 4779        tokenizer_config_file = self.dir_model / 'tokenizer_config.json'
 4780        if tokenizer_config_file.is_file():
 4781            with open(tokenizer_config_file, "r", encoding="utf-8") as f:
 4782                tokenizer_config_json = json.load(f)
 4783                added_tokens_decoder = tokenizer_config_json.get("added_tokens_decoder", {})
 4784                for token_id, token_data in added_tokens_decoder.items():
 4785                    token_id = int(token_id)
 4786                    token = token_data["content"].encode("utf-8")
 4787                    if toktypes[token_id] != SentencePieceTokenTypes.UNUSED:
 4788                        if tokens[token_id] != token:
 4789                            logger.warning(f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} -> {token.decode("utf-8")!r}')
 4790                    tokens[token_id] = token
 4791                    scores[token_id] = -1000.0
 4792                    toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
 4793                    if token_data.get("special"):
 4794                        toktypes[token_id] = SentencePieceTokenTypes.CONTROL
 4795
 4796        tokenizer_file = self.dir_model / 'tokenizer.json'
 4797        if tokenizer_file.is_file():
 4798            with open(tokenizer_file, "r", encoding="utf-8") as f:
 4799                tokenizer_json = json.load(f)
 4800                added_tokens = tokenizer_json.get("added_tokens", [])
 4801                for token_data in added_tokens:
 4802                    token_id = int(token_data["id"])
 4803                    token = token_data["content"].encode("utf-8")
 4804                    if toktypes[token_id] != SentencePieceTokenTypes.UNUSED:
 4805                        if tokens[token_id] != token:
 4806                            logger.warning(f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} -> {token.decode("utf-8")!r}')
 4807                    tokens[token_id] = token
 4808                    scores[token_id] = -1000.0
 4809                    toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
 4810                    if token_data.get("special"):
 4811                        toktypes[token_id] = SentencePieceTokenTypes.CONTROL
 4812
 4813        self.gguf_writer.add_tokenizer_model("llama")
 4814        self.gguf_writer.add_tokenizer_pre("default")
 4815        self.gguf_writer.add_token_list(tokens)
 4816        self.gguf_writer.add_token_scores(scores)
 4817        self.gguf_writer.add_token_types(toktypes)
 4818
 4819        special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
 4820        special_vocab.add_to_gguf(self.gguf_writer)
 4821
 4822    def set_gguf_parameters(self):
 4823        n_embd = self.find_hparam(["hidden_size", "n_embd"])
 4824        n_head = self.find_hparam(["num_attention_heads", "n_head"])
 4825        n_head_kv = self.find_hparam(["num_key_value_heads", "n_head_kv"])
 4826        rms_eps = self.find_hparam(["rms_norm_eps"])
 4827        max_pos_embds = self.find_hparam(["n_positions", "max_position_embeddings"])
 4828        orig_max_pos_embds = self.find_hparam(["original_max_position_embeddings"])
 4829        rot_pct = self.hparams.get("partial_rotary_factor", 1.0)
 4830        rope_dims = int(rot_pct * n_embd) // n_head
 4831
 4832        self.gguf_writer.add_context_length(max_pos_embds)
 4833        self.gguf_writer.add_rope_scaling_orig_ctx_len(orig_max_pos_embds)
 4834        self.gguf_writer.add_embedding_length(n_embd)
 4835        self.gguf_writer.add_feed_forward_length(self.find_hparam(["intermediate_size"]))
 4836        self.gguf_writer.add_block_count(self.block_count)
 4837        self.gguf_writer.add_head_count(n_head)
 4838        self.gguf_writer.add_head_count_kv(n_head_kv)
 4839        self.gguf_writer.add_layer_norm_rms_eps(rms_eps)
 4840        self.gguf_writer.add_rope_dimension_count(rope_dims)
 4841        self.gguf_writer.add_rope_freq_base(self.rope_parameters.get("full_attention", self.rope_parameters)["rope_theta"])
 4842        self.gguf_writer.add_file_type(self.ftype)
 4843        sliding_window = self.hparams.get("sliding_window")
 4844        # use zero value of sliding_window to distinguish Phi-4 from other PHI3 models
 4845        if sliding_window is None:
 4846            sliding_window = 0
 4847        self.gguf_writer.add_sliding_window(sliding_window)
 4848
 4849    def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
 4850        n_embd = self.find_hparam(["hidden_size", "n_embd"])
 4851        n_head = self.find_hparam(["num_attention_heads", "n_head"])
 4852        max_pos_embds = self.find_hparam(["n_positions", "max_position_embeddings"])
 4853        orig_max_pos_embds = self.find_hparam(["original_max_position_embeddings"])
 4854        rot_pct = self.hparams.get("partial_rotary_factor", 1.0)
 4855        rope_dims = int(rot_pct * n_embd) // n_head
 4856
 4857        # write rope scaling for long context (128k) model
 4858        rope_scaling = self.find_hparam(['rope_scaling'], True)
 4859        if rope_scaling is None:
 4860            return
 4861
 4862        scale = max_pos_embds / orig_max_pos_embds
 4863
 4864        rope_scaling_type = rope_scaling.get('rope_type', rope_scaling.get('type', '')).lower()
 4865        if len(rope_scaling_type) == 0:
 4866            raise KeyError('Missing the required key rope_scaling.type')
 4867
 4868        if rope_scaling_type == 'su' or rope_scaling_type == 'longrope':
 4869            attn_factor = math.sqrt(1 + math.log(scale) / math.log(orig_max_pos_embds)) if scale > 1.0 else 1.0
 4870        elif rope_scaling_type == 'yarn':
 4871            attn_factor = 0.1 * math.log(scale) + 1.0 if scale > 1.0 else 1.0
 4872        else:
 4873            raise NotImplementedError(f'The rope scaling type {rope_scaling_type} is not supported yet')
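         # In math terms (restating the code above): attn_factor = sqrt(1 + ln(scale) / ln(orig_ctx))
         # for 'su'/'longrope', and attn_factor = 0.1 * ln(scale) + 1 for 'yarn' (the YaRN mscale),
         # applied only when scale > 1.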
 4874
 4875        self.gguf_writer.add_rope_scaling_attn_factors(attn_factor)
 4876
 4877        long_factors = rope_scaling.get('long_factor', None)
 4878        short_factors = rope_scaling.get('short_factor', None)
 4879
 4880        if long_factors is None or short_factors is None:
 4881            raise KeyError('Missing the required key rope_scaling.long_factor or rope_scaling.short_factor')
 4882
 4883        if len(long_factors) != len(short_factors) or len(long_factors) != rope_dims / 2:
 4884            raise ValueError(f'The length of rope long and short factors must be {rope_dims / 2}. long_factors = {len(long_factors)}, short_factors = {len(short_factors)}.')
 4885
 4886        yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_LONG), torch.tensor(long_factors, dtype=torch.float32))
 4887        yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_SHORT), torch.tensor(short_factors, dtype=torch.float32))
 4888
 4889
 4890@ModelBase.register("PhiMoEForCausalLM")
 4891class PhiMoeModel(Phi3MiniModel):
 4892    model_arch = gguf.MODEL_ARCH.PHIMOE
 4893
 4894    _experts: list[dict[str, Tensor]] | None = None
 4895
 4896    def set_gguf_parameters(self):
 4897        super().set_gguf_parameters()
 4898        self.gguf_writer.add_expert_used_count(self.hparams["num_experts_per_tok"])
 4899        self.gguf_writer.add_expert_count(self.hparams["num_local_experts"])
 4900
 4901    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
 4902        # process the experts separately
 4903        if name.find("block_sparse_moe.experts") != -1:
 4904            n_experts = self.hparams["num_local_experts"]
 4905            assert bid is not None
 4906
 4907            if self._experts is None:
 4908                self._experts = [{} for _ in range(self.block_count)]
 4909
 4910            self._experts[bid][name] = data_torch
 4911
 4912            if len(self._experts[bid]) >= n_experts * 3:
 4913                # merge the experts into a single 3d tensor
 4914                for w_name in ["w1", "w2", "w3"]:
 4915                    datas: list[Tensor] = []
 4916
 4917                    for xid in range(n_experts):
 4918                        ename = f"model.layers.{bid}.block_sparse_moe.experts.{xid}.{w_name}.weight"
 4919                        datas.append(self._experts[bid][ename])
 4920                        del self._experts[bid][ename]
 4921
 4922                    data_torch = torch.stack(datas, dim=0)
 4923
 4924                    merged_name = f"model.layers.{bid}.block_sparse_moe.experts.{w_name}.weight"
 4925
 4926                    yield from super().modify_tensors(data_torch, merged_name, bid)
 4927                return
 4928            else:
 4929                return
 4930
 4931        yield from super().modify_tensors(data_torch, name, bid)
 4932
 4933    def prepare_tensors(self):
 4934        super().prepare_tensors()
 4935
 4936        if self._experts is not None:
 4937            # flatten `list[dict[str, Tensor]]` into `list[str]`
 4938            experts = [k for d in self._experts for k in d.keys()]
 4939            if len(experts) > 0:
 4940                raise ValueError(f"Unprocessed experts: {experts}")
 4941
 4942
 4943@ModelBase.register("PlamoForCausalLM")
 4944class PlamoModel(TextModel):
 4945    model_arch = gguf.MODEL_ARCH.PLAMO
 4946
 4947    def set_vocab(self):
 4948        self._set_vocab_sentencepiece()
 4949
 4950    def set_gguf_parameters(self):
 4951        hparams = self.hparams
 4952
 4953        self.gguf_writer.add_context_length(4096)  # not in config.json
 4954        self.gguf_writer.add_embedding_length(hparams["hidden_size"])
 4955        self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
 4956        self.gguf_writer.add_block_count(self.block_count)
 4957        self.gguf_writer.add_head_count(hparams["num_attention_heads"])
 4958        self.gguf_writer.add_head_count_kv(5)  # hparams["num_key_value_heads"] is wrong
 4959        self.gguf_writer.add_layer_norm_rms_eps(hparams["rms_norm_eps"])
 4960        self.gguf_writer.add_file_type(self.ftype)
 4961
 4962    def shuffle_attn_q_weight(self, data_torch):
 4963        assert data_torch.size() == (5120, 5120)
 4964        data_torch = data_torch.reshape(8, 5, 128, 5120)
 4965        data_torch = torch.permute(data_torch, (1, 0, 2, 3))
 4966        data_torch = torch.reshape(data_torch, (5120, 5120))
 4967        return data_torch
 4968
 4969    def shuffle_attn_output_weight(self, data_torch):
 4970        assert data_torch.size() == (5120, 5120)
 4971        data_torch = data_torch.reshape(5120, 8, 5, 128)
 4972        data_torch = torch.permute(data_torch, (0, 2, 1, 3))
 4973        data_torch = torch.reshape(data_torch, (5120, 5120))
 4974        return data_torch
 4975
 4976    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
 4977        new_name = self.map_tensor_name(name)
 4978
 4979        # shuffle for broadcasting of gqa in ggml_mul_mat
 4980        if new_name.endswith("attn_q.weight"):
 4981            data_torch = self.shuffle_attn_q_weight(data_torch)
 4982        elif new_name.endswith("attn_output.weight"):
 4983            data_torch = self.shuffle_attn_output_weight(data_torch)
 4984
 4985        yield from super().modify_tensors(data_torch, name, bid)
 4986
 4987
 4988@ModelBase.register("Plamo2ForCausalLM", "PLaMo2ForCausalLM")
 4989class Plamo2Model(TextModel):
 4990    model_arch = gguf.MODEL_ARCH.PLAMO2
 4991
 4992    def set_vocab(self):
 4993        self._set_vocab_plamo()
 4994
 4995    def set_gguf_parameters(self):
 4996        hparams = self.hparams
 4997        self.gguf_writer.add_vocab_size(self.hparams["vocab_size"])
 4998
 4999        # Which layers are Mamba layers
 5000        # PLaMo 2 uses mamba_step to indicate the pattern (e.g., 2 means every other layer)
 5001        # This logic matches modeling_plamo.py's is_mamba function
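             # Illustrative example (not in the original source): with mamba_step=2 and more
             # than one layer, even-indexed layers are Mamba (head counts recorded as 0 below)
             # and odd-indexed layers use attention.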
 5002        mamba_step = hparams.get("mamba_step", 2)
 5003        mamba_enabled = hparams.get("mamba_enabled", True)
 5004        num_key_value_heads = []
 5005        num_attention_heads = []
 5006
 5007        if mamba_enabled:
 5008            for i in range(self.block_count):
 5009                if self.block_count <= (mamba_step // 2):
 5010                    # use attention in last layer
 5011                    is_mamba = (i != self.block_count - 1)
 5012                else:
 5013                    is_mamba = (i % mamba_step) != (mamba_step // 2)
 5014                if is_mamba:
 5015                    num_key_value_heads.append(0)
 5016                    num_attention_heads.append(0)
 5017                else:
 5018                    num_key_value_heads.append(hparams.get("num_key_value_heads", 4))
 5019                    num_attention_heads.append(hparams.get("num_attention_heads", 32))
 5020
 5021        if num_key_value_heads and num_attention_heads:
 5022            self.gguf_writer.add_head_count_kv(num_key_value_heads)
 5023            self.gguf_writer.add_head_count(num_attention_heads)
 5024
 5025        self.gguf_writer.add_context_length(hparams.get("max_position_embeddings", 2048))
 5026        self.gguf_writer.add_embedding_length(hparams.get("hidden_size", 4096))
 5027        self.gguf_writer.add_key_length(hparams.get("hidden_size_per_head", 128))
 5028        self.gguf_writer.add_value_length(hparams.get("hidden_size_per_head", 128))
 5029        self.gguf_writer.add_block_count(self.block_count)
 5030        self.gguf_writer.add_layer_norm_rms_eps(hparams.get("rms_norm_eps", 1e-06))
 5031        self.gguf_writer.add_rope_freq_base(self.rope_parameters.get("rope_theta", 10000))
 5032
 5033        # Mamba parameters
 5034        self.gguf_writer.add_ssm_state_size(hparams.get("mamba_d_state", 64))
 5035        self.gguf_writer.add_ssm_conv_kernel(hparams.get("mamba_d_conv", 4))
 5036        self.gguf_writer.add_ssm_time_step_rank(hparams.get("mamba_num_heads", 64))
 5037        intermediate_size = hparams.get("mamba_num_heads", 64) * hparams.get("hidden_size_per_head", 128)
 5038        self.gguf_writer.add_ssm_inner_size(intermediate_size)
 5039        self.gguf_writer.add_ssm_group_count(0)
 5040
 5041        # MLP feed forward parameters (for attention layers)
 5042        self.gguf_writer.add_feed_forward_length(hparams.get("intermediate_size", 13312))
 5043        self.gguf_writer.add_file_type(self.ftype)
 5044
 5045    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
 5046        if name.endswith(".A_log"):
 5047            data_torch = -torch.exp(data_torch)
 5048        elif name.endswith(".dt_bias"):
 5049            name = name.rpartition(".dt_bias")[0] + ".dt_proj.bias"
 5050        elif name.endswith(".dt_norm_weight"):
 5051            name = name.rpartition(".dt_norm_weight")[0] + ".dt_norm.weight"
 5052        elif name.endswith(".B_norm_weight"):
 5053            name = name.rpartition(".B_norm_weight")[0] + ".B_norm.weight"
 5054        elif name.endswith(".C_norm_weight"):
 5055            name = name.rpartition(".C_norm_weight")[0] + ".C_norm.weight"
 5056        elif name.endswith(".k_weight"):
 5057            name = name.rpartition(".k_weight")[0] + ".k.weight"
 5058        elif name.endswith(".q_weight"):
 5059            name = name.rpartition(".q_weight")[0] + ".q.weight"
 5060        elif name.endswith(".conv1d.weight"):
 5061            data_torch = torch.squeeze(data_torch)  # drop the singleton dim: (d_inner, 1, d_conv) -> (d_inner, d_conv)
 5062            assert data_torch.ndim == 2
 5063        elif name.endswith(".pre_mixer_norm.weight"):
 5064            data_torch += 1.0
 5065        elif name.endswith(".post_mixer_norm.weight"):
 5066            data_torch += 1.0 / 5
 5067        elif name.endswith(".pre_mlp_norm.weight"):
 5068            data_torch += 1.0
 5069        elif name.endswith(".post_mlp_norm.weight"):
 5070            data_torch += 1.0 / (5**1.5)
 5071        elif name.endswith(".norm.weight"):
 5072            data_torch += 1.0
 5073
 5074        yield from super().modify_tensors(data_torch, name, bid)
 5075
 5076
 5077@ModelBase.register("Plamo3ForCausalLM", "PLaMo3ForCausalLM")
 5078class Plamo3Model(TextModel):
 5079    model_arch = gguf.MODEL_ARCH.PLAMO3
 5080
 5081    def set_vocab(self):
 5082        self._set_vocab_plamo()
 5083
 5084        tokenizer_config_path = self.dir_model / "tokenizer_config.json"
 5085        tokenizer_config = {}
 5086
 5087        if tokenizer_config_path.is_file():
 5088            with open(tokenizer_config_path, encoding="utf-8") as f:
 5089                tokenizer_config = json.load(f)
 5090
 5091        chat_template = tokenizer_config.get("chat_template")
 5092        chat_template_jinja = self.dir_model / "chat_template.jinja"
 5093
 5094        if chat_template_jinja.is_file():
 5095            with open(chat_template_jinja, encoding="utf-8") as f:
 5096                chat_template = f.read()
 5097
 5098        if chat_template:
 5099            self.gguf_writer.add_chat_template(chat_template)
 5100
 5101    def set_gguf_parameters(self):
 5102        super().set_gguf_parameters()
 5103        self.gguf_writer.add_vocab_size(self.hparams["vocab_size"])
 5104        if (sliding_window := self.find_hparam(["window_size", "sliding_window"], optional=True)) is not None:
 5105            self.gguf_writer.add_sliding_window(sliding_window)
 5106            self.gguf_writer.add_sliding_window_pattern(self.hparams["sliding_window_pattern"])
 5107
 5108    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
 5109
 5110        if name.endswith(".pre_mixer_norm.weight"):
 5111            data_torch = data_torch + 1.0
 5112        elif name.endswith(".post_mixer_norm.weight"):
 5113            data_torch = data_torch + 1.0 / 5
 5114        elif name.endswith(".pre_mlp_norm.weight"):
 5115            data_torch = data_torch + 1.0
 5116        elif name.endswith(".post_mlp_norm.weight"):
 5117            data_torch = data_torch + 1.0 / (5**1.5)
 5118        elif name.endswith((".mixer.q_norm.weight", ".mixer.k_norm.weight")):
 5119            data_torch = data_torch + 1.0
 5120        elif name.endswith(".norm.weight"):
 5121            data_torch = data_torch + 1.0
 5122
 5123        yield from super().modify_tensors(data_torch, name, bid)
 5124
 5125
 5126@ModelBase.register("CodeShellForCausalLM")
 5127class CodeShellModel(TextModel):
 5128    model_arch = gguf.MODEL_ARCH.CODESHELL
 5129
 5130    def set_gguf_parameters(self):
 5131        self.gguf_writer.add_context_length(self.hparams["n_positions"])
 5132        self.gguf_writer.add_embedding_length(self.hparams["n_embd"])
 5133        self.gguf_writer.add_feed_forward_length(4 * self.hparams["n_embd"])
 5134        self.gguf_writer.add_block_count(self.block_count)
 5135        self.gguf_writer.add_head_count(self.hparams["n_head"])
 5136        self.gguf_writer.add_head_count_kv(self.hparams["num_query_groups"])
 5137        self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"])
 5138        self.gguf_writer.add_file_type(self.ftype)
 5139        self.gguf_writer.add_rope_freq_base(10000.0)
 5140        self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
 5141        self.gguf_writer.add_rope_scaling_factor(1.0)
 5142
 5143
 5144@ModelBase.register("KimiLinearModel", "KimiLinearForCausalLM")
 5145class KimiLinearModel(TextModel):
 5146    """Kimi-Linear model with hybrid MLA+KDA architecture"""
 5147    model_arch = gguf.MODEL_ARCH.KIMI_LINEAR
 5148
 5149    _experts: list[dict[str, Tensor]] | None = None
 5150
 5151    def set_vocab(self):
 5152        try:
 5153            self._set_vocab_gpt2()
 5154            return
 5155        except Exception:
 5156            pass
 5157
 5158        from transformers import AutoTokenizer
 5159        tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True)
 5160        tokpre = self.get_vocab_base_pre(tokenizer)
 5161
 5162        if tokpre == "kimi-k2":
 5163            # Build merges list using the approach similar to HunYuanMoE
 5164            merges = []
 5165            vocab = {}
 5166            mergeable_ranks = tokenizer.model._mergeable_ranks
 5167            for token, rank in mergeable_ranks.items():
 5168                vocab[QwenModel.token_bytes_to_string(token)] = rank
 5169                if len(token) == 1:
 5170                    continue
 5171                merged = QwenModel.bpe(mergeable_ranks, token, max_rank=rank)
 5172                if len(merged) == 2:
 5173                    merges.append(' '.join(map(QwenModel.token_bytes_to_string, merged)))
 5174            # Build token list
 5175            vocab_size = self.hparams["vocab_size"]
 5176            special_tokens = tokenizer.special_tokens
 5177            reverse_vocab = {id_ : encoded_tok for encoded_tok, id_ in {**vocab, **special_tokens}.items()}
 5178            tokens: list[str] = []
 5179            toktypes: list[int] = []
 5180
 5181            for i in range(vocab_size):
 5182                if i not in reverse_vocab:
 5183                    tokens.append(f"[PAD{i}]")
 5184                    toktypes.append(gguf.TokenType.UNUSED)
 5185                else:
 5186                    token = reverse_vocab[i]
 5187                    tokens.append(token)
 5188                    if i in special_tokens.values():
 5189                        toktypes.append(gguf.TokenType.CONTROL)
 5190                    else:
 5191                        toktypes.append(gguf.TokenType.NORMAL)
 5192
 5193            self.gguf_writer.add_tokenizer_model("gpt2")
 5194            self.gguf_writer.add_tokenizer_pre(tokpre)
 5195            self.gguf_writer.add_token_list(tokens)
 5196            self.gguf_writer.add_token_types(toktypes)
 5197            self.gguf_writer.add_token_merges(merges)
 5198
 5199            special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False)
 5200            special_vocab.add_to_gguf(self.gguf_writer)
 5201            # override eos id in config.json with tiktoken eos id
 5202            self.gguf_writer.add_eos_token_id(tokenizer.eos_id)
 5203        else:
 5204            raise NotImplementedError(f"Pre-tokenizer {tokpre!r} is not supported yet for Kimi-Linear!")
 5205
 5206    def set_gguf_parameters(self):
 5207        # note: To enable MLA KV cache, attention needs to be converted into MQA (ie: GQA with 1 group)
 5208        self.hparams["num_key_value_heads"] = 1
 5209
 5210        super().set_gguf_parameters()
 5211        self.gguf_writer.add_vocab_size(self.hparams["vocab_size"])
 5212
 5213        # KDA & MLA params
 5214        # Get ssm_d_conv from linear_attn_config.short_conv_kernel_size or ssm_d_conv
 5215        linear_attn_config = self.hparams["linear_attn_config"]
 5216        # n_head == 0 for KDA layers, n_head > 0 for MLA layers
 5217        # the full_attn_layers list is used to distinguish layer types
 5218        _num_kv_heads = []
 5219        _full_attn_layers = linear_attn_config["full_attn_layers"]
 5220        for il in range(self.hparams["num_hidden_layers"]):
 5221            if il + 1 in _full_attn_layers:
 5222                _num_kv_heads.append(self.hparams["num_key_value_heads"])
 5223            else:
 5224                _num_kv_heads.append(0)
 5225        assert len(_num_kv_heads) == self.hparams["num_hidden_layers"]
 5226        self.gguf_writer.add_head_count_kv(_num_kv_heads)
 5227
 5228        if (ssm_d_conv := linear_attn_config.get("short_conv_kernel_size")) is not None:
 5229            self.gguf_writer.add_ssm_conv_kernel(ssm_d_conv)
 5230        if (kda_head_dim := linear_attn_config.get("head_dim")) is not None:
 5231            self.gguf_writer.add_kda_head_dim(kda_head_dim)
 5232
 5233        # MLA params - use add_* methods that handle arch substitution
 5234        # Support both HuggingFace naming (q_lora_rank, kv_lora_rank) and internal naming (n_lora_q, n_lora_kv)
 5235        if (q_lora_rank := self.find_hparam(["q_lora_rank", "n_lora_q"], optional=True)) is not None:
 5236            self.gguf_writer.add_q_lora_rank(q_lora_rank)
 5237        # To enable MLA KV cache, MLA needs to be converted into MQA with larger heads, then decompresses to MHA
 5238        kv_lora_rank = self.find_hparam(["kv_lora_rank", "n_lora_kv"], optional=False)
 5239        self.gguf_writer.add_kv_lora_rank(kv_lora_rank)
 5240
 5241        # MLA head dimensions
 5242        # Support HuggingFace naming: qk_nope_head_dim, qk_rope_head_dim, v_head_dim
 5243        qk_nope_head_dim = self.hparams.get("qk_nope_head_dim")
 5244        # Rotation - use qk_rope_head_dim for Kimi
 5245        qk_rope_head_dim = self.find_hparam(["qk_rope_head_dim", "n_rot"], optional=False)
 5246        self.gguf_writer.add_rope_dimension_count(qk_rope_head_dim)
 5247        self.gguf_writer.add_key_length(kv_lora_rank + qk_rope_head_dim)
 5248        v_head_dim = self.hparams.get("v_head_dim")
 5249
 5250        # Calculate n_embd_head_k_mla = qk_nope_head_dim + qk_rope_head_dim
 5251        if (n_embd_head_k_mla := self.find_hparam(["n_embd_head_k_mla"], optional=True)) is not None:
 5252            self.gguf_writer.add_key_length_mla(n_embd_head_k_mla)
 5253        elif qk_nope_head_dim is not None:
 5254            n_embd_head_k_mla = qk_nope_head_dim + qk_rope_head_dim
 5255            self.gguf_writer.add_key_length_mla(n_embd_head_k_mla)
 5256
 5257        # n_embd_head_v_mla = v_head_dim
 5258        if (n_embd_head_v_mla := self.hparams.get("n_embd_head_v_mla")) is not None:
 5259            self.gguf_writer.add_value_length_mla(n_embd_head_v_mla)
 5260        elif v_head_dim is not None:
 5261            self.gguf_writer.add_value_length_mla(v_head_dim)
 5262
 5263        # moe_intermediate_size (1024 for Kimi)
 5264        self.gguf_writer.add_expert_feed_forward_length(self.hparams["moe_intermediate_size"])
 5265        # num_shared_experts (1 for Kimi)
 5266        self.gguf_writer.add_expert_shared_count(self.hparams["num_shared_experts"])
 5267        # first_k_dense_replace (1 for Kimi - first layer uses dense MLP)
 5268        self.gguf_writer.add_leading_dense_block_count(self.hparams["first_k_dense_replace"])
 5269        # Routed scaling factor (expert_weights_scale = 2.446 for Kimi)
 5270        self.gguf_writer.add_expert_weights_scale(self.hparams["routed_scaling_factor"])
 5271
 5272    def prepare_tensors(self):
 5273        super().prepare_tensors()
 5274        if self._experts is not None:
 5275            experts = [k for d in self._experts for k in d.keys()]
 5276            if len(experts) > 0:
 5277                raise ValueError(f"Unprocessed experts: {experts}")
 5278
 5279    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
 5280        logger.info(f"Processing {name}: shape before = {tuple(data_torch.shape)}")
 5281
 5282        # Handle KDA conv1d weights
 5283        # HuggingFace/vLLM stores as [d_inner, d_conv] (2D), memory layout: conv_step changes fastest
 5284        # llama.cpp expects ggml ne = [d_conv, 1, d_inner, 1], memory layout: ne[0]=d_conv changes fastest
 5285        # GGUF reverses numpy shape when writing, so numpy (1, d_inner, 1, d_conv) -> ggml ne = [d_conv, 1, d_inner, 1]
 5286        # Memory layouts match: both have conv_step (d_conv) changing fastest
 5287        if name.endswith((".q_conv1d.weight", ".k_conv1d.weight", ".v_conv1d.weight")):
 5288            # HF shape: [d_inner, d_conv] e.g. [4096, 4]
 5289            # Target numpy shape: (1, d_inner, 1, d_conv) -> ggml ne = [d_conv, 1, d_inner, 1]
 5290            if data_torch.ndim == 2:
 5291                d_inner, d_conv = data_torch.shape
 5292                # Reshape to (1, d_inner, 1, d_conv) - memory layout preserved (d_conv fastest)
 5293                data_torch = data_torch.reshape(1, d_inner, 1, d_conv)
 5294                logger.info(f"Reshaped conv1d weight {name}: [d_inner={d_inner}, d_conv={d_conv}] -> numpy {tuple(data_torch.shape)} -> ggml ne=[{d_conv}, 1, {d_inner}, 1]")
 5295            elif data_torch.ndim == 3:
 5296                # Already 3D [d_inner, 1, d_conv] from unsqueeze
 5297                d_inner, _, d_conv = data_torch.shape
 5298                data_torch = data_torch.reshape(1, d_inner, 1, d_conv)
 5299                logger.info(f"Reshaped conv1d weight {name}: [d_inner={d_inner}, 1, d_conv={d_conv}] -> numpy {tuple(data_torch.shape)} -> ggml ne=[{d_conv}, 1, {d_inner}, 1]")
 5300
 5301        # Kimi specific bias
 5302        if name.endswith("e_score_correction_bias"):
 5303            name = name.replace("e_score_correction_bias", "e_score_correction.bias")
 5304
 5305        # Handle A_log: HF stores it as [1, 1, num_heads, 1]
 5306        # llama.cpp expects ggml ne = [1, num_heads, 1, 1]
 5307        # GGUF reverses numpy shape: numpy (1, 1, num_heads, 1) -> ggml ne = [1, num_heads, 1, 1]
 5308        if name.endswith(".A_log"):
 5309            data_torch = -torch.exp(data_torch)
 5310        if name.endswith(".dt_bias"):
 5311            name = name.rpartition(".dt_bias")[0] + ".dt_proj.bias"
 5312            logger.info("Changed dt_bias to dt_proj.bias")
 5313
 5314        # process the experts separately
 5315        if name.find("block_sparse_moe.experts") != -1:
 5316            n_experts = self.find_hparam(["num_local_experts", "num_experts"], optional=False)
 5317            assert bid is not None
 5318
 5319            if self._experts is None:
 5320                self._experts = [{} for _ in range(self.block_count)]
 5321
 5322            self._experts[bid][name] = data_torch
 5323
 5324            if len(self._experts[bid]) >= n_experts * 3:
 5325                # merge the experts into a single 3d tensor
 5326                # w1: gate, w2: down, w3: up
 5327                for wid, tname in [("w1", gguf.MODEL_TENSOR.FFN_GATE_EXP),
 5328                                   ("w2", gguf.MODEL_TENSOR.FFN_DOWN_EXP),
 5329                                   ("w3", gguf.MODEL_TENSOR.FFN_UP_EXP)]:
 5330                    datas: list[Tensor] = []
 5331                    for xid in range(n_experts):
 5332                        ename = f"model.layers.{bid}.block_sparse_moe.experts.{xid}.{wid}.weight"
 5333                        datas.append(self._experts[bid][ename])
 5334                        del self._experts[bid][ename]
 5335                    data_torch = torch.stack(datas, dim=0)
 5336                    new_name = self.format_tensor_name(tname, bid)
 5337                    yield from super().modify_tensors(data_torch, new_name, bid)
 5338            return
 5339
 5340        # note: MLA with the absorption optimization needs kv_b_proj split into k_b_proj and v_b_proj, with k_b_proj transposed
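         # Shape sketch (inferred from the reshape below): kv_b_proj.weight is
         # (n_head_kv * (qk_nope_head_dim + v_head_dim), kv_lora_rank); k_b becomes
         # (n_head_kv, kv_lora_rank, qk_nope_head_dim) after the transpose and
         # v_b stays (n_head_kv, v_head_dim, kv_lora_rank).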
 5341        if name.endswith("kv_b_proj.weight"):
 5342            name_kb = name.replace("kv_b_proj", "k_b_proj")
 5343            name_vb = name.replace("kv_b_proj", "v_b_proj")
 5344            n_head_kv = self.hparams["num_key_value_heads"]
 5345            v_head_dim = self.find_hparam(["n_embd_head_v_mla", "v_head_dim"], optional=False)
 5346            qk_nope_head_dim = self.hparams["qk_nope_head_dim"]
 5347            logger.info(f"Splitting kv_b_proj with n_head_kv = {n_head_kv}")
 5348            assert data_torch.shape[0] == n_head_kv * (v_head_dim + qk_nope_head_dim)
 5349            kv_b = data_torch.view(n_head_kv, v_head_dim + qk_nope_head_dim, data_torch.shape[-1])
 5350            k_b, v_b = torch.split(kv_b, [qk_nope_head_dim, v_head_dim], dim=1)
 5351            k_b = k_b.transpose(1, 2)
 5352            yield from super().modify_tensors(k_b, name_kb, bid)
 5353            yield from super().modify_tensors(v_b, name_vb, bid)
 5354            return
 5355
 5356        yield from super().modify_tensors(data_torch, name, bid)
 5357
 5358
 5359@ModelBase.register("InternLM2ForCausalLM")
 5360class InternLM2Model(TextModel):
 5361    model_arch = gguf.MODEL_ARCH.INTERNLM2
 5362
 5363    def set_vocab(self):
 5364        # (TODO): Is there a better way?
 5365        # Copied from _set_vocab_sentencepiece; the only difference is that the character
 5366        # \x00 is treated specially and converted into an emoji character to prevent it from
 5367        # being mistakenly recognized as an empty string in C++.
 5368        from sentencepiece import SentencePieceProcessor
 5369        from sentencepiece import sentencepiece_model_pb2 as model
 5370
 5371        tokenizer_path = self.dir_model / 'tokenizer.model'
 5372
 5373        tokens: list[bytes] = []
 5374        scores: list[float] = []
 5375        toktypes: list[int] = []
 5376
 5377        if not tokenizer_path.is_file():
 5378            logger.error(f'Error: Missing {tokenizer_path}')
 5379            sys.exit(1)
 5380
 5381        sentencepiece_model = model.ModelProto()  # pyright: ignore[reportAttributeAccessIssue]
 5382        sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read())
 5383        add_prefix = sentencepiece_model.normalizer_spec.add_dummy_prefix
 5384
 5385        tokenizer = SentencePieceProcessor()
 5386        tokenizer.LoadFromFile(str(tokenizer_path))
 5387
 5388        vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size())
 5389
 5390        for token_id in range(vocab_size):
 5391            piece = tokenizer.IdToPiece(token_id)
 5392            text = piece.encode("utf-8")
 5393            score = tokenizer.GetScore(token_id)
 5394            if text == b"\x00":
 5395                # (TODO): fixme
 5396                # Hack: replace the \x00 character so it is not mistaken for an empty string.
 5397                logger.warning(f"InternLM2: converting token '{text}' to '🐉'!")
 5398                text = "🐉".encode("utf-8")
 5399
 5400            toktype = SentencePieceTokenTypes.NORMAL
 5401            if tokenizer.IsUnknown(token_id):
 5402                toktype = SentencePieceTokenTypes.UNKNOWN
 5403            elif tokenizer.IsControl(token_id):
 5404                toktype = SentencePieceTokenTypes.CONTROL
 5405            elif tokenizer.IsUnused(token_id):
 5406                toktype = SentencePieceTokenTypes.UNUSED
 5407            elif tokenizer.IsByte(token_id):
 5408                toktype = SentencePieceTokenTypes.BYTE
 5409            # take care of unused raw tokens
 5410            if piece.startswith('[UNUSED'):
 5411                toktype = SentencePieceTokenTypes.UNUSED
 5412
 5413            tokens.append(text)
 5414            scores.append(score)
 5415            toktypes.append(toktype)
 5416
 5417        added_tokens_file = self.dir_model / 'added_tokens.json'
 5418        if added_tokens_file.is_file():
 5419            with open(added_tokens_file, "r", encoding="utf-8") as f:
 5420                added_tokens_json = json.load(f)
 5421
 5422                for key in added_tokens_json:
 5423                    tokens.append(key.encode("utf-8"))
 5424                    scores.append(-1000.0)
 5425                    toktypes.append(SentencePieceTokenTypes.USER_DEFINED)
 5426
 5427        chat_eos_token = '<|im_end|>'
 5428        chat_eos_token_id = None
 5429
 5430        tokenizer_config_file = self.dir_model / 'tokenizer_config.json'
 5431        if tokenizer_config_file.is_file():
 5432            with open(tokenizer_config_file, "r", encoding="utf-8") as f:
 5433                tokenizer_config_json = json.load(f)
 5434                added_tokens_decoder = tokenizer_config_json.get("added_tokens_decoder", {})
 5435                for token_id, token_data in added_tokens_decoder.items():
 5436                    token_id = int(token_id)
 5437                    token = token_data["content"]
 5438                    if token == chat_eos_token:
 5439                        chat_eos_token_id = token_id
 5440                    token = token.encode("utf-8")
 5441                    if toktypes[token_id] != SentencePieceTokenTypes.UNUSED:
 5442                        if tokens[token_id] != token:
 5443                            logger.warning(f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} -> {token.decode("utf-8")!r}')
 5444                    tokens[token_id] = token
 5445                    scores[token_id] = -1000.0
 5446                    toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
 5447                    if token_data.get("special"):
 5448                        toktypes[token_id] = SentencePieceTokenTypes.CONTROL
 5449
 5450        tokenizer_file = self.dir_model / 'tokenizer.json'
 5451        if tokenizer_file.is_file():
 5452            with open(tokenizer_file, "r", encoding="utf-8") as f:
 5453                tokenizer_json = json.load(f)
 5454                added_tokens = tokenizer_json.get("added_tokens", [])
 5455                for token_data in added_tokens:
 5456                    token_id = int(token_data["id"])
 5457                    token = token_data["content"]
 5458                    if token == chat_eos_token:
 5459                        chat_eos_token_id = token_id
 5460                    token = token.encode("utf-8")
 5461                    if toktypes[token_id] != SentencePieceTokenTypes.UNUSED:
 5462                        if tokens[token_id] != token:
 5463                            logger.warning(f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} -> {token.decode("utf-8")!r}')
 5464                    tokens[token_id] = token
 5465                    scores[token_id] = -1000.0
 5466                    toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
 5467                    if token_data.get("special"):
 5468                        toktypes[token_id] = SentencePieceTokenTypes.CONTROL
 5469
 5470        self.gguf_writer.add_tokenizer_model("llama")
 5471        self.gguf_writer.add_tokenizer_pre("default")
 5472        self.gguf_writer.add_token_list(tokens)
 5473        self.gguf_writer.add_token_scores(scores)
 5474        self.gguf_writer.add_token_types(toktypes)
 5475        self.gguf_writer.add_add_space_prefix(add_prefix)
 5476
 5477        special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
 5478        old_eos = special_vocab.special_token_ids["eos"]
 5479        if chat_eos_token_id is not None:
 5480            # For the chat model, we replace the eos with '<|im_end|>'.
 5481            # TODO: this is a hack, should be fixed
 5482            #       https://github.com/ggml-org/llama.cpp/pull/6745#issuecomment-2067687048
 5483            special_vocab.special_token_ids["eos"] = chat_eos_token_id
 5484            logger.warning(f"Replace eos:{old_eos} with a special token:{chat_eos_token_id}"
 5485                           " in chat mode so that the conversation can end normally.")
 5486
 5487        special_vocab.add_to_gguf(self.gguf_writer)
 5488
 5489    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
 5490        num_heads = self.hparams["num_attention_heads"]
 5491        num_kv_heads = self.hparams["num_key_value_heads"]
 5492        n_embd = self.hparams["hidden_size"]
 5493        q_per_kv = num_heads // num_kv_heads
 5494        head_dim = n_embd // num_heads
 5495        num_groups = num_heads // q_per_kv
 5496
 5497        name = name.replace("language_model.", "") # InternVL
 5498        if name.startswith("mlp") or name.startswith("vision_model"):
 5499            # skip visual tensors
 5500            return
 5501
 5502        if bid is not None and f"model.layers.{bid}.attention.wqkv" in name:
 5503            qkv = data_torch
 5504
 5505            qkv = qkv.reshape((num_groups, q_per_kv + 2, head_dim, n_embd))
 5506            q, k, v = qkv[:, : q_per_kv], qkv[:, -2], qkv[:, -1]
 5507
 5508            # The q and k weights require an additional reshape.
 5509            q = LlamaModel.permute(q.reshape((-1, q.shape[-1])), num_heads, num_heads)
 5510            k = LlamaModel.permute(k.reshape((-1, k.shape[-1])), num_heads, num_kv_heads)
 5511            v = v.reshape((-1, v.shape[-1]))
 5512
 5513            yield from super().modify_tensors(q, self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_Q, bid), bid)
 5514            yield from super().modify_tensors(k, self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_K, bid), bid)
 5515            yield from super().modify_tensors(v, self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_V, bid), bid)
 5516        else:
 5517            yield from super().modify_tensors(data_torch, name, bid)
 5518
 5519
 5520@ModelBase.register("InternLM3ForCausalLM")
 5521class InternLM3Model(TextModel):
 5522    model_arch = gguf.MODEL_ARCH.LLAMA
 5523
 5524    def set_vocab(self):
 5525        tokens, scores, toktypes = self._create_vocab_sentencepiece()
 5526
 5527        self.gguf_writer.add_tokenizer_model("llama")
 5528        self.gguf_writer.add_tokenizer_pre("default")
 5529        self.gguf_writer.add_token_list(tokens)
 5530        self.gguf_writer.add_token_scores(scores)
 5531        self.gguf_writer.add_token_types(toktypes)
 5532
 5533        special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
 5534
 5535        tokenizer_config_file = self.dir_model / 'tokenizer_config.json'
 5536        if tokenizer_config_file.is_file():
 5537            with open(tokenizer_config_file, "r", encoding="utf-8") as f:
 5538                tokenizer_config_json = json.load(f)
 5539                if "add_prefix_space" in tokenizer_config_json:
 5540                    self.gguf_writer.add_add_space_prefix(tokenizer_config_json["add_prefix_space"])
 5541
 5542                if "added_tokens_decoder" in tokenizer_config_json:
 5543                    for token_id, token_data in tokenizer_config_json["added_tokens_decoder"].items():
 5544                        if token_data.get("special"):
 5545                            token_id = int(token_id)
 5546                            token = token_data["content"]
 5547                            special_vocab._set_special_token(token, token_id)
 5548                            # update eos token
 5549                            if token == '<|im_end|>' and "eos" in special_vocab.special_token_ids:
 5550                                special_vocab.special_token_ids["eos"] = token_id
 5551
 5552        special_vocab.add_to_gguf(self.gguf_writer)
 5553
 5554    def set_gguf_parameters(self):
 5555        super().set_gguf_parameters()
 5556        hparams = self.hparams
 5557        self.gguf_writer.add_vocab_size(hparams["vocab_size"])
 5558
 5559        if (rope_dim := hparams.get("head_dim")) is None:
 5560            rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]
 5561        self.gguf_writer.add_rope_dimension_count(rope_dim)
 5562
 5563    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
 5564        n_head = self.hparams["num_attention_heads"]
 5565        n_kv_head = self.hparams.get("num_key_value_heads")
 5566        name = name.replace("language_model.", "") # InternVL
 5567        if name.startswith("mlp") or name.startswith("vision_model"):
 5568            # skip visual tensors
 5569            return
 5570        if name.endswith(("q_proj.weight", "q_proj.bias")):
 5571            data_torch = LlamaModel.permute(data_torch, n_head, n_head)
 5572        if name.endswith(("k_proj.weight", "k_proj.bias")):
 5573            data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head)
 5574        yield from super().modify_tensors(data_torch, name, bid)
 5575
 5576
 5577@ModelBase.register("BertModel", "BertForMaskedLM", "CamembertModel", "BertForSequenceClassification")
 5578class BertModel(TextModel):
 5579    model_arch = gguf.MODEL_ARCH.BERT
 5580
 5581    def __init__(self, *args, **kwargs):
 5582        super().__init__(*args, **kwargs)
 5583        self.vocab_size = None
 5584
 5585        if cls_out_labels := self.hparams.get("id2label"):
 5586            if len(cls_out_labels) == 2 and cls_out_labels[0] == "LABEL_0":
 5587                # Remove dummy labels added by AutoConfig
 5588                cls_out_labels = None
 5589        self.cls_out_labels = cls_out_labels
 5590
 5591    def set_gguf_parameters(self):
 5592        super().set_gguf_parameters()
 5593        self.gguf_writer.add_causal_attention(False)
 5594        self._try_set_pooling_type()
 5595
 5596        if self.cls_out_labels:
 5597            self.gguf_writer.add_classifier_output_labels([v for k, v in sorted(self.cls_out_labels.items())])
 5598
 5599    def set_vocab(self):
 5600        tokens, toktypes, tokpre = self.get_vocab_base()
 5601        self.vocab_size = len(tokens)
 5602
 5603        # we need this to validate the size of the token_type embeddings
 5604        # though currently we are passing all zeros to the token_type embeddings
 5605        # "Sequence A" or "Sequence B"
 5606        self.gguf_writer.add_token_type_count(self.hparams.get("type_vocab_size", 1))
 5607
 5608        # convert to phantom space vocab
 5609        def phantom(tok, toktype):
 5610            if toktype == gguf.TokenType.CONTROL:
 5611                return tok
 5612            if tok.startswith("##"):
 5613                return tok[2:]
 5614            return "\u2581" + tok
 5615        assert len(tokens) == len(toktypes)
 5616        tokens = list(map(phantom, tokens, toktypes))
 5617
 5618        # add vocab to gguf
 5619        self.gguf_writer.add_tokenizer_model("bert")
 5620        self.gguf_writer.add_tokenizer_pre(tokpre)
 5621        self.gguf_writer.add_token_list(tokens)
 5622        self.gguf_writer.add_token_types(toktypes)
 5623
 5624        # handle special tokens
 5625        special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
 5626        special_vocab.add_to_gguf(self.gguf_writer)
 5627
 5628    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
 5629        if name.startswith("bert."):
 5630            name = name[5:]
 5631
 5632        if name.endswith(".gamma"):
 5633            name = name[:-6] + ".weight"
 5634
 5635        if name.endswith(".beta"):
 5636            name = name[:-5] + ".bias"
 5637
 5638        # we are only using BERT for embeddings so we don't need the pooling layer
 5639        if name in ("embeddings.position_ids", "pooler.dense.weight", "pooler.dense.bias"):
 5640            return # we don't need these
 5641
 5642        if name.startswith("cls.predictions"):
 5643            return
 5644
 5645        if name.startswith("cls.seq_relationship"):
 5646            return
 5647
 5648        if self.cls_out_labels:
 5649            # For BertForSequenceClassification (direct projection layer)
 5650            if name == "classifier.weight":
 5651                name = "classifier.out_proj.weight"
 5652
 5653            if name == "classifier.bias":
 5654                name = "classifier.out_proj.bias"
 5655
 5656        yield from super().modify_tensors(data_torch, name, bid)
 5657
 5658    def _xlmroberta_tokenizer_init(self) -> None:
 5659        # we need the pad_token_id to know how to chop down position_embd matrix
 5660        if (pad_token_id := self.hparams.get("pad_token_id")) is not None:
 5661            self._position_offset = 1 + pad_token_id
 5662            if "max_position_embeddings" in self.hparams:
 5663                self.hparams["max_position_embeddings"] -= self._position_offset
 5664        else:
 5665            self._position_offset = None
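             # Illustration only (numbers assumed, not read from any config): XLM-RoBERTa-style
             # checkpoints usually ship with pad_token_id = 1, so _position_offset = 2, the HF
             # max_position_embeddings of 514 shrinks to 512, and modify_tensors later drops the
             # first two rows of the position embedding matrix.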
 5666
 5667    def _xlmroberta_set_vocab(self) -> None:
 5668        # to avoid TypeError: Descriptors cannot be created directly
 5669        # exception when importing sentencepiece_model_pb2
 5670        os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"
 5671        from sentencepiece import SentencePieceProcessor
 5672        from sentencepiece import sentencepiece_model_pb2 as model
 5673
 5674        tokenizer_path = self.dir_model / 'sentencepiece.bpe.model'
 5675
 5676        tokenizer_json = {}
 5677        tokenizer_config_json = {}
 5678        if not tokenizer_path.is_file():
 5679            tokenizer_path = self.dir_model / 'tokenizer.json'
 5680            tokenizer_config_path = self.dir_model / 'tokenizer_config.json'
 5681
 5682            if not tokenizer_path.is_file():
 5683                raise FileNotFoundError(f"File not found: {tokenizer_path}")
 5684
 5685            from base64 import b64decode
 5686            from transformers import AutoTokenizer
 5687            tokenizer = AutoTokenizer.from_pretrained(self.dir_model)
 5688
 5689            with open(tokenizer_path, "r", encoding="utf-8") as fp:
 5690                tokenizer_json = json.load(fp)
 5691
 5692            if tokenizer_config_path.is_file():
 5693                with open(tokenizer_config_path, "r", encoding="utf-8") as fp:
 5694                    tokenizer_config_json = json.load(fp)
 5695
 5696            add_prefix = tokenizer.add_prefix_space
 5697            remove_whitespaces = tokenizer.clean_up_tokenization_spaces
 5698            precompiled_charsmap = b64decode(tokenizer_json["normalizer"]["precompiled_charsmap"])
 5699
 5700            vocab_size = max(self.hparams.get("vocab_size", 0), tokenizer.vocab_size)
 5701        else:
 5702            sentencepiece_model = model.ModelProto()  # pyright: ignore[reportAttributeAccessIssue]
 5703            sentencepiece_model.ParseFromString(tokenizer_path.read_bytes())
 5704            assert sentencepiece_model.trainer_spec.model_type == 1  # UNIGRAM
 5705
 5706            add_prefix = sentencepiece_model.normalizer_spec.add_dummy_prefix
 5707            remove_whitespaces = sentencepiece_model.normalizer_spec.remove_extra_whitespaces
 5708            precompiled_charsmap = sentencepiece_model.normalizer_spec.precompiled_charsmap
 5709
 5710            tokenizer = SentencePieceProcessor()
 5711            tokenizer.LoadFromFile(str(tokenizer_path))
 5712
 5713            vocab_size = max(self.hparams.get("vocab_size", 0), tokenizer.vocab_size())
 5714
 5715        tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
 5716        scores: list[float] = [-10000.0] * vocab_size
 5717        toktypes: list[int] = [SentencePieceTokenTypes.UNUSED] * vocab_size
 5718
 5719        if isinstance(tokenizer, SentencePieceProcessor):
 5720            for token_id in range(tokenizer.vocab_size()):
 5721                piece = tokenizer.IdToPiece(token_id)
 5722                text = piece.encode("utf-8")
 5723                score = tokenizer.GetScore(token_id)
 5724
 5725                toktype = SentencePieceTokenTypes.NORMAL
 5726                if tokenizer.IsUnknown(token_id):
 5727                    toktype = SentencePieceTokenTypes.UNKNOWN
 5728                elif tokenizer.IsControl(token_id):
 5729                    toktype = SentencePieceTokenTypes.CONTROL
 5730                elif tokenizer.IsUnused(token_id):
 5731                    toktype = SentencePieceTokenTypes.UNUSED
 5732                elif tokenizer.IsByte(token_id):
 5733                    toktype = SentencePieceTokenTypes.BYTE
 5734
 5735                tokens[token_id] = text
 5736                scores[token_id] = score
 5737                toktypes[token_id] = toktype
 5738        else:
 5739            added_vocab = tokenizer.get_added_vocab()
 5740            unk_token = tokenizer_config_json.get("unk_token")
 5741            unk_token_id = added_vocab.get(unk_token, tokenizer_json["model"].get("unk_id", 3))
 5742
 5743            for token_id in range(tokenizer.vocab_size):
 5744                # _convert_id_to_token may return None for ids with no piece, so guard on it
 5745                if (piece := tokenizer._convert_id_to_token(token_id)) is not None:
 5746                    text = piece.encode("utf-8")
 5747                    score = tokenizer_json["model"]["vocab"][token_id][1]
 5748
 5749                    toktype = SentencePieceTokenTypes.NORMAL
 5750                    if token_id == unk_token_id:
 5751                        toktype = SentencePieceTokenTypes.UNKNOWN
 5752                    elif token_id in tokenizer.all_special_ids:
 5753                        toktype = SentencePieceTokenTypes.CONTROL
 5754                    elif token_id in added_vocab.values():
 5755                        toktype = SentencePieceTokenTypes.USER_DEFINED
 5756                    # No reliable way to detect this, but jina doesn't have any
 5757                    # elif tokenizer.IsByte(token_id):
 5758                    #     toktype = SentencePieceTokenTypes.BYTE
 5759
 5760                    tokens[token_id] = text
 5761                    scores[token_id] = score
 5762                    toktypes[token_id] = toktype
 5763
 5764        if isinstance(tokenizer, SentencePieceProcessor):
 5765            # realign tokens (see HF tokenizer code)
 5766            tokens = [b'<s>', b'<pad>', b'</s>', b'<unk>'] + tokens[3:-1]
 5767            scores = [0.0, 0.0, 0.0, 0.0] + scores[3:-1]
 5768            toktypes = [
 5769                SentencePieceTokenTypes.CONTROL,
 5770                SentencePieceTokenTypes.CONTROL,
 5771                SentencePieceTokenTypes.CONTROL,
 5772                SentencePieceTokenTypes.UNKNOWN,
 5773            ] + toktypes[3:-1]
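                 # This matches the fairseq-style remapping used by the HF XLMRobertaTokenizer:
                 # ids 0-3 become <s>, <pad>, </s>, <unk> and the raw SentencePiece pieces shift
                 # up accordingly, while dropping the trailing piece keeps the total count unchanged.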
 5774
 5775            if self.model_arch == gguf.MODEL_ARCH.NOMIC_BERT_MOE:
 5776                # Add mask token missing from sentencepiece.bpe.model
 5777                tokens[250001] = b'<mask>'
 5778                scores[250001] = 0.0
 5779                toktypes[250001] = SentencePieceTokenTypes.CONTROL
 5780
 5781        self.gguf_writer.add_tokenizer_model("t5")
 5782        self.gguf_writer.add_tokenizer_pre("default")
 5783        self.gguf_writer.add_token_list(tokens)
 5784        self.gguf_writer.add_token_scores(scores)
 5785        self.gguf_writer.add_token_types(toktypes)
 5786        self.gguf_writer.add_add_space_prefix(add_prefix)
 5787        self.gguf_writer.add_token_type_count(self.hparams.get("type_vocab_size", 1))
 5788        self.gguf_writer.add_remove_extra_whitespaces(remove_whitespaces)
 5789        if precompiled_charsmap:
 5790            self.gguf_writer.add_precompiled_charsmap(precompiled_charsmap)
 5791
 5792        special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
 5793        special_vocab.add_to_gguf(self.gguf_writer)
 5794
 5795
 5796@ModelBase.register("DistilBertModel", "DistilBertForMaskedLM", "DistilBertForSequenceClassification")
 5797class DistilBertModel(BertModel):
 5798    model_arch = gguf.MODEL_ARCH.BERT
 5799
 5800    def set_gguf_parameters(self):
 5801        self.gguf_writer.add_layer_norm_eps(1e-12)
 5802        logger.info("gguf: layer norm epsilon = 1e-12")
 5803        super().set_gguf_parameters()
 5804
 5805    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
 5806        if name.startswith("distilbert."):
 5807            name = name[11:]
 5808
 5809        # These layers act as MLM head, so we don't need them
 5810        if name.startswith("vocab_"):
 5811            return
 5812
 5813        yield from super().modify_tensors(data_torch, name, bid)
 5814
 5815
 5816@ModelBase.register("RobertaModel", "RobertaForSequenceClassification")
 5817class RobertaModel(BertModel):
 5818    model_arch = gguf.MODEL_ARCH.BERT
 5819
 5820    def __init__(self, *args, **kwargs):
 5821        super().__init__(*args, **kwargs)
 5822
 5823        # we need the pad_token_id to know how to chop down position_embd matrix
 5824        if (pad_token_id := self.hparams.get("pad_token_id")) is not None:
 5825            self._position_offset = 1 + pad_token_id
 5826            if "max_position_embeddings" in self.hparams:
 5827                self.hparams["max_position_embeddings"] -= self._position_offset
 5828        else:
 5829            self._position_offset = None
 5830
 5831    def set_vocab(self):
 5832        """Support BPE tokenizers for roberta models"""
 5833        bpe_tok_path = self.dir_model / "tokenizer.json"
 5834        if bpe_tok_path.exists():
 5835            self._set_vocab_gpt2()
 5836
 5837            # we need this to validate the size of the token_type embeddings
 5838            # though currently we are passing all zeros to the token_type embeddings
 5839            # "Sequence A" or "Sequence B"
 5840            self.gguf_writer.add_token_type_count(self.hparams.get("type_vocab_size", 1))
 5841
 5842        else:
 5843            return super().set_vocab()
 5844
 5845    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
 5846        # if name starts with "roberta.", remove the prefix
 5847        # e.g. https://huggingface.co/BAAI/bge-reranker-v2-m3/tree/main
 5848        if name.startswith("roberta."):
 5849            name = name[8:]
 5850
 5851        # position embeddings start at pad_token_id + 1, so just chop down the weight tensor
 5852        if name == "embeddings.position_embeddings.weight":
 5853            if self._position_offset is not None:
 5854                data_torch = data_torch[self._position_offset:,:]
 5855
 5856        yield from super().modify_tensors(data_torch, name, bid)
 5857
 5858
 5859@ModelBase.register("NomicBertModel")
 5860class NomicBertModel(BertModel):
 5861    model_arch = gguf.MODEL_ARCH.BERT
 5862
 5863    def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, **kwargs: Any):
 5864        hparams = kwargs.pop("hparams", None)
 5865        if hparams is None:
 5866            hparams = ModelBase.load_hparams(dir_model, False)
 5867
 5868        self.is_moe = bool(hparams.get("moe_every_n_layers"))
 5869        self.model_arch = gguf.MODEL_ARCH.NOMIC_BERT_MOE if self.is_moe else gguf.MODEL_ARCH.NOMIC_BERT
 5870
 5871        super().__init__(dir_model, ftype, fname_out, hparams=hparams, **kwargs)
 5872
 5873        self._tokenizer_is_xlmroberta = self._is_tokenizer_xlmroberta()
 5874        if self._tokenizer_is_xlmroberta:
 5875            self._xlmroberta_tokenizer_init()
 5876
 5877        npos, mtp = self.hparams["n_positions"], self.hparams.get("max_trained_positions", 2048)
 5878        if npos == 8192 and mtp == 2048:
 5879            self.hparams["n_positions"] = 2048  # nomic-embed-text v1 and v1.5 are trained for 2048 tokens.
 5880        elif npos == 2048 and mtp == 2048:
 5881            self.hparams["n_positions"] = 512   # nomic-embed-text-v2-moe is trained for 512 tokens.
 5882        else:
 5883            raise ValueError(f"unrecognized parameters: n_positions={npos}, max_trained_positions={mtp}")
 5884
 5885        assert self.hparams["activation_function"] == "gelu" if self.is_moe else "swiglu"
 5886
 5887        # this doesn't do anything in the HF version
 5888        assert self.hparams["causal"] is False
 5889        # no bias tensors unless MoE
 5890        assert self.hparams["qkv_proj_bias"] == self.is_moe
 5891        assert self.hparams["mlp_fc1_bias"]  == self.is_moe
 5892        assert self.hparams["mlp_fc2_bias"]  == self.is_moe
 5893
 5894        # norm at end of layer
 5895        assert self.hparams["prenorm"] is False
 5896        # standard RoPE
 5897        assert self.hparams["rotary_emb_fraction"] == 1.0
 5898        assert self.hparams["rotary_emb_interleaved"] is False
 5899        assert self.hparams["rotary_emb_scale_base"] is None
 5900
 5901    def set_vocab(self) -> None:
 5902        if self._tokenizer_is_xlmroberta:
 5903            return self._xlmroberta_set_vocab()
 5904        return super().set_vocab()
 5905
 5906    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
 5907        # skip experts bias tensors; a bare return ends this generator without yielding anything
 5908        if "mlp.experts.bias" in name:
 5909            return
 5910
 5911        if "mlp.experts.mlp.w1" in name:
 5912            data_torch = data_torch.view(self.hparams["num_experts"], self.hparams["n_inner"], self.hparams["n_embd"])
 5913            name += ".weight"
 5914
 5915        if "mlp.experts.mlp.w2" in name:
 5916            data_torch = data_torch.view(self.hparams["num_experts"], self.hparams["n_inner"], self.hparams["n_embd"])
 5917            data_torch = data_torch.transpose(1, 2)
 5918            name += ".weight"
 5919
 5920        yield from super().modify_tensors(data_torch, name, bid)
 5921
 5922    def set_gguf_parameters(self):
 5923        super().set_gguf_parameters()
 5924        if self.is_moe:
 5925            self.gguf_writer.add_moe_every_n_layers(self.hparams["moe_every_n_layers"])
 5926            self.gguf_writer.add_expert_count(self.hparams["num_experts"])
 5927            self.gguf_writer.add_expert_used_count(self.hparams["moe_top_k"])
 5928
 5929    def _is_tokenizer_xlmroberta(self) -> bool:
 5930        with open(self.dir_model / "tokenizer.json") as f:
 5931            tokenizer_json = json.load(f)
 5932        toktyp = tokenizer_json["model"]["type"]
 5933        if toktyp == "Unigram":
 5934            return True
 5935        if toktyp == "WordPiece":
 5936            return False
 5937        raise ValueError(f"unknown tokenizer: {toktyp}")
 5938
 5939
 5940@ModelBase.register("NeoBERT", "NeoBERTLMHead", "NeoBERTForSequenceClassification")
 5941class NeoBert(BertModel):
 5942    model_arch = gguf.MODEL_ARCH.NEO_BERT
 5943
 5944    def set_gguf_parameters(self):
 5945        super().set_gguf_parameters()
 5946
 5947        # NeoBERT uses 2/3 of the intermediate size as feed forward length
 5948        self.gguf_writer.add_feed_forward_length(int(2 * self.hparams["intermediate_size"] / 3))
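             # Worked example (intermediate_size of 4096 assumed purely for illustration):
             # int(2 * 4096 / 3) = 2730 is the SwiGLU-style feed-forward width written to the GGUF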
 5949        self.gguf_writer.add_rope_freq_base(10000.0)  # default value for NeoBERT
 5950        self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
 5951
 5952        f_rms_eps = self.hparams.get("norm_eps", 1e-6)  # default value for NeoBERT
 5953        self.gguf_writer.add_layer_norm_rms_eps(f_rms_eps)
 5954        logger.info(f"gguf: rms norm epsilon = {f_rms_eps}")
 5955
 5956        self.gguf_writer.add_pooling_type(gguf.PoolingType.CLS) # https://huggingface.co/chandar-lab/NeoBERT#how-to-use
 5957
 5958    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
 5959        if name.startswith("decoder."):
 5960            return
 5961
 5962        if name.startswith("model."):
 5963            name = name[6:]
 5964
 5965        yield from super().modify_tensors(data_torch, name, bid)
 5966
 5967
 5968@ModelBase.register("XLMRobertaModel", "XLMRobertaForSequenceClassification")
 5969class XLMRobertaModel(BertModel):
 5970    model_arch = gguf.MODEL_ARCH.BERT
 5971    _lora_files = {}
 5972    _lora_names = []
 5973
 5974    def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, **kwargs: Any):
 5975        hparams = kwargs.pop("hparams", None)
 5976        if hparams is None:
 5977            hparams = ModelBase.load_hparams(dir_model, False)
 5978
 5979        if lora_names := hparams.get("lora_adaptations"):
 5980            self._lora_names = lora_names
 5981            self.model_arch = gguf.MODEL_ARCH.JINA_BERT_V3
 5982
 5983        super().__init__(dir_model, ftype, fname_out, hparams=hparams, **kwargs)
 5984        self._xlmroberta_tokenizer_init()
 5985
 5986    def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
 5987        if self._lora_names:
 5988            for name in self._lora_names:
 5989                fname = self.add_prefix_to_filename(self.fname_out, f"lora-{name}-")
 5990                self._lora_files[name] = gguf.GGUFWriter(fname, arch=gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess, use_temp_file=self.use_temp_file, dry_run=self.dry_run)
 5991
 5992        return super().generate_extra_tensors()
 5993
 5994    def set_type(self):
 5995        for lora_writer in self._lora_files.values():
 5996            lora_writer.add_type(gguf.GGUFType.ADAPTER)
 5997            lora_writer.add_string(gguf.Keys.Adapter.TYPE, "lora")
 5998        super().set_type()
 5999
 6000    def set_vocab(self):
 6001        self._xlmroberta_set_vocab()
 6002
 6003    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
 6004        # if name starts with "roberta.", remove the prefix
 6005        # e.g. https://huggingface.co/BAAI/bge-reranker-v2-m3/tree/main
 6006        if name.startswith("roberta."):
 6007            name = name[8:]
 6008
 6009        # jina-embeddings-v3
 6010        if ".parametrizations." in name:
 6011            name = name.replace(".parametrizations.", ".")
 6012            if name.endswith(".original"):
 6013                name = name[:-9]
 6014
 6015        # position embeddings start at pad_token_id + 1, so just chop down the weight tensor
 6016        if name == "embeddings.position_embeddings.weight":
 6017            if self._position_offset is not None:
 6018                data_torch = data_torch[self._position_offset:,:]
 6019
 6020        if name.endswith(".0.lora_A") or name.endswith(".0.lora_B"):
 6021            if name.startswith("pooler.dense"):
 6022                return
 6023
 6024            num_loras = data_torch.size(0)
 6025            assert num_loras == len(self._lora_names)
 6026
 6027            # Split out each LoRA in their own GGUF
 6028            for i, lora_writer in enumerate(self._lora_files.values()):
 6029                new_name = self.map_tensor_name(name[:-9]) + name[-7:].lower()
 6030                data = data_torch[i, :, :]
 6031                # Transpose/flip token_embd/types into correct shape
 6032                if new_name == "token_embd.weight.lora_b":
 6033                    data = data.T
 6034                elif new_name.startswith("token_types.weight."):
 6035                    new_name = new_name[:-1] + ("a" if new_name[-1:] == "b" else "b")
 6036                lora_writer.add_tensor(new_name, data.float().numpy(), raw_dtype=gguf.GGMLQuantizationType.F32)
 6037
 6038            return
 6039
 6040        yield from super().modify_tensors(data_torch, name, bid)
 6041
 6042    def set_gguf_parameters(self):
 6043        super().set_gguf_parameters()
 6044
 6045        # jina-embeddings-v3
 6046        lora_alpha = self.hparams.get("lora_alpha")
 6047        if lora_prompt_prefixes := self.hparams.get("task_instructions"):
 6048            assert self._lora_files and all(lora_name in lora_prompt_prefixes for lora_name in self._lora_files.keys())
 6049        for lora_name, lora_writer in self._lora_files.items():
 6050            lora_writer.add_float32(gguf.Keys.Adapter.LORA_ALPHA, lora_alpha if lora_alpha is not None else 1.0)
 6051            lora_writer.add_string(gguf.Keys.Adapter.LORA_TASK_NAME, lora_name)
 6052            if lora_prompt_prefixes:
 6053                lora_writer.add_string(gguf.Keys.Adapter.LORA_PROMPT_PREFIX, lora_prompt_prefixes[lora_name])
 6054
 6055    def write(self):
 6056        super().write()
 6057        for lora_writer in self._lora_files.values():
 6058            lora_writer.write_header_to_file()
 6059            lora_writer.write_kv_data_to_file()
 6060            lora_writer.write_tensors_to_file(progress=True)
 6061            lora_writer.close()
 6062
 6063
 6064@ModelBase.register("GemmaForCausalLM")
 6065class GemmaModel(TextModel):
 6066    model_arch = gguf.MODEL_ARCH.GEMMA
 6067
 6068    def set_vocab(self):
 6069        self._set_vocab_sentencepiece()
 6070
 6071        # TODO: these special tokens should be exported only for the CodeGemma family
 6072        special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False,
 6073                                          special_token_types = ['prefix', 'suffix', 'middle', 'fsep', 'eot'])
 6074        special_vocab._set_special_token("prefix", 67)
 6075        special_vocab._set_special_token("suffix", 69)
 6076        special_vocab._set_special_token("middle", 68)
 6077        special_vocab._set_special_token("fsep",   70)
 6078        special_vocab._set_special_token("eot",    107)
 6079        special_vocab.chat_template = None  # do not add it twice
 6080        special_vocab.add_to_gguf(self.gguf_writer)
 6081
 6082        self.gguf_writer.add_add_space_prefix(False)
 6083
 6084    def set_gguf_parameters(self):
 6085        hparams = self.hparams
 6086
 6087        self.gguf_writer.add_context_length(hparams["max_position_embeddings"])
 6088        self.gguf_writer.add_embedding_length(hparams["hidden_size"])
 6089        self.gguf_writer.add_block_count(self.block_count)
 6090        self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
 6091        self.gguf_writer.add_head_count(hparams["num_attention_heads"])
 6092        self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"] if "num_key_value_heads" in hparams else hparams["num_attention_heads"])
 6093        self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])
 6094        self.gguf_writer.add_key_length(hparams["head_dim"])
 6095        self.gguf_writer.add_value_length(hparams["head_dim"])
 6096        self.gguf_writer.add_file_type(self.ftype)
 6097
 6098    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
 6099        # lm_head is not used by llama.cpp, but autoawq includes this tensor in the model
 6100        # To prevent errors, skip loading lm_head.weight.
 6101        if name == "lm_head.weight":
 6102            logger.debug(f"Skipping get tensor {name!r} in safetensors so that convert can end normally.")
 6103            return
 6104
 6105        # ref: https://github.com/huggingface/transformers/blob/fc37f38915372c15992b540dfcbbe00a916d4fc6/src/transformers/models/gemma/modeling_gemma.py#L89
 6106        if name.endswith("norm.weight"):
 6107            data_torch = data_torch + 1
 6108
 6109        yield from super().modify_tensors(data_torch, name, bid)
 6110
 6111
 6112@ModelBase.register("Gemma2ForCausalLM")
 6113class Gemma2Model(TextModel):
 6114    model_arch = gguf.MODEL_ARCH.GEMMA2
 6115
 6116    def set_vocab(self):
 6117        self._set_vocab_sentencepiece()
 6118
 6119        self.gguf_writer.add_add_space_prefix(False)
 6120
 6121    def set_gguf_parameters(self):
 6122        hparams = self.hparams
 6123
 6124        self.gguf_writer.add_context_length(hparams["max_position_embeddings"])
 6125        self.gguf_writer.add_embedding_length(hparams["hidden_size"])
 6126        self.gguf_writer.add_block_count(self.block_count)
 6127        self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
 6128        self.gguf_writer.add_head_count(hparams["num_attention_heads"])
 6129        self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"] if "num_key_value_heads" in hparams else hparams["num_attention_heads"])
 6130        self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])
 6131        self.gguf_writer.add_key_length(hparams["head_dim"])
 6132        self.gguf_writer.add_value_length(hparams["head_dim"])
 6133        self.gguf_writer.add_file_type(self.ftype)
 6134        self.gguf_writer.add_attn_logit_softcapping(
 6135            self.hparams["attn_logit_softcapping"]
 6136        )
 6137        self.gguf_writer.add_final_logit_softcapping(
 6138            self.hparams["final_logit_softcapping"]
 6139        )
 6140        self.gguf_writer.add_sliding_window(self.hparams["sliding_window"])
 6141
 6142    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
 6143        # lm_head is not used by llama.cpp, but autoawq includes this tensor in the model
 6144        # To prevent errors, skip loading lm_head.weight.
 6145        if name == "lm_head.weight":
 6146            logger.debug(f"Skipping get tensor {name!r} in safetensors so that convert can end normally.")
 6147            return
 6148
 6149        # ref: https://github.com/huggingface/transformers/blob/fc37f38915372c15992b540dfcbbe00a916d4fc6/src/transformers/models/gemma/modeling_gemma.py#L89
 6150        if name.endswith("norm.weight"):
 6151            data_torch = data_torch + 1
 6152
 6153        yield from super().modify_tensors(data_torch, name, bid)
 6154
 6155
 6156@ModelBase.register("Gemma3ForCausalLM", "Gemma3ForConditionalGeneration")
 6157class Gemma3Model(TextModel):
 6158    model_arch = gguf.MODEL_ARCH.GEMMA3
 6159    norm_shift = 1.0  # Gemma3RMSNorm adds 1.0 to the norm value
 6160
 6161    def set_vocab(self):
 6162        if (self.dir_model / "tokenizer.model").is_file():
 6163            self._set_vocab_sentencepiece()
 6164            self.gguf_writer.add_add_space_prefix(False)
 6165        else:
 6166            self._set_vocab_gpt2()
 6167
 6168    def set_gguf_parameters(self):
 6169        super().set_gguf_parameters()
 6170        hparams = self.hparams
 6171
 6172        # some default values are not specified in the hparams
 6173        self.gguf_writer.add_context_length(hparams.get("max_position_embeddings", 131072))
 6174        self.gguf_writer.add_head_count(hparams.get("num_attention_heads", 8))
 6175        self.gguf_writer.add_layer_norm_rms_eps(self.hparams.get("rms_norm_eps", 1e-6))
 6176        self.gguf_writer.add_key_length(hparams.get("head_dim", 256))
 6177        self.gguf_writer.add_value_length(hparams.get("head_dim", 256))
 6178        self.gguf_writer.add_rope_freq_base(self.rope_parameters.get("full_attention", self.rope_parameters).get("rope_theta", 1_000_000.0)) # for global layers
 6179        # attn_logit_softcapping is removed in Gemma3
 6180        assert hparams.get("attn_logit_softcapping") is None
 6181        if (final_logit_softcap := hparams.get("final_logit_softcapping")):
 6182            self.gguf_writer.add_final_logit_softcapping(final_logit_softcap)
 6183        if hparams.get("sliding_window_pattern") != 1:
 6184            self.gguf_writer.add_sliding_window(hparams["sliding_window"])
 6185        self.gguf_writer.add_head_count_kv(hparams.get("num_key_value_heads", 4))
 6186
 6187    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
 6188        if "language_model." in name:
 6189            name = name.replace("language_model.", "")
 6190
 6191        elif name.startswith("multi_modal_projector.") or name.startswith("vision_tower.") \
 6192                or name.startswith("multimodal_projector.") or name.startswith("vision_model."):
 6193            return # skip vision tensors
 6194
 6195        # remove OOV (out-of-vocabulary) rows in token_embd
 6196        if "embed_tokens.weight" in name:
 6197            if (self.dir_model / "tokenizer.model").is_file():
 6198                tokens = self._create_vocab_sentencepiece()[0]
 6199            else:
 6200                tokens = self.get_vocab_base()[0]
 6201            data_torch = data_torch[:len(tokens)]
 6202
 6203        # ref code in Gemma3RMSNorm
 6204        # output = output * (1.0 + self.weight.float())
 6205        # note: this is not the case on gemma3n
 6206        if name.endswith("norm.weight"):
 6207            data_torch = data_torch + self.norm_shift
 6208
 6209        yield from super().modify_tensors(data_torch, name, bid)
 6210
 6211
 6212@ModelBase.register("Gemma3TextModel")
 6213class EmbeddingGemma(Gemma3Model):
 6214    model_arch = gguf.MODEL_ARCH.GEMMA_EMBEDDING
 6215    module_paths = []
 6216    dense_features_dims = {}
 6217
 6218    def __init__(self, *args, **kwargs):
 6219        super().__init__(*args, **kwargs)
 6220        if self.sentence_transformers_dense_modules:
 6221            # read modules.json to determine if model has Dense layers
 6222            modules_file = self.dir_model / "modules.json"
 6223            if modules_file.is_file():
 6224                with open(modules_file, encoding="utf-8") as modules_json_file:
 6225                    mods = json.load(modules_json_file)
 6226                for mod in mods:
 6227                    if mod["type"] == "sentence_transformers.models.Dense":
 6228                        mod_path = mod["path"]
 6229                        # check if model.safetensors file for Dense layer exists
 6230                        model_tensors_file = self.dir_model / mod_path / "model.safetensors"
 6231                        if model_tensors_file.is_file():
 6232                            self.module_paths.append(mod_path)
 6233                            # read config.json of the Dense layer to get in/out features
 6234                            mod_conf_file = self.dir_model / mod_path / "config.json"
 6235                            if mod_conf_file.is_file():
 6236                                with open(mod_conf_file, encoding="utf-8") as mod_conf_json_file:
 6237                                    mod_conf = json.load(mod_conf_json_file)
 6238                                    # hparams dense_2_feat_out and dense_3_feat_in are required when loading model's dense weights
 6239                                    prefix = self._get_dense_prefix(mod_path)
 6240                                    if mod_conf["in_features"] is not None and mod_conf["out_features"] is not None:
 6241                                        self.dense_features_dims[prefix] = (mod_conf["in_features"], mod_conf["out_features"])
 6242
 6243    def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
 6244        from safetensors.torch import load_file
 6245        module_paths = list(self.module_paths)
 6246        for i, module_path in enumerate(module_paths):
 6247            tensors_file = self.dir_model / module_path / "model.safetensors"
 6248            local_tensors = load_file(tensors_file)
 6249            tensor_name = self._get_dense_prefix(module_path)
 6250            for name, local_tensor in local_tensors.items():
 6251                if not name.endswith(".weight"):
 6252                    continue
 6253                orig_name = name.replace("linear", tensor_name)
 6254                name = self.map_tensor_name(orig_name)
 6255                yield name, local_tensor.clone()
 6256
 6257    @staticmethod
 6258    def _get_dense_prefix(module_path) -> str:
 6259        """Get the tensor name prefix for the Dense layer from module path."""
 6260        tensor_name = "dense_2" if module_path == "2_Dense" else "dense_3"
 6261        return tensor_name
 6262
 6263    def set_gguf_parameters(self):
 6264        super().set_gguf_parameters()
 6265
 6266        # Override the sliding window size as it gets adjusted by the Gemma3TextConfig
 6267        # constructor. We want to use the value from the original model's config.json.
 6268        # ref: https://github.com/huggingface/transformers/pull/40700
 6269        with open(self.dir_model / "config.json", "r", encoding="utf-8") as f:
 6270            config = json.load(f)
 6271            orig_sliding_window = config.get("sliding_window")
 6272            if orig_sliding_window is None:
 6273                raise ValueError("sliding_window not found in model config - this is required for the model")
 6274
 6275            logger.info(f"Using original sliding_window from config: {orig_sliding_window} "
 6276                        f"instead of {self.hparams['sliding_window']}")
 6277            self.gguf_writer.add_sliding_window(orig_sliding_window)
 6278        if self.sentence_transformers_dense_modules:
 6279            for dense, dims in self.dense_features_dims.items():
 6280                logger.info(f"Setting dense layer {dense} in/out features to {dims}")
 6281                self.gguf_writer.add_dense_features_dims(dense, dims[0], dims[1])
 6282
 6283        self._try_set_pooling_type()
 6284
 6285
 6286@ModelBase.register("Gemma3ForConditionalGeneration")
 6287class Gemma3VisionModel(MmprojModel):
 6288    def set_gguf_parameters(self):
 6289        super().set_gguf_parameters()
 6290        hparams = self.hparams
 6291        self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.GEMMA3)
 6292        # default values below are taken from HF transformers code
 6293        self.gguf_writer.add_vision_attention_layernorm_eps(hparams.get("layer_norm_eps", 1e-6))
 6294        self.gguf_writer.add_vision_use_gelu(True)
 6295        # calculate proj_scale_factor (used by tinygemma3 test model)
 6296        image_seq_length = self.preprocessor_config.get("image_seq_length", 256)
 6297        n_per_side = int(image_seq_length ** 0.5)
 6298        image_size = self.hparams["image_size"]
 6299        patch_size = self.hparams["patch_size"]
 6300        proj_scale_factor = (image_size // patch_size) // n_per_side
 6301        if proj_scale_factor > 0 and proj_scale_factor != 4:
 6302            # we only need to write this if it's not the default value
 6303            # in this case, we are converting a test model
 6304            self.gguf_writer.add_vision_projector_scale_factor(proj_scale_factor)
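             # Worked example (values assumed from the released Gemma 3 vision tower, for illustration):
             # image_seq_length=256 -> n_per_side=16; image_size=896, patch_size=14 -> 64 patches per side;
             # 64 // 16 = 4 is the default scale factor, so nothing is written for the real model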
 6305
 6306    def tensor_force_quant(self, name, new_name, bid, n_dims):
 6307        # related to https://github.com/ggml-org/llama.cpp/issues/13025
 6308        if "input_projection" in name:
 6309            return gguf.GGMLQuantizationType.F16
 6310        if ".embeddings." in name:
 6311            return gguf.GGMLQuantizationType.F32
 6312        return super().tensor_force_quant(name, new_name, bid, n_dims)
 6313
 6314    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
 6315        if "vision_model.head." in name:
 6316            return # skip redundant tensors for tinygemma3
 6317
 6318        if name.startswith("multi_modal_projector.") or name.startswith("vision_tower.") \
 6319                or name.startswith("multimodal_projector.") or name.startswith("vision_model."):
 6320            # process vision tensors
 6321            name = name.replace("_weight", ".weight")
 6322
 6323            # correct norm value; only this "soft_emb_norm" needs to be corrected as it's part of the Gemma projector
 6324            # the other norm values are part of the SigLIP model and are already correct
 6325            # ref code: Gemma3RMSNorm
 6326            if "soft_emb_norm.weight" in name:
 6327                logger.info(f"Correcting norm value for '{name}'")
 6328                data_torch = data_torch + 1
 6329
 6330            yield from super().modify_tensors(data_torch, name, bid)
 6331
 6332        return # skip other tensors
 6333
 6334
 6335class ConformerAudioModel(MmprojModel):
 6336    _batch_norm_tensors: list[dict[str, Tensor]] | None = None
 6337
 6338    @staticmethod
 6339    def is_audio_tensor(name: str) -> bool:
 6340        return any(p in name for p in ["audio", "codebook", "conformer", "depth_embedding", "depthformer", "depth_linear"])
 6341
 6342    def tensor_force_quant(self, name, new_name, bid, n_dims):
 6343        if ConformerAudioModel.is_audio_tensor(name):
 6344            if ".conv" in name or "_conv" in name and ".weight" in name:
 6345                return gguf.GGMLQuantizationType.F32
 6346        return super().tensor_force_quant(name, new_name, bid, n_dims)
 6347
 6348    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
 6349        # fold running_mean, running_var and eps into weight and bias for batch_norm
 6350        if "batch_norm" in name:
 6351            if self._batch_norm_tensors is None:
 6352                self._batch_norm_tensors = [{} for _ in range(self.block_count)]
 6353            assert bid is not None
 6354            self._batch_norm_tensors[bid][name] = data_torch
 6355
 6356            if len(self._batch_norm_tensors[bid]) < 5:  # wait until all five batch_norm tensors for this block have been collected
 6357                return
 6358
 6359            weight = self._batch_norm_tensors[bid][f"conformer.layers.{bid}.conv.batch_norm.weight"]
 6360            bias = self._batch_norm_tensors[bid][f"conformer.layers.{bid}.conv.batch_norm.bias"]
 6361            running_mean = self._batch_norm_tensors[bid][f"conformer.layers.{bid}.conv.batch_norm.running_mean"]
 6362            running_var = self._batch_norm_tensors[bid][f"conformer.layers.{bid}.conv.batch_norm.running_var"]
 6363            eps = 1e-5 # default value
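                 # folding rationale: at inference BatchNorm computes
                 #   y = (x - running_mean) / sqrt(running_var + eps) * weight + bias
                 # which is the affine map y = a * x + b with a and b as below, so the four buffers
                 # collapse into a single per-channel scale and shift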
 6364
 6365            a = weight / torch.sqrt(running_var + eps)
 6366            b = bias - running_mean * a
 6367            yield from super().modify_tensors(a, f"conformer.layers.{bid}.conv.batch_norm.weight", bid)
 6368            yield from super().modify_tensors(b, f"conformer.layers.{bid}.conv.batch_norm.bias", bid)
 6369            return
 6370
 6371        # reshape conv weights
 6372        if name.startswith("conformer.pre_encode.conv.") and name.endswith(".bias"):
 6373            data_torch = data_torch[:, None, None]
 6374        if "conv.depthwise_conv" in name and name.endswith(".weight"):
 6375            assert data_torch.shape[1] == 1
 6376            data_torch = data_torch.reshape(data_torch.shape[0], data_torch.shape[2])
 6377        if "conv.pointwise_conv" in name and name.endswith(".weight"):
 6378            assert data_torch.shape[2] == 1
 6379            data_torch = data_torch.reshape(data_torch.shape[0], data_torch.shape[1])
 6380
 6381        yield from super().modify_tensors(data_torch, name, bid)
 6382
 6383
 6384@ModelBase.register("Gemma3nForConditionalGeneration")
 6385class Gemma3nVisionAudioModel(ConformerAudioModel):
 6386    has_audio_encoder = True
 6387    has_vision_encoder = True
 6388
 6389    # Double-indexed mapping for MobileNetV5 blocks (not supported by tensor_mapping.py)
 6390    # This is the only known model with this layout, so we implement the mapping here instead of in tensor_mapping.py
 6391    block_tensor_mapping = {
 6392        "model.vision_tower.timm_model.blocks.{bid}.{sid}.conv_exp.weight":             "v.blk.{bid}.{sid}.conv_exp.weight",
 6393        "model.vision_tower.timm_model.blocks.{bid}.{sid}.bn1.weight":                  "v.blk.{bid}.{sid}.bn1.weight",
 6394        "model.vision_tower.timm_model.blocks.{bid}.{sid}.conv_pwl.weight":             "v.blk.{bid}.{sid}.conv_pwl.weight",
 6395        "model.vision_tower.timm_model.blocks.{bid}.{sid}.bn2.weight":                  "v.blk.{bid}.{sid}.bn2.weight",
 6396        "model.vision_tower.timm_model.blocks.{bid}.{sid}.dw_start.conv.weight":        "v.blk.{bid}.{sid}.dw_start.conv.weight",
 6397        "model.vision_tower.timm_model.blocks.{bid}.{sid}.dw_start.bn.weight":          "v.blk.{bid}.{sid}.dw_start.bn.weight",
 6398        "model.vision_tower.timm_model.blocks.{bid}.{sid}.dw_mid.conv.weight":          "v.blk.{bid}.{sid}.dw_mid.conv.weight",
 6399        "model.vision_tower.timm_model.blocks.{bid}.{sid}.dw_mid.bn.weight":            "v.blk.{bid}.{sid}.dw_mid.bn.weight",
 6400        "model.vision_tower.timm_model.blocks.{bid}.{sid}.pw_exp.conv.weight":          "v.blk.{bid}.{sid}.pw_exp.conv.weight",
 6401        "model.vision_tower.timm_model.blocks.{bid}.{sid}.pw_exp.bn.weight":            "v.blk.{bid}.{sid}.pw_exp.bn.weight",
 6402        "model.vision_tower.timm_model.blocks.{bid}.{sid}.pw_proj.conv.weight":         "v.blk.{bid}.{sid}.pw_proj.conv.weight",
 6403        "model.vision_tower.timm_model.blocks.{bid}.{sid}.pw_proj.bn.weight":           "v.blk.{bid}.{sid}.pw_proj.bn.weight",
 6404        "model.vision_tower.timm_model.blocks.{bid}.{sid}.layer_scale.gamma":           "v.blk.{bid}.{sid}.layer_scale.gamma",
 6405        "model.vision_tower.timm_model.blocks.{bid}.{sid}.attn.query.proj.weight":      "v.blk.{bid}.{sid}.attn.query.proj.weight",
 6406        "model.vision_tower.timm_model.blocks.{bid}.{sid}.attn.key.proj.weight":        "v.blk.{bid}.{sid}.attn.key.proj.weight",
 6407        "model.vision_tower.timm_model.blocks.{bid}.{sid}.attn.value.proj.weight":      "v.blk.{bid}.{sid}.attn.value.proj.weight",
 6408        "model.vision_tower.timm_model.blocks.{bid}.{sid}.attn.output.proj.weight":     "v.blk.{bid}.{sid}.attn.output.proj.weight",
 6409        "model.vision_tower.timm_model.blocks.{bid}.{sid}.attn.key.down_conv.weight":   "v.blk.{bid}.{sid}.attn.key.down_conv.weight",
 6410        "model.vision_tower.timm_model.blocks.{bid}.{sid}.attn.key.norm.weight":        "v.blk.{bid}.{sid}.attn.key.norm.weight",
 6411        "model.vision_tower.timm_model.blocks.{bid}.{sid}.attn.value.down_conv.weight": "v.blk.{bid}.{sid}.attn.value.down_conv.weight",
 6412        "model.vision_tower.timm_model.blocks.{bid}.{sid}.attn.value.norm.weight":      "v.blk.{bid}.{sid}.attn.value.norm.weight",
 6413        "model.vision_tower.timm_model.blocks.{bid}.{sid}.norm.weight":                 "v.blk.{bid}.{sid}.norm.weight",
 6414    }
 6415
 6416    def __init__(self, *args, **kwargs):
 6417        # Parent init will call find_hparam which now returns 0 for empty keys
 6418        super().__init__(*args, **kwargs)
 6419        assert self.hparams_vision is not None
 6420        self.hparams_vision["n_layers"] = 128 # fake value for audio encoder, vision encoder doesn't use it
 6421        self.hparams_vision["intermediate_size"] = self.hparams_vision.get("intermediate_size", 2048) * 4
 6422        self.hparams_vision["num_attention_heads"] = self.hparams_vision.get("num_attention_heads", 8)
 6423
 6424        # MobileNetV5 does not use image_mean/std
 6425        self.preprocessor_config["image_mean"] = [0.0 ,0.0 , 0.0]
 6426        self.preprocessor_config["image_std"] = [1.0 ,1.0 ,1.0]
 6427        self.hparams_vision["image_size"] = self.preprocessor_config.get(
 6428            "size", {"height": 768, "width": 768}
 6429        )["height"]
 6430
 6431        # Image sequence length (256 tokens = 16x16 for Gemma3n)
 6432        image_seq_length = self.preprocessor_config.get("image_seq_length", 256)
 6433        image_size = self.hparams_vision["image_size"]
 6434        self.hparams_vision["patch_size"] = image_size // image_seq_length
 6435
 6436        # remap audio hparams
 6437        assert self.hparams_audio is not None
 6438        self.hparams_audio["n_layers"] = self.hparams_audio["conf_num_hidden_layers"]
 6439        self.hparams_audio["num_attention_heads"] = self.hparams_audio["conf_num_attention_heads"]
 6440        self.hparams_audio["feat_in"] = self.hparams_audio["input_feat_size"]
 6441        self.hparams_audio["intermediate_size"] = self.hparams_audio.get("intermediate_size", 6144)
 6442
 6443    def set_gguf_parameters(self):
 6444        super().set_gguf_parameters()
 6445
 6446        # vision params
 6447        self.gguf_writer.add_clip_vision_projector_type(gguf.VisionProjectorType.GEMMA3NV)
 6448        self.gguf_writer.add_vision_attention_layernorm_eps(self.hparams.get("layer_norm_eps", 1e-6))
 6449
 6450        # audio params
 6451        assert self.hparams_audio is not None
 6452        self.gguf_writer.add_clip_audio_projector_type(gguf.VisionProjectorType.GEMMA3NA)
 6453        self.gguf_writer.add_audio_num_mel_bins(self.hparams_audio["feat_in"])
 6454        self.gguf_writer.add_audio_attention_layernorm_eps(1e-5)
 6455
 6456    def tensor_force_quant(self, name, new_name, bid, n_dims):
 6457        # Force quantization settings for specific tensor types
 6458        if "input_projection" in name or "input_proj" in name:
 6459            return gguf.GGMLQuantizationType.F16
 6460        if ".embeddings." in name or "stem" in name:
 6461            return gguf.GGMLQuantizationType.F32
 6462        return super().tensor_force_quant(name, new_name, bid, n_dims)
 6463
 6464    def custom_map(self, name: str) -> str:
 6465        """Parses names like model.vision_tower.timm_model.blocks.1.2.suffix and applies template mapping."""
 6466        parts = name.split(".")
 6467        # MobileNet blocks have at least 7 parts: model, vision_tower, timm_model, blocks, bid, sid, and suffix
 6468        if len(parts) >= 7:
 6469            bid, sid = parts[4], parts[5]
 6470            suffix = ".".join(parts[6:])
 6471            template = f"model.vision_tower.timm_model.blocks.{{bid}}.{{sid}}.{suffix}"
 6472            if template in self.block_tensor_mapping:
 6473                return self.block_tensor_mapping[template].format(bid=bid, sid=sid)
 6474
 6475        raise ValueError(f"Unknown name: {name}")
 6476
 6477    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
 6478        if ConformerAudioModel.is_audio_tensor(name):
 6479            name = name.replace("model.audio_tower.conformer.", "conformer.layers.")
 6480            yield from super().modify_tensors(data_torch, name, bid)
 6481
 6482        # Gemma3n uses
 6483        # - model.embed_vision.* for projection layers
 6484        # - model.vision_tower.* for vision encoder
 6485        # Skip non-vision tensors
 6486        if not (name.startswith("model.embed_vision.") or name.startswith("model.vision_tower.")):
 6487            return
 6488
 6489        if name.startswith("model.vision_tower.timm_model.blocks."):
 6490            # Double-indexed block tensors through custom logic
 6491            yield (self.custom_map(name), data_torch)
 6492            return
 6493        else:
 6494            # Route non-repeating tensors (conv_stem, msfa, embedding, etc.) and anything not caught above through tensor_mapping.py
 6495            new_name = self.map_tensor_name(name)
 6496
 6497        if new_name.endswith("conv_stem.conv.bias") or new_name.endswith("layer_scale.gamma"):
 6498            data_torch = data_torch.unsqueeze(0).unsqueeze(-1).unsqueeze(-1) # [1, C, 1, 1]
 6499
 6500        yield from ModelBase.modify_tensors(self, data_torch, new_name, bid)
 6501
 6502
 6503@ModelBase.register("Gemma3nForCausalLM", "Gemma3nForConditionalGeneration")
 6504class Gemma3NModel(Gemma3Model):
 6505    model_arch = gguf.MODEL_ARCH.GEMMA3N
 6506    norm_shift = 0.0  # same value as the Gemma3p5RMSNorm scale_shift in the reference Python code
 6507
 6508    _altup_proj: list[Tensor] = []
 6509    _altup_unembd: list[Tensor] = []
 6510
 6511    def __init__(self, *args, **kwargs):
 6512        super().__init__(*args, **kwargs)
 6513        assert self.hparams["altup_num_inputs"] == 4, "Current conversion only supports 4 altup inputs"
 6514        self._altup_proj = [
 6515            torch.Tensor(), # to be replaced
 6516            torch.Tensor(), # to be replaced
 6517            torch.Tensor(), # to be replaced
 6518        ]
 6519        self._altup_unembd = [
 6520            torch.Tensor(), # to be replaced
 6521            torch.Tensor(), # to be replaced
 6522            torch.Tensor(), # to be replaced
 6523        ]
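             # The checkpoint stores three altup_projections / altup_unembed_projections matrices
             # (indices .0/.1/.2). modify_tensors fills these lists as the tensors stream in, in any
             # order, and _stack_matrices emits a single [3, hidden_size, hidden_size] tensor once
             # all three slots are populated.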
 6524
 6525    def set_vocab(self):
 6526        # For Gemma3n multimodal models, we need the FULL vocab_size (262400)
 6527        # which includes special tokens from 262144-262399 for vision/audio.
 6528        # The vocab_size_per_layer_input (262144) is only the embedding size per layer.
 6529        # Temporarily override the hparams lookup order to prioritize vocab_size.
 6530
 6531        # Store original vocab_size_per_layer_input if it exists
 6532        vocab_size_per_layer_input = self.hparams.get("vocab_size_per_layer_input")
 6533
 6534        # Temporarily remove vocab_size_per_layer_input to force using vocab_size
 6535        if vocab_size_per_layer_input is not None:
 6536            del self.hparams["vocab_size_per_layer_input"]
 6537
 6538        # Call parent set_vocab which will now use vocab_size (262400)
 6539        super().set_vocab()
 6540
 6541        # Restore vocab_size_per_layer_input for later use
 6542        if vocab_size_per_layer_input is not None:
 6543            self.hparams["vocab_size_per_layer_input"] = vocab_size_per_layer_input
 6544
 6545    def set_gguf_parameters(self):
 6546        super().set_gguf_parameters()
 6547        self.gguf_writer.add_altup_active_idx(self.hparams["altup_active_idx"])
 6548        self.gguf_writer.add_altup_num_inputs(self.hparams["altup_num_inputs"])
 6549        self.gguf_writer.add_embedding_length_per_layer_input(self.hparams["hidden_size_per_layer_input"])
 6550        self.gguf_writer.add_shared_kv_layers(self.hparams["num_kv_shared_layers"])
 6551
 6552        activation_sparsity_scale = []
 6553        for s in self.hparams["activation_sparsity_pattern"]:
 6554            normal_dist = torch.distributions.normal.Normal(0, 1)
 6555            std_multiplier = normal_dist.icdf(torch.tensor(s, dtype=torch.float32))
 6556            activation_sparsity_scale.append(std_multiplier.item())
 6557        self.gguf_writer.add_activation_sparsity_scale(activation_sparsity_scale)
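             # Example (sparsity value assumed for illustration): a target sparsity s = 0.95 gives
             # icdf(0.95) ~= 1.645; roughly speaking, this is used as a cutoff in standard deviations
             # above the mean of the gate activations, below which values are zeroed (keeping ~top 5%)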
 6558
 6559        sliding_window_pattern = []
 6560        for t in self.hparams["layer_types"]:
 6561            sliding_window_pattern.append(t == "sliding_attention")
 6562        self.gguf_writer.add_sliding_window_pattern(sliding_window_pattern)
 6563
 6564    def _stack_matrices(self, matrices: list[Tensor]) -> Tensor | None:
 6565        has_all = all(m.numel() > 0 for m in matrices)
 6566        if not has_all:
 6567            return None
 6568        else:
 6569            return torch.stack(matrices, dim=0)
 6570
 6571    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
 6572        if name.endswith("_scale"):
 6573            name = name + ".weight"
 6574
 6575        # TODO: implement self.prediction_coefs.weight.clamp_(...)
 6576
 6577        if "language_model." not in name:
 6578            return # skip non-language model tensors
 6579
 6580        # Pad token embeddings for vision/audio special tokens (262144-262399)
 6581        if "embed_tokens.weight" in name or "embed_tokens_per_layer" in name:
 6582            # Move to CPU to avoid meta device issues during padding
 6583            data_torch = data_torch.to(device="cpu")
 6584
 6585            vocab_size = self.hparams.get("vocab_size", 262400)
 6586            current_size = data_torch.shape[0]  # First dimension is vocab_size
 6587
 6588            if current_size < vocab_size:
 6589                # Pad with zeros for vision/audio tokens (they get embeddings from vision tower)
 6590                padding_size = vocab_size - current_size
 6591                tensor_type = "per-layer embeddings" if "per_layer" in name else "token embeddings"
 6592                logger.info(f"Padding {tensor_type} shape {list(data_torch.shape)} from {current_size} to {vocab_size} (adding {padding_size} vision/audio token slots)")
 6593
 6594                # Create padding with zeros (vision tokens won't use these embeddings)
 6595                padding = torch.zeros((padding_size, data_torch.shape[1]), dtype=data_torch.dtype, device=data_torch.device)
 6596                data_torch = torch.cat([data_torch, padding], dim=0)
 6597
 6598            # Continue with normal processing
 6599            name = name.replace("language_model.", "")
 6600            yield from ModelBase.modify_tensors(self, data_torch, name, bid)
 6601            return
 6602
 6603        if "altup_unembed_projections" in name:
 6604            data_torch = data_torch.to(device="cpu")
 6605            # altup_unembed matrices are [hidden_size, hidden_size], NOT vocab-based
 6606            # They should NOT be padded
 6607            if ".0." in name:
 6608                self._altup_unembd[0] = data_torch
 6609            elif ".1." in name:
 6610                self._altup_unembd[1] = data_torch
 6611            elif ".2." in name:
 6612                self._altup_unembd[2] = data_torch
 6613            else:
 6614                raise ValueError(f"Unknown name: {name}")
 6615            out = self._stack_matrices(self._altup_unembd)
 6616            if out is not None:
 6617                yield from ModelBase.modify_tensors(self, out, "model.altup_unembed_projections.weight", bid)
 6618                return
 6619            else:
 6620                return
 6621
 6622        if "altup_projections" in name:
 6623            data_torch = data_torch.to(device="cpu")
 6624            if ".0." in name:
 6625                self._altup_proj[0] = data_torch
 6626            elif ".1." in name:
 6627                self._altup_proj[1] = data_torch
 6628            elif ".2." in name:
 6629                self._altup_proj[2] = data_torch
 6630            else:
 6631                raise ValueError(f"Unknown name: {name}")
 6632            out = self._stack_matrices(self._altup_proj)
 6633            if out is not None:
 6634                yield from ModelBase.modify_tensors(self, out, "model.altup_projections.weight", bid)
 6635                return
 6636            else:
 6637                return
 6638
 6639        yield from super().modify_tensors(data_torch, name, bid)
 6640
 6641
 6642@ModelBase.register("Starcoder2ForCausalLM")
 6643class StarCoder2Model(TextModel):
 6644    model_arch = gguf.MODEL_ARCH.STARCODER2
 6645
 6646
 6647@ModelBase.register("Rwkv6ForCausalLM")
 6648class Rwkv6Model(TextModel):
 6649    model_arch = gguf.MODEL_ARCH.RWKV6
 6650
 6651    def set_vocab(self):
 6652        self._set_vocab_rwkv_world()
 6653
 6654    def set_gguf_parameters(self):
 6655        head_size = self.hparams["head_size"]
 6656        hidden_size = self.hparams["hidden_size"]
 6657        layer_norm_eps = self.hparams["layer_norm_epsilon"]
 6658        rescale_every_n_layers = self.hparams["rescale_every"]
 6659        intermediate_size = self.hparams["intermediate_size"] if self.hparams["intermediate_size"] is not None else int((hidden_size * 3.5) // 32 * 32)
 6660        time_mix_extra_dim = 64 if hidden_size == 4096 else 32
 6661        time_decay_extra_dim = 128 if hidden_size == 4096 else 64
 6662
 6663        # RWKV isn't context limited
 6664        self.gguf_writer.add_context_length(1048576)
 6665        self.gguf_writer.add_embedding_length(hidden_size)
 6666        self.gguf_writer.add_block_count(self.block_count)
 6667        self.gguf_writer.add_layer_norm_eps(layer_norm_eps)
 6668        self.gguf_writer.add_rescale_every_n_layers(rescale_every_n_layers)
 6669        self.gguf_writer.add_wkv_head_size(head_size)
 6670        self.gguf_writer.add_time_mix_extra_dim(time_mix_extra_dim)
 6671        self.gguf_writer.add_time_decay_extra_dim(time_decay_extra_dim)
 6672        self.gguf_writer.add_feed_forward_length(intermediate_size)
 6673        self.gguf_writer.add_file_type(self.ftype)
 6674
 6675        # required by llama.cpp, unused
 6676        self.gguf_writer.add_head_count(0)
 6677
 6678    lerp_weights: dict[int, dict[str, Tensor]] = {}
 6679
 6680    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
 6681        new_name = self.map_tensor_name(name)
 6682
 6683        if not (new_name.endswith(".weight") or new_name.endswith(".bias")):
 6684            new_name += ".weight"
 6685
 6686        if new_name.endswith("time_mix_w1.weight") or new_name.endswith("time_mix_decay_w1.weight") or new_name.endswith("time_mix_decay_w2.weight"):
 6687            data_torch = data_torch.transpose(0, 1)
 6688
 6689        if new_name.endswith("time_mix_w2.weight"):
 6690            data_torch = data_torch.permute(0, 2, 1)
 6691
 6692        if new_name.endswith("time_mix_decay.weight") or "lerp" in new_name:
 6693            data_torch = data_torch.squeeze()
 6694
 6695        try:
 6696            rescale_every_n_layers = self.hparams["rescale_every"]
 6697            if rescale_every_n_layers > 0:
 6698                if new_name.endswith("time_mix_output.weight") or new_name.endswith("channel_mix_value.weight"):
 6699                    data_torch = data_torch.div_(2 ** int(bid // rescale_every_n_layers))
 6700        except KeyError:
 6701            pass
 6702
 6703        # concat time_mix_lerp weights to reduce some cpu overhead
 6704        # also reduces the number of tensors in the model
 6705        if bid is not None and "time_mix_lerp" in new_name and "time_mix_lerp_x" not in new_name:
 6706            try:
 6707                self.lerp_weights[bid][new_name] = data_torch
 6708            except KeyError:
 6709                self.lerp_weights[bid] = {new_name: data_torch}
 6710            if all(f"blk.{bid}.time_mix_lerp_{i}.weight" in self.lerp_weights[bid].keys() for i in ["w", "k", "v", "r", "g"]):
 6711                new_name = f"blk.{bid}.time_mix_lerp_fused.weight"
 6712                data = torch.stack([self.lerp_weights[bid][f"blk.{bid}.time_mix_lerp_{i}.weight"].unsqueeze(0) for i in ["w", "k", "v", "r", "g"]], dim=0).unsqueeze(1)
 6713                yield (new_name, data)
 6714            return
 6715
 6716        yield (new_name, data_torch)
 6717
 6718
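# A minimal sketch (not upstream code) of the lerp fusion performed above: the five
# per-block time_mix_lerp vectors are stacked once at conversion time so the runtime
# loads a single fused tensor. Shapes assume each lerp weight is a 1-D (n_embd,) vector.
def _fuse_time_mix_lerp_sketch(lerps: dict[str, Tensor], order: tuple[str, ...] = ("w", "k", "v", "r", "g")) -> Tensor:
    # (n_embd,) -> (1, n_embd) per entry, stack to (5, 1, n_embd), then (5, 1, 1, n_embd)
    return torch.stack([lerps[k].unsqueeze(0) for k in order], dim=0).unsqueeze(1)
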
 6719@ModelBase.register("RWKV6Qwen2ForCausalLM")
 6720class RWKV6Qwen2Model(Rwkv6Model):
 6721    model_arch = gguf.MODEL_ARCH.RWKV6QWEN2
 6722
 6723    def set_vocab(self):
 6724        try:
 6725            self._set_vocab_sentencepiece()
 6726        except FileNotFoundError:
 6727            self._set_vocab_gpt2()
 6728
 6729    def set_gguf_parameters(self):
 6730        num_attention_heads = self.hparams["num_attention_heads"]
 6731        num_key_value_heads = self.hparams["num_key_value_heads"]
 6732        hidden_size = self.hparams["hidden_size"]
 6733        head_size = hidden_size // num_attention_heads
 6734        rms_norm_eps = self.hparams["rms_norm_eps"]
 6735        intermediate_size = self.hparams["intermediate_size"]
 6736        time_mix_extra_dim = self.hparams.get("lora_rank_tokenshift", 64 if hidden_size >= 4096 else 32)
 6737        time_decay_extra_dim = self.hparams.get("lora_rank_decay", 128 if hidden_size >= 4096 else 64)
 6738
 6739        # RWKV isn't context limited
 6740        self.gguf_writer.add_context_length(1048576)
 6741        self.gguf_writer.add_embedding_length(hidden_size)
 6742        self.gguf_writer.add_block_count(self.block_count)
 6743        self.gguf_writer.add_wkv_head_size(head_size)
 6744        self.gguf_writer.add_time_mix_extra_dim(time_mix_extra_dim)
 6745        self.gguf_writer.add_time_decay_extra_dim(time_decay_extra_dim)
 6746        self.gguf_writer.add_feed_forward_length(intermediate_size)
 6747        self.gguf_writer.add_file_type(self.ftype)
 6748
 6749        # special parameters for time_mixing in RWKV6QWEN2
 6750        self.gguf_writer.add_layer_norm_rms_eps(rms_norm_eps)
 6751        self.gguf_writer.add_token_shift_count(1)
 6752        # RWKV6QWEN2 uses grouped key/value heads like GQA
 6753        self.gguf_writer.add_head_count_kv(num_key_value_heads)
 6754
 6755        # required by llama.cpp, unused
 6756        self.gguf_writer.add_head_count(0)
 6757
 6758    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
 6759        for new_name, data in super().modify_tensors(data_torch, name, bid):
 6760            if "time_mix_w1" in new_name or "time_mix_w2" in new_name:
 6761                data = data.view(5, -1, data.shape[-1])
 6762                # rwkv6qwen2 stores these in r,k,v,w,g order instead of the original w,k,v,r,g
 6763                # permute them here to avoid code changes elsewhere
 6764                data = torch.stack([data[3], data[1], data[2], data[0], data[4]], dim=0).view(-1, data.shape[-1])
 6765                if "w2" in new_name:
 6766                    data = data.view(5, -1, data.shape[-1])
 6767                yield (new_name, data)
 6768                continue
 6769            yield (new_name, data)
 6770
 6771
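# Sketch only (hypothetical helper): the reordering above splits the stacked low-rank
# projection into five equal slices and swaps the "r" and "w" slices, turning
# RWKV6Qwen2's r,k,v,w,g layout into the w,k,v,r,g layout the rest of the converter
# (and llama.cpp) expects.
def _reorder_rkvwg_to_wkvrg_sketch(w: Tensor) -> Tensor:
    parts = w.view(5, -1, w.shape[-1])          # (5, slice_rows, last_dim)
    reordered = torch.stack([parts[3], parts[1], parts[2], parts[0], parts[4]], dim=0)
    return reordered.view(-1, w.shape[-1])      # back to the concatenated 2-D layout
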
 6772@ModelBase.register("Rwkv7ForCausalLM", "RWKV7ForCausalLM")
 6773class Rwkv7Model(TextModel):
 6774    model_arch = gguf.MODEL_ARCH.RWKV7
 6775
 6776    def set_vocab(self):
 6777        self._set_vocab_rwkv_world()
 6778
 6779    def calc_lora_rank(self, hidden_size, exponent, multiplier):
 6780        return max(1, round(hidden_size ** exponent * multiplier / 32)) * 32
 6781
 6782    def set_gguf_parameters(self):
 6783        try:
 6784            head_size = self.hparams["head_size"]
 6785            layer_norm_eps = self.hparams["layer_norm_epsilon"]
 6786        except KeyError:
 6787            head_size = self.hparams["head_dim"]
 6788            layer_norm_eps = self.hparams["norm_eps"]
 6789        hidden_size = self.hparams["hidden_size"]
 6790        intermediate_size = self.hparams["intermediate_size"] if self.hparams["intermediate_size"] is not None else (hidden_size * 4)
 6791
 6792        # ICLR: In-Context-Learning-Rate
 6793        try:
 6794            lora_rank_decay = self.hparams["lora_rank_decay"] if self.hparams["lora_rank_decay"] is not None else self.calc_lora_rank(hidden_size, 0.5, 1.8)
 6795            lora_rank_iclr = self.hparams["lora_rank_iclr"] if self.hparams["lora_rank_iclr"] is not None else self.calc_lora_rank(hidden_size, 0.5, 1.8)
 6796            lora_rank_value_residual_mix = self.hparams["lora_rank_value_residual_mix"] if self.hparams["lora_rank_value_residual_mix"] is not None else self.calc_lora_rank(hidden_size, 0.5, 1.3)
 6797            lora_rank_gate = self.hparams["lora_rank_gate"] if self.hparams["lora_rank_gate"] is not None else self.calc_lora_rank(hidden_size, 0.8, 0.6)
 6798        except KeyError:
 6799            lora_rank_decay = self.hparams["decay_low_rank_dim"] if self.hparams["decay_low_rank_dim"] is not None else self.calc_lora_rank(hidden_size, 0.5, 1.8)
 6800            lora_rank_iclr = self.hparams["a_low_rank_dim"] if self.hparams["a_low_rank_dim"] is not None else self.calc_lora_rank(hidden_size, 0.5, 1.8)
 6801            lora_rank_value_residual_mix = self.hparams["v_low_rank_dim"] if self.hparams["v_low_rank_dim"] is not None else self.calc_lora_rank(hidden_size, 0.5, 1.3)
 6802            lora_rank_gate = self.hparams["gate_low_rank_dim"] if self.hparams["gate_low_rank_dim"] is not None else self.calc_lora_rank(hidden_size, 0.8, 0.6)
 6803
 6804        # RWKV isn't context limited
 6805        self.gguf_writer.add_context_length(1048576)
 6806        self.gguf_writer.add_embedding_length(hidden_size)
 6807        self.gguf_writer.add_block_count(self.block_count)
 6808        self.gguf_writer.add_layer_norm_eps(layer_norm_eps)
 6809        self.gguf_writer.add_wkv_head_size(head_size)
 6810        self.gguf_writer.add_decay_lora_rank(lora_rank_decay)
 6811        self.gguf_writer.add_iclr_lora_rank(lora_rank_iclr)
 6812        self.gguf_writer.add_value_residual_mix_lora_rank(lora_rank_value_residual_mix)
 6813        self.gguf_writer.add_gate_lora_rank(lora_rank_gate)
 6814        self.gguf_writer.add_feed_forward_length(intermediate_size)
 6815        self.gguf_writer.add_file_type(self.ftype)
 6816
 6817        # required by llama.cpp, unused
 6818        self.gguf_writer.add_head_count(0)
 6819
 6820    lerp_weights: dict[int, dict[str, Tensor]] = {}
 6821    lora_needs_transpose: bool = True
 6822
 6823    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
 6824        # unify tensor names here to make life easier
 6825        name = name.replace("blocks", "layers").replace("ffn", "feed_forward")
 6826        name = name.replace("self_attn", "attention").replace("attn", "attention")
 6827        name = name.replace("time_mixer.", "")
 6828        # lora layer names in fla-hub's impl
 6829        if "_lora.lora" in name:
 6830            self.lora_needs_transpose = False
 6831        name = name.replace("_lora.lora.0.weight", "1.weight")
 6832        name = name.replace("_lora.lora.2.weight", "2.weight")
 6833        name = name.replace("_lora.lora.2.bias", "0.weight")
 6834
 6835        name = name.replace("feed_forward_norm", "ln2")
 6836        name = name.replace("g_norm", "ln_x")
 6837
 6838        if "attention.v" in name and "value" not in self.map_tensor_name(name) and bid == 0:
 6839            # some models have dummy v0/v1/v2 on first layer while others don't
 6840            # ignore them all since they are not used
 6841            return
 6842
 6843        wkv_has_gate = self.hparams.get("wkv_has_gate", True)
 6844        lerp_list = ["r", "w", "k", "v", "a", "g"] if wkv_has_gate else ["r", "w", "k", "v", "a"]
 6845
 6846        if bid is not None and "attention.x_" in name:
 6847            if "attention.x_x" in name:
 6848                # already concatenated
 6849                new_name = f"blk.{bid}.time_mix_lerp_fused.weight"
 6850                data = data_torch.reshape(len(lerp_list), 1, 1, -1)
 6851                yield (new_name, data)
 6852            else:
 6853                try:
 6854                    self.lerp_weights[bid][name] = data_torch
 6855                except KeyError:
 6856                    self.lerp_weights[bid] = {name: data_torch}
 6857                if all(f"model.layers.{bid}.attention.x_{i}" in self.lerp_weights[bid].keys() for i in lerp_list):
 6858                    new_name = f"blk.{bid}.time_mix_lerp_fused.weight"
 6859                    data = torch.stack([self.lerp_weights[bid][f"model.layers.{bid}.attention.x_{i}"] for i in lerp_list], dim=0)
 6860                    yield (new_name, data)
 6861            return
 6862        else:
 6863            data_torch = data_torch.squeeze()
 6864            new_name = self.map_tensor_name(name)
 6865
 6866            if not (new_name.endswith(".weight") or new_name.endswith(".bias")):
 6867                new_name += ".weight"
 6868
 6869            if self.lora_needs_transpose and any(
 6870                new_name.endswith(t) for t in [
 6871                    "time_mix_w1.weight", "time_mix_w2.weight",
 6872                    "time_mix_a1.weight", "time_mix_a2.weight",
 6873                    "time_mix_v1.weight", "time_mix_v2.weight",
 6874                    "time_mix_g1.weight", "time_mix_g2.weight",
 6875                ]
 6876            ):
 6877                data_torch = data_torch.transpose(0, 1)
 6878
 6879            if 'r_k' in new_name:
 6880                data_torch = data_torch.flatten()
 6881
 6882            if bid == 0 and "time_mix_a" in new_name:
 6883                # dummy v0/v1/v2 on first layer
 6884                # easiest way to make llama.cpp happy
 6885                yield (new_name.replace("time_mix_a", "time_mix_v"), data_torch)
 6886
 6887            yield (new_name, data_torch)
 6888
 6889
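# Worked example (values assumed, not taken from any particular checkpoint) of the
# calc_lora_rank() formula used above: ranks are snapped to a multiple of 32, with a
# floor of 32.
def _calc_lora_rank_sketch(hidden_size: int, exponent: float, multiplier: float) -> int:
    return max(1, round(hidden_size ** exponent * multiplier / 32)) * 32

# e.g. hidden_size=2048: 2048**0.5 * 1.8 / 32 ~= 2.55 -> round to 3 -> 3 * 32 = 96
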
 6890@ModelBase.register("RwkvHybridForCausalLM")
 6891class ARwkv7Model(Rwkv7Model):
 6892    model_arch = gguf.MODEL_ARCH.ARWKV7
 6893
 6894    def set_vocab(self):
 6895        try:
 6896            self._set_vocab_sentencepiece()
 6897        except FileNotFoundError:
 6898            self._set_vocab_gpt2()
 6899
 6900    def set_gguf_parameters(self):
 6901        hidden_size = self.hparams["hidden_size"]
 6902        head_size = self.hparams["head_size"]
 6903        rms_norm_eps = self.hparams["rms_norm_eps"]
 6904        intermediate_size = self.hparams["intermediate_size"]
 6905        wkv_has_gate = self.hparams["wkv_has_gate"]
 6906        assert self.hparams["wkv_version"] == 7
 6907
 6908        # ICLR: In-Context-Learning-Rate
 6909        lora_rank_decay = 64
 6910        lora_rank_iclr = 64
 6911        lora_rank_value_residual_mix = 32
 6912        lora_rank_gate = 128 if wkv_has_gate else 0
 6913
 6914        # RWKV isn't context limited
 6915        self.gguf_writer.add_context_length(1048576)
 6916        self.gguf_writer.add_embedding_length(hidden_size)
 6917        self.gguf_writer.add_block_count(self.block_count)
 6918        self.gguf_writer.add_layer_norm_rms_eps(rms_norm_eps)
 6919        self.gguf_writer.add_wkv_head_size(head_size)
 6920        self.gguf_writer.add_decay_lora_rank(lora_rank_decay)
 6921        self.gguf_writer.add_iclr_lora_rank(lora_rank_iclr)
 6922        self.gguf_writer.add_value_residual_mix_lora_rank(lora_rank_value_residual_mix)
 6923        self.gguf_writer.add_gate_lora_rank(lora_rank_gate)
 6924        self.gguf_writer.add_feed_forward_length(intermediate_size)
 6925        self.gguf_writer.add_file_type(self.ftype)
 6926        self.gguf_writer.add_token_shift_count(1)
 6927
 6928        # required by llama.cpp, unused
 6929        self.gguf_writer.add_head_count(0)
 6930
 6931
 6932@ModelBase.register("MaincoderForCausalLM")
 6933class MaincoderModel(TextModel):
 6934    model_arch = gguf.MODEL_ARCH.MAINCODER
 6935
 6936    def set_gguf_parameters(self):
 6937        super().set_gguf_parameters()
 6938
 6939        if (head_dim := self.hparams.get("head_dim")) is not None:
 6940            self.gguf_writer.add_rope_dimension_count(head_dim)
 6941
 6942
 6943@ModelBase.register("MambaForCausalLM", "MambaLMHeadModel", "FalconMambaForCausalLM")
 6944class MambaModel(TextModel):
 6945    model_arch = gguf.MODEL_ARCH.MAMBA
 6946
 6947    def __init__(self, dir_model: Path, *args, **kwargs):
 6948        # Avoid using AutoConfig for hparams
 6949        hparams = kwargs.pop("hparams", None)
 6950        if hparams is None:
 6951            with open(dir_model / "config.json", "r", encoding="utf-8") as f:
 6952                hparams = json.load(f)
 6953        super().__init__(dir_model, *args, hparams=hparams, **kwargs)
 6954
 6955    def set_vocab(self):
 6956        vocab_size = self.hparams["vocab_size"]
 6957        # Round vocab size to next multiple of 8
 6958        pad_vocab = self.hparams.get("pad_vocab_size_multiple", 8)
 6959        # pad using ceiling division
 6960        # ref: https://stackoverflow.com/a/17511341/22827863
 6961        vocab_size = -(vocab_size // -pad_vocab) * pad_vocab
 6962        self.hparams["vocab_size"] = vocab_size
 6963
 6964        if (self.dir_model / "tokenizer.json").is_file():
 6965            self._set_vocab_gpt2()
 6966        elif (self.dir_model / "tokenizer.model").is_file():
 6967            self._set_vocab_sentencepiece()
 6968        else:
 6969            # Use the GPT-NeoX tokenizer when no tokenizer files are present
 6970            self._set_vocab_builtin("gpt-neox", vocab_size)
 6971
 6972    def set_gguf_parameters(self):
 6973        d_model = self.find_hparam(["hidden_size",       "d_model"])
 6974        d_conv  = self.find_hparam(["conv_kernel",       "d_conv"],  optional=True) or 4
 6975        d_inner = self.find_hparam(["intermediate_size", "d_inner"], optional=True) or 2 * d_model
 6976        d_state = self.find_hparam(["state_size",        "d_state"], optional=True) or 16
 6977        # ceiling division
 6978        # ref: https://stackoverflow.com/a/17511341/22827863
 6979        # ref: https://github.com/state-spaces/mamba/blob/ce59daea3a090d011d6476c6e5b97f6d58ddad8b/mamba_ssm/modules/mamba_simple.py#L58
 6980        dt_rank      = self.find_hparam(["time_step_rank",     "dt_rank"],      optional=True) or -(d_model // -16)
 6981        rms_norm_eps = self.find_hparam(["layer_norm_epsilon", "rms_norm_eps"], optional=True) or 1e-5
 6982        use_dt_b_c_norm = False
 6983        # For FalconMamba we do apply RMS norm on the B, DT and C layers
 6984        if self.find_hparam(["model_type"], optional=True) in ("falcon_mamba",):
 6985            use_dt_b_c_norm = True
 6986        # Fail early for models which don't have a block expansion factor of 2
 6987        assert d_inner == 2 * d_model
 6988
 6989        self.gguf_writer.add_context_length(2**20) # arbitrary value; for those who use the default
 6990        self.gguf_writer.add_embedding_length(d_model)
 6991        self.gguf_writer.add_feed_forward_length(0) # unused, but seemingly required when loading
 6992        self.gguf_writer.add_head_count(0) # unused, but seemingly required when loading
 6993        self.gguf_writer.add_block_count(self.block_count)
 6994        self.gguf_writer.add_ssm_conv_kernel(d_conv)
 6995        self.gguf_writer.add_ssm_inner_size(d_inner)
 6996        self.gguf_writer.add_ssm_state_size(d_state)
 6997        self.gguf_writer.add_ssm_time_step_rank(dt_rank)
 6998        self.gguf_writer.add_layer_norm_rms_eps(rms_norm_eps)
 6999        self.gguf_writer.add_ssm_dt_b_c_rms(use_dt_b_c_norm) # For classic Mamba we don't apply rms norm on B / DT layers
 7000        self.gguf_writer.add_file_type(self.ftype)
 7001
 7002    _tok_embd = None
 7003
 7004    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
 7005        output_name = self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT)
 7006        tok_embd_name = self.format_tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD)
 7007
 7008        new_name = self.map_tensor_name(name)
 7009
 7010        if name.endswith(".A_log"):
 7011            logger.debug("A_log --> A ==> " + new_name)
 7012            data_torch = -torch.exp(data_torch)
 7013
 7014        # [4 1 8192 1] -> [4 8192 1 1]
 7015        if self.match_model_tensor_name(new_name, gguf.MODEL_TENSOR.SSM_CONV1D, bid):
 7016            data_torch = data_torch.squeeze()
 7017
 7018        # assuming token_embd.weight is seen before output.weight
 7019        if self._tok_embd is not None and new_name == output_name:
 7020            if torch.equal(self._tok_embd, data_torch):
 7021                logger.debug(f"{output_name} is equivalent to {tok_embd_name}, omitting")
 7022                return
 7023        elif new_name == tok_embd_name:
 7024            self._tok_embd = data_torch
 7025
 7026        yield from super().modify_tensors(data_torch, new_name, bid)
 7027
 7028
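# A small sketch of the ceiling-division vocab padding used in set_vocab() above
# (the SO reference explains the trick); the example value below is illustrative only.
def _pad_vocab_size_sketch(vocab_size: int, multiple: int = 8) -> int:
    # -(a // -b) is ceiling division without importing math
    return -(vocab_size // -multiple) * multiple

# e.g. _pad_vocab_size_sketch(50277, 8) == 50280
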
 7029@ModelBase.register("Mamba2ForCausalLM")
 7030class Mamba2Model(TextModel):
 7031    model_arch = gguf.MODEL_ARCH.MAMBA2
 7032
 7033    def __init__(self, dir_model: Path, *args, **kwargs):
 7034        # Avoid using AutoConfig for hparams
 7035        # It wrongly assumes all Mamba2 models are Mamba-Codestral-7B-v0.1
 7036        hparams = kwargs.pop("hparams", None)
 7037        if hparams is None:
 7038            with open(dir_model / "config.json", "r", encoding="utf-8") as f:
 7039                hparams = json.load(f)
 7040        super().__init__(dir_model, *args, hparams=hparams, **kwargs)
 7041        self.d_model = self.find_hparam(["hidden_size", "d_model", "dim"])
 7042        self.d_inner = self.find_hparam(["mamba_d_ssm", "intermediate_size", "d_inner"], optional=True) or 2 * self.d_model
 7043        self.n_group = self.find_hparam(["n_groups"], optional=True) or 1
 7044
 7045    def set_vocab(self):
 7046        vocab_size = self.hparams["vocab_size"]
 7047        # Round vocab size to next multiple of 16
 7048        pad_vocab = self.hparams.get("pad_vocab_size_multiple", 16)
 7049        # pad using ceiling division
 7050        # ref: https://stackoverflow.com/a/17511341/22827863
 7051        vocab_size = -(vocab_size // -pad_vocab) * pad_vocab
 7052        self.hparams["vocab_size"] = vocab_size
 7053
 7054        if (self.dir_model / "tokenizer.model").is_file():
 7055            self._set_vocab_sentencepiece()
 7056        elif (self.dir_model / "tokenizer.model.v3").is_file():
 7057            # mamba-codestral
 7058            raise NotImplementedError(f"Please rename {self.dir_model / 'tokenizer.model.v3'} to {self.dir_model / 'tokenizer.model'}")
 7059        elif (self.dir_model / "tokenizer.json").is_file():
 7060            self._set_vocab_gpt2()
 7061        else:
 7062            # Use the GPT-NeoX tokenizer when no tokenizer files are present
 7063            self._set_vocab_builtin("gpt-neox", vocab_size)
 7064
 7065    def set_gguf_parameters(self):
 7066        d_conv  = self.find_hparam(["conv_kernel", "d_conv"],     optional=True) or 4
 7067        d_state = self.find_hparam(["state_size",  "d_state"],    optional=True) or 128
 7068        head_dim = self.find_hparam(["mamba_d_head", "head_dim"], optional=True) or 64
 7069
 7070        rms_norm_eps = self.find_hparam(["layer_norm_epsilon", "rms_norm_eps"], optional=True) or 1e-5
 7071
 7072        # Fail early for models which don't have a block expansion factor of 2
 7073        # TODO: does this really matter?
 7074        # skip the assertion for FalconH1 Model
 7075        if self.model_arch != gguf.MODEL_ARCH.FALCON_H1:
 7076            assert self.d_inner == 2 * self.d_model
 7077            assert self.d_inner % head_dim == 0
 7078
 7079        self.gguf_writer.add_context_length(2**20)  # arbitrary value; for those who use the default
 7080        self.gguf_writer.add_embedding_length(self.d_model)
 7081        self.gguf_writer.add_feed_forward_length(0)  # unused, but seemingly required when loading
 7082        self.gguf_writer.add_head_count(0)  # unused, but seemingly required when loading
 7083        self.gguf_writer.add_block_count(self.block_count)
 7084        self.gguf_writer.add_ssm_conv_kernel(d_conv)
 7085        self.gguf_writer.add_ssm_inner_size(self.d_inner)
 7086        self.gguf_writer.add_ssm_state_size(d_state)
 7087        self.gguf_writer.add_ssm_time_step_rank(self.d_inner // head_dim)
 7088        self.gguf_writer.add_ssm_group_count(self.n_group)
 7089        self.gguf_writer.add_layer_norm_rms_eps(rms_norm_eps)
 7090        self.gguf_writer.add_file_type(self.ftype)
 7091
 7092    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
 7093
 7094        if name.startswith("model.backbone") or name.startswith("model.lm_head"):
 7095            # map Mamba-Codestral-7B-v0.1 tensor names to the names used by Mamba-2
 7096            name = name.removeprefix("model.")
 7097
 7098        if name.endswith(".dt_bias"):
 7099            name = name.rpartition(".dt_bias")[0] + ".dt_proj.bias"
 7100
 7101        new_name = self.map_tensor_name(name)
 7102
 7103        if self.match_model_tensor_name(new_name, gguf.MODEL_TENSOR.SSM_CONV1D, bid):
 7104            data_torch = data_torch.squeeze()
 7105        elif any(self.match_model_tensor_name(new_name, t, bid, suffix="") for t in [
 7106            gguf.MODEL_TENSOR.SSM_A,
 7107            gguf.MODEL_TENSOR.SSM_D,
 7108        ]):
 7109            # unsqueeze A to use similar shape semantics as Mamba-1
 7110            # (D is also unsqueezed, but for more straightforward broadcast internally)
 7111            data_torch = data_torch.reshape((*data_torch.shape, 1))
 7112        elif self.match_model_tensor_name(new_name, gguf.MODEL_TENSOR.SSM_NORM, bid):
 7113            data_torch = data_torch.reshape((self.n_group, self.d_inner // self.n_group))
 7114
 7115        if name.endswith(".A_log"):
 7116            logger.debug("A_log --> A ==> " + new_name)
 7117            data_torch = -torch.exp(data_torch)
 7118
 7119        yield (new_name, data_torch)
 7120
 7121
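# Sketch (hypothetical helper) of the A_log handling above: the converter stores
# A = -exp(A_log), and for Mamba-2 an extra trailing dimension is added so A (and D)
# broadcast the same way as in Mamba-1.
def _ssm_a_from_a_log_sketch(a_log: Tensor) -> Tensor:
    return (-torch.exp(a_log)).reshape((*a_log.shape, 1))
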
 7122@ModelBase.register("JambaForCausalLM")
 7123class JambaModel(TextModel):
 7124    model_arch = gguf.MODEL_ARCH.JAMBA
 7125
 7126    def set_vocab(self):
 7127        if (self.dir_model / "tokenizer.model").is_file():
 7128            self._set_vocab_sentencepiece()
 7129        else:
 7130            self._set_vocab_llama_hf()
 7131            self.gguf_writer.add_add_space_prefix(False)
 7132
 7133    def set_gguf_parameters(self):
 7134        d_model = self.find_hparam(["hidden_size", "mamba_d_model"])
 7135        d_conv  = self.find_hparam(["mamba_d_conv"],  optional=True) or 4
 7136        d_inner = self.hparams["mamba_expand"] * d_model
 7137        d_state = self.find_hparam(["mamba_d_state"], optional=True) or 16
 7138        # ceiling division
 7139        # ref: https://stackoverflow.com/a/17511341/22827863
 7140        # ref: https://github.com/state-spaces/mamba/blob/ce59daea3a090d011d6476c6e5b97f6d58ddad8b/mamba_ssm/modules/mamba_simple.py#L58
 7141        dt_rank      = self.find_hparam(["mamba_dt_rank"], optional=True) or -(d_model // -16)
 7142        rms_norm_eps = self.find_hparam(["layer_norm_epsilon", "rms_norm_eps"], optional=True) or 1e-6
 7143        n_kv_head = self.hparams["num_key_value_heads"]
 7144        attn_offset = self.hparams["attn_layer_offset"]
 7145        attn_period = self.hparams["attn_layer_period"]
 7146        n_kv_vec = [0 for _ in range(attn_offset)] + [
 7147            n_kv_head if (i - attn_offset) % attn_period == 0 else 0 for i in range(attn_offset, self.block_count)
 7148        ]
 7149
 7150        self.gguf_writer.add_block_count(self.block_count)
 7151        self.gguf_writer.add_context_length(self.find_hparam(["max_position_embeddings", "n_ctx"]))
 7152        self.gguf_writer.add_embedding_length(d_model)
 7153        self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])
 7154        self.gguf_writer.add_head_count(self.hparams["num_attention_heads"])
 7155        self.gguf_writer.add_head_count_kv(n_kv_vec)
 7156        self.gguf_writer.add_ssm_conv_kernel(d_conv)
 7157        self.gguf_writer.add_ssm_inner_size(d_inner)
 7158        self.gguf_writer.add_ssm_state_size(d_state)
 7159        self.gguf_writer.add_ssm_time_step_rank(dt_rank)
 7160        self.gguf_writer.add_layer_norm_rms_eps(rms_norm_eps)
 7161        self.gguf_writer.add_expert_count(self.hparams["num_experts"])
 7162        self.gguf_writer.add_expert_used_count(self.hparams["num_experts_per_tok"])
 7163        self.gguf_writer.add_file_type(self.ftype)
 7164
 7165    _experts: list[dict[str, Tensor]] | None = None
 7166
 7167    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
 7168
 7169        # Mini-Jamba
 7170        name = name.replace(".moe.", ".feed_forward.")
 7171        if bid is not None:
 7172            moe_offset = self.hparams["expert_layer_offset"]
 7173            moe_period = self.hparams["expert_layer_period"]
 7174
 7175            if not (bid >= moe_offset and (bid - moe_offset) % moe_period == 0):
 7176                name = name.replace(".experts.0.", ".")
 7177
 7178        # process the experts separately
 7179        if ".feed_forward.experts." in name:
 7180            n_experts = self.hparams["num_experts"]
 7181
 7182            assert bid is not None
 7183
 7184            if self._experts is None:
 7185                self._experts = [{} for _ in range(self.block_count)]
 7186
 7187            self._experts[bid][name] = data_torch
 7188
 7189            if len(self._experts[bid]) >= n_experts * 3:
 7190
 7191                # merge the experts into a single 3d tensor
 7192                for wid in ["down_proj", "gate_proj", "up_proj"]:
 7193                    datas: list[Tensor] = []
 7194
 7195                    for xid in range(n_experts):
 7196                        ename = f"model.layers.{bid}.feed_forward.experts.{xid}.{wid}.weight"
 7197                        datas.append(self._experts[bid][ename])
 7198                        del self._experts[bid][ename]
 7199
 7200                    data_torch = torch.stack(datas, dim=0)
 7201
 7202                    # using the same merged name as qwen2moe
 7203                    merged_name = f"model.layers.{bid}.mlp.experts.{wid}.weight"
 7204
 7205                    new_name = self.map_tensor_name(merged_name)
 7206
 7207                    yield new_name, data_torch
 7208            return
 7209
 7210        new_name = self.map_tensor_name(name)
 7211
 7212        if self.match_model_tensor_name(new_name, gguf.MODEL_TENSOR.SSM_CONV1D, bid):
 7213            data_torch = data_torch.squeeze()
 7214
 7215        if name.endswith(".A_log"):
 7216            logger.debug("A_log --> A ==> " + new_name)
 7217            data_torch = -torch.exp(data_torch)
 7218
 7219        yield (new_name, data_torch)
 7220
 7221    def prepare_tensors(self):
 7222        super().prepare_tensors()
 7223
 7224        if self._experts is not None:
 7225            # flatten `list[dict[str, Tensor]]` into `list[str]`
 7226            experts = [k for d in self._experts for k in d.keys()]
 7227            if len(experts) > 0:
 7228                raise ValueError(f"Unprocessed experts: {experts}")
 7229
 7230
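# Sketch of the per-layer KV-head vector built in set_gguf_parameters() above: attention
# layers (every attn_layer_period-th layer starting at attn_layer_offset) report their
# KV heads, Mamba layers report 0. The example numbers below are assumed, not from a config.
def _jamba_kv_head_pattern_sketch(n_layer: int, n_kv_head: int, offset: int, period: int) -> list[int]:
    return [n_kv_head if i >= offset and (i - offset) % period == 0 else 0 for i in range(n_layer)]

# e.g. _jamba_kv_head_pattern_sketch(8, 8, 4, 8) -> [0, 0, 0, 0, 8, 0, 0, 0]
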
 7231@ModelBase.register("CohereForCausalLM")
 7232class CommandR2Model(TextModel):
 7233    model_arch = gguf.MODEL_ARCH.COMMAND_R
 7234
 7235    def __init__(self, *args, **kwargs):
 7236        super().__init__(*args, **kwargs)
 7237
 7238        # max_position_embeddings = 8192 in config.json but model was actually
 7239        # trained on 128k context length
 7240        # aya-23 models don't have model_max_length specified
 7241        self.hparams["max_position_embeddings"] = self.find_hparam(["model_max_length", "max_position_embeddings"])
 7242
 7243    def set_gguf_parameters(self):
 7244        super().set_gguf_parameters()
 7245        self.gguf_writer.add_logit_scale(self.hparams["logit_scale"])
 7246        self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
 7247
 7248
 7249@ModelBase.register("Cohere2ForCausalLM")
 7250class Cohere2Model(TextModel):
 7251    model_arch = gguf.MODEL_ARCH.COHERE2
 7252
 7253    def set_gguf_parameters(self):
 7254        super().set_gguf_parameters()
 7255
 7256        self.gguf_writer.add_logit_scale(self.hparams["logit_scale"])
 7257        self.gguf_writer.add_sliding_window(self.hparams["sliding_window"])
 7258        self.gguf_writer.add_vocab_size(self.hparams["vocab_size"])
 7259
 7260        rotary_pct = self.hparams["rotary_pct"]
 7261        hidden_size = self.hparams["hidden_size"]
 7262        num_attention_heads = self.hparams["num_attention_heads"]
 7263        self.gguf_writer.add_rope_dimension_count(int(rotary_pct * (hidden_size // num_attention_heads)))
 7264        self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
 7265
 7266
 7267@ModelBase.register("OlmoForCausalLM")
 7268@ModelBase.register("OLMoForCausalLM")
 7269class OlmoModel(TextModel):
 7270    model_arch = gguf.MODEL_ARCH.OLMO
 7271
 7272    def set_gguf_parameters(self):
 7273        super().set_gguf_parameters()
 7274        self.gguf_writer.add_layer_norm_eps(1e-5)
 7275        clip_qkv = self.hparams.get("clip_qkv")
 7276        if clip_qkv is not None:
 7277            self.gguf_writer.add_clamp_kqv(clip_qkv)
 7278
 7279    # Same as super class, but permuting q_proj, k_proj
 7280    # Copied from: LlamaModel
 7281    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
 7282        n_head = self.hparams["num_attention_heads"]
 7283        n_kv_head = self.hparams.get("num_key_value_heads")
 7284
 7285        if name.endswith("q_proj.weight"):
 7286            data_torch = LlamaModel.permute(data_torch, n_head, n_head)
 7287        if name.endswith("k_proj.weight"):
 7288            data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head)
 7289
 7290        yield from super().modify_tensors(data_torch, name, bid)
 7291
 7292
 7293@ModelBase.register("SeedOssForCausalLM")
 7294class SeedOssModel(TextModel):
 7295    model_arch = gguf.MODEL_ARCH.SEED_OSS
 7296
 7297
 7298@ModelBase.register("Olmo2ForCausalLM")
 7299@ModelBase.register("Olmo3ForCausalLM")
 7300class Olmo2Model(TextModel):
 7301    model_arch = gguf.MODEL_ARCH.OLMO2
 7302
 7303    def set_gguf_parameters(self):
 7304        super().set_gguf_parameters()
 7305
 7306        if "sliding_window" in self.hparams:
 7307            self.gguf_writer.add_sliding_window(self.hparams["sliding_window"])
 7308
 7309            sliding_window_pattern = []
 7310            if "layer_types" in self.hparams:
 7311                sliding_window_pattern = [t == "sliding_attention" for t in self.hparams["layer_types"]]
 7312            else:
 7313                # Olmo2 does not use sliding window attention.
 7314                # Olmo3 defaults to using sliding window for all layers except every 4th.
 7315                for i in range(self.hparams["num_hidden_layers"]):
 7316                    sliding_window_pattern.append((i + 1) % 4 != 0)
 7317
 7318            self.gguf_writer.add_sliding_window_pattern(sliding_window_pattern)
 7319
 7320
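# Sketch of the default Olmo3 sliding-window pattern written above when the config has
# no explicit layer_types: every layer uses sliding-window attention except each 4th one.
def _olmo3_swa_pattern_sketch(n_layer: int) -> list[bool]:
    return [(i + 1) % 4 != 0 for i in range(n_layer)]

# e.g. _olmo3_swa_pattern_sketch(8) -> [True, True, True, False, True, True, True, False]
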
 7321@ModelBase.register("OlmoeForCausalLM")
 7322class OlmoeModel(TextModel):
 7323    model_arch = gguf.MODEL_ARCH.OLMOE
 7324
 7325    def set_gguf_parameters(self):
 7326        super().set_gguf_parameters()
 7327        self.gguf_writer.add_layer_norm_rms_eps(1e-5)
 7328        if (n_experts := self.hparams.get("num_experts")) is not None:
 7329            self.gguf_writer.add_expert_count(n_experts)
 7330
 7331    _experts: list[dict[str, Tensor]] | None = None
 7332
 7333    # Copied from: Qwen2MoeModel
 7334    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
 7335        # process the experts separately
 7336        if name.find("experts") != -1:
 7337            n_experts = self.hparams["num_experts"]
 7338            assert bid is not None
 7339
 7340            if self._experts is None:
 7341                self._experts = [{} for _ in range(self.block_count)]
 7342
 7343            self._experts[bid][name] = data_torch
 7344
 7345            if len(self._experts[bid]) >= n_experts * 3:
 7346                # merge the experts into a single 3d tensor
 7347                for w_name in ["down_proj", "gate_proj", "up_proj"]:
 7348                    datas: list[Tensor] = []
 7349
 7350                    for xid in range(n_experts):
 7351                        ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight"
 7352                        datas.append(self._experts[bid][ename])
 7353                        del self._experts[bid][ename]
 7354
 7355                    data_torch = torch.stack(datas, dim=0)
 7356
 7357                    merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight"
 7358
 7359                    yield from super().modify_tensors(data_torch, merged_name, bid)
 7360                return
 7361            else:
 7362                return
 7363
 7364        yield from super().modify_tensors(data_torch, name, bid)
 7365
 7366    # Copied from: Qwen2MoeModel
 7367    def prepare_tensors(self):
 7368        super().prepare_tensors()
 7369
 7370        if self._experts is not None:
 7371            # flatten `list[dict[str, Tensor]]` into `list[str]`
 7372            experts = [k for d in self._experts for k in d.keys()]
 7373            if len(experts) > 0:
 7374                raise ValueError(f"Unprocessed experts: {experts}")
 7375
 7376
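# Minimal sketch of the expert merge used above (and by several other MoE converters in
# this file): the n_expert separate 2-D projection weights become one 3-D tensor with a
# leading expert dimension, the layout llama.cpp expects for the merged expert tensors.
def _merge_experts_sketch(per_expert: list[Tensor]) -> Tensor:
    return torch.stack(per_expert, dim=0)  # (n_expert, rows, cols)
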
 7377@ModelBase.register("JinaBertModel", "JinaBertForMaskedLM")
 7378class JinaBertV2Model(BertModel):
 7379    model_arch = gguf.MODEL_ARCH.JINA_BERT_V2
 7380
 7381    def set_vocab(self):
 7382        tokenizer_class = 'BertTokenizer'
 7383        with open(self.dir_model / "tokenizer_config.json", "r", encoding="utf-8") as f:
 7384            tokenizer_class = json.load(f)['tokenizer_class']
 7385
 7386        if tokenizer_class == 'BertTokenizer':
 7387            super().set_vocab()
 7388        elif tokenizer_class == 'RobertaTokenizer':
 7389            self._set_vocab_gpt2()
 7390            self.gguf_writer.add_token_type_count(2)
 7391        else:
 7392            raise NotImplementedError(f'Tokenizer {tokenizer_class} is not supported for JinaBertModel')
 7393
 7394
 7395@ModelBase.register("OpenELMForCausalLM")
 7396class OpenELMModel(TextModel):
 7397    model_arch = gguf.MODEL_ARCH.OPENELM
 7398
 7399    @staticmethod
 7400    def _make_divisible(v: float | int, divisor: int) -> int:
 7401        # ref: https://huggingface.co/apple/OpenELM-270M-Instruct/blob/eb111ff2e6724348e5b905984063d4064d4bc579/configuration_openelm.py#L34-L38
 7402        new_v = max(divisor, int(v + divisor / 2) // divisor * divisor)
 7403        # Make sure that rounding down does not reduce the value by more than 10%.
 7404        if new_v < 0.9 * v:
 7405            new_v += divisor
 7406        return new_v
 7407
 7408    def __init__(self, *args, **kwargs):
 7409        super().__init__(*args, **kwargs)
 7410
 7411        ffn_multipliers: list[float] = self.hparams["ffn_multipliers"]
 7412        ffn_dim_divisor: int = self.hparams["ffn_dim_divisor"]
 7413        self._n_embd: int = self.hparams["model_dim"]
 7414        self._num_kv_heads: list[int] = self.hparams["num_kv_heads"]
 7415        self._num_query_heads: list[int] = self.hparams["num_query_heads"]
 7416        self._ffn_dims: list[int] = [
 7417            OpenELMModel._make_divisible(multiplier * self._n_embd, ffn_dim_divisor)
 7418            for multiplier in ffn_multipliers
 7419        ]
 7420        assert isinstance(self._num_kv_heads, list) and isinstance(self._num_kv_heads[0], int)
 7421        assert isinstance(self._num_query_heads, list) and isinstance(self._num_query_heads[0], int)
 7422
 7423    # Uses the tokenizer from meta-llama/Llama-2-7b-hf
 7424    def set_vocab(self):
 7425        try:
 7426            self._set_vocab_sentencepiece()
 7427        except FileNotFoundError:
 7428            self._set_vocab_builtin("llama-spm", self.hparams["vocab_size"])
 7429
 7430    def set_gguf_parameters(self):
 7431        n_embd = self._n_embd
 7432        head_dim = self.hparams["head_dim"]
 7433        rot_pct = 1.0
 7434        assert self.block_count == len(self._num_kv_heads)
 7435        assert self.block_count == len(self._num_query_heads)
 7436        assert self.block_count == len(self._ffn_dims)
 7437
 7438        self.gguf_writer.add_block_count(self.block_count)
 7439        self.gguf_writer.add_context_length(self.hparams["max_context_length"])
 7440        self.gguf_writer.add_embedding_length(n_embd)
 7441        self.gguf_writer.add_feed_forward_length(self._ffn_dims)
 7442        self.gguf_writer.add_head_count(self._num_query_heads)
 7443        self.gguf_writer.add_head_count_kv(self._num_kv_heads)
 7444        self.gguf_writer.add_rope_freq_base(self.hparams["rope_freq_constant"])
 7445        # https://huggingface.co/apple/OpenELM-270M-Instruct/blob/c401df2/modeling_openelm.py#L30
 7446        self.gguf_writer.add_layer_norm_rms_eps(1e-6)
 7447        self.gguf_writer.add_rope_dimension_count(int(rot_pct * head_dim))
 7448        self.gguf_writer.add_key_length(head_dim)
 7449        self.gguf_writer.add_value_length(head_dim)
 7450        self.gguf_writer.add_file_type(self.ftype)
 7451
 7452    def find_hparam(self, keys: Iterable[str], optional: bool = False) -> Any:
 7453        if "n_layers" in keys:
 7454            return self.hparams["num_transformer_layers"]
 7455
 7456        return super().find_hparam(keys, optional)
 7457
 7458    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
 7459
 7460        # split ff
 7461        if bid is not None and name == f"transformer.layers.{bid}.ffn.proj_1.weight":
 7462            ff_dim = self._ffn_dims[bid]
 7463            yield (self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE, bid), data_torch[:ff_dim])
 7464            yield (self.format_tensor_name(gguf.MODEL_TENSOR.FFN_UP, bid), data_torch[ff_dim:])
 7465            return
 7466
 7467        yield (self.map_tensor_name(name), data_torch)
 7468
 7469
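# Worked examples (assumed values) of OpenELMModel._make_divisible above, which rounds an
# FFN width to the nearest multiple of the divisor and then bumps it up if rounding down
# would lose more than 10% of the requested width:
#   _make_divisible(640, 256) -> 768   (ties round up to the next multiple)
#   _make_divisible(300, 256) -> 512   (256 would be a >10% drop, so one divisor is added)
def _make_divisible_check_sketch() -> None:
    assert OpenELMModel._make_divisible(640, 256) == 768
    assert OpenELMModel._make_divisible(300, 256) == 512
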
 7470@ModelBase.register("ArcticForCausalLM")
 7471class ArcticModel(TextModel):
 7472    model_arch = gguf.MODEL_ARCH.ARCTIC
 7473
 7474    def set_vocab(self):
 7475        # The reason for using a custom implementation here is that the
 7476        # snowflake-arctic-instruct model redefined tokens 31998 and 31999 from
 7477        # tokenizer.model and used them as BOS and EOS instead of adding new tokens.
 7478        from sentencepiece import SentencePieceProcessor
 7479
 7480        tokenizer_path = self.dir_model / 'tokenizer.model'
 7481
 7482        if not tokenizer_path.is_file():
 7483            logger.error(f'Error: Missing {tokenizer_path}')
 7484            sys.exit(1)
 7485
 7486        # Read the whole vocabulary from the tokenizer.model file
 7487        tokenizer = SentencePieceProcessor()
 7488        tokenizer.LoadFromFile(str(tokenizer_path))
 7489
 7490        vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size())
 7491
 7492        tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
 7493        scores: list[float] = [-10000.0] * vocab_size
 7494        toktypes: list[int] = [SentencePieceTokenTypes.UNUSED] * vocab_size
 7495
 7496        for token_id in range(tokenizer.vocab_size()):
 7497
 7498            piece = tokenizer.IdToPiece(token_id)
 7499            text = piece.encode("utf-8")
 7500            score = tokenizer.GetScore(token_id)
 7501
 7502            toktype = SentencePieceTokenTypes.NORMAL
 7503            if tokenizer.IsUnknown(token_id):
 7504                toktype = SentencePieceTokenTypes.UNKNOWN
 7505            elif tokenizer.IsControl(token_id):
 7506                toktype = SentencePieceTokenTypes.CONTROL
 7507            elif tokenizer.IsUnused(token_id):
 7508                toktype = SentencePieceTokenTypes.UNUSED
 7509            elif tokenizer.IsByte(token_id):
 7510                toktype = SentencePieceTokenTypes.BYTE
 7511
 7512            tokens[token_id] = text
 7513            scores[token_id] = score
 7514            toktypes[token_id] = toktype
 7515
 7516        # Use the added_tokens_decoder field from tokenizer_config.json as the source
 7517        # of information about added/redefined tokens and modify them accordingly.
 7518        tokenizer_config_file = self.dir_model / 'tokenizer_config.json'
 7519        if tokenizer_config_file.is_file():
 7520            with open(tokenizer_config_file, "r", encoding="utf-8") as f:
 7521                tokenizer_config_json = json.load(f)
 7522
 7523                if "added_tokens_decoder" in tokenizer_config_json:
 7524                    added_tokens_decoder = tokenizer_config_json["added_tokens_decoder"]
 7525                    for token_id, token_json in added_tokens_decoder.items():
 7526                        token_id = int(token_id)
 7527                        if token_id >= vocab_size:
 7528                            logger.debug(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}')
 7529                            continue
 7530
 7531                        token_content = token_json["content"]
 7532                        token_type = SentencePieceTokenTypes.USER_DEFINED
 7533                        token_score = -10000.0
 7534
 7535                        # Map unk_token to UNKNOWN, other special tokens to CONTROL
 7536                        # Set the score to 0.0 as in the original tokenizer.model
 7537                        if ("special" in token_json) and token_json["special"]:
 7538                            if token_content == tokenizer_config_json["unk_token"]:
 7539                                token_type = SentencePieceTokenTypes.UNKNOWN
 7540                            else:
 7541                                token_type = SentencePieceTokenTypes.CONTROL
 7542                            token_score = 0.0
 7543
 7544                        logger.info(f"Setting added token {token_id} to '{token_content}' (type: {token_type}, score: {token_score:.2f})")
 7545                        tokens[token_id] = token_content.encode("utf-8")
 7546                        toktypes[token_id] = token_type
 7547                        scores[token_id] = token_score
 7548
 7549        self.gguf_writer.add_tokenizer_model("llama")
 7550        self.gguf_writer.add_tokenizer_pre("default")
 7551        self.gguf_writer.add_token_list(tokens)
 7552        self.gguf_writer.add_token_scores(scores)
 7553        self.gguf_writer.add_token_types(toktypes)
 7554
 7555        special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
 7556        special_vocab.add_to_gguf(self.gguf_writer)
 7557
 7558    def set_gguf_parameters(self):
 7559        super().set_gguf_parameters()
 7560        hparams = self.hparams
 7561        self.gguf_writer.add_vocab_size(hparams["vocab_size"])
 7562        self.gguf_writer.add_rope_dimension_count(hparams["hidden_size"] // hparams["num_attention_heads"])
 7563
 7564    _experts: list[dict[str, Tensor]] | None = None
 7565
 7566    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
 7567        n_head = self.hparams["num_attention_heads"]
 7568        n_kv_head = self.hparams.get("num_key_value_heads")
 7569
 7570        if name.endswith("q_proj.weight"):
 7571            data_torch = LlamaModel.permute(data_torch, n_head, n_head)
 7572        if name.endswith("k_proj.weight"):
 7573            data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head)
 7574
 7575        # process the experts separately
 7576        if name.find("block_sparse_moe.experts") != -1:
 7577            n_experts = self.hparams["num_local_experts"]
 7578
 7579            assert bid is not None
 7580
 7581            if self._experts is None:
 7582                self._experts = [{} for _ in range(self.block_count)]
 7583
 7584            self._experts[bid][name] = data_torch
 7585
 7586            if len(self._experts[bid]) >= n_experts * 3:
 7587                # merge the experts into a single 3d tensor
 7588                for wid in ["w1", "w2", "w3"]:
 7589                    datas: list[Tensor] = []
 7590
 7591                    for xid in range(n_experts):
 7592                        ename = f"model.layers.{bid}.block_sparse_moe.experts.{xid}.{wid}.weight"
 7593                        datas.append(self._experts[bid][ename])
 7594                        del self._experts[bid][ename]
 7595
 7596                    data_torch = torch.stack(datas, dim=0)
 7597
 7598                    merged_name = f"layers.{bid}.feed_forward.experts.{wid}.weight"
 7599
 7600                    yield from super().modify_tensors(data_torch, merged_name, bid)
 7601                return
 7602            else:
 7603                return
 7604
 7605        yield from super().modify_tensors(data_torch, name, bid)
 7606
 7607    def prepare_tensors(self):
 7608        super().prepare_tensors()
 7609
 7610        if self._experts is not None:
 7611            # flatten `list[dict[str, Tensor]]` into `list[str]`
 7612            experts = [k for d in self._experts for k in d.keys()]
 7613            if len(experts) > 0:
 7614                raise ValueError(f"Unprocessed experts: {experts}")
 7615
 7616
 7617@ModelBase.register("DeepseekForCausalLM")
 7618class DeepseekModel(TextModel):
 7619    model_arch = gguf.MODEL_ARCH.DEEPSEEK
 7620
 7621    def set_vocab(self):
 7622        try:
 7623            self._set_vocab_sentencepiece()
 7624        except FileNotFoundError:
 7625            self._set_vocab_gpt2()
 7626
 7627    def set_gguf_parameters(self):
 7628        super().set_gguf_parameters()
 7629        hparams = self.hparams
 7630        if (rope_dim := hparams.get("head_dim")) is None:
 7631            rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]
 7632
 7633        self.gguf_writer.add_rope_dimension_count(rope_dim)
 7634        self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
 7635        self.gguf_writer.add_leading_dense_block_count(hparams["first_k_dense_replace"])
 7636        self.gguf_writer.add_vocab_size(hparams["vocab_size"])
 7637        self.gguf_writer.add_expert_feed_forward_length(hparams["moe_intermediate_size"])
 7638        self.gguf_writer.add_expert_weights_scale(1.0)
 7639        self.gguf_writer.add_expert_count(hparams["n_routed_experts"])
 7640        self.gguf_writer.add_expert_shared_count(hparams["n_shared_experts"])
 7641
 7642    _experts: list[dict[str, Tensor]] | None = None
 7643
 7644    @staticmethod
 7645    def permute(weights: Tensor, n_head: int, n_head_kv: int | None):
 7646        if n_head_kv is not None and n_head != n_head_kv:
 7647            n_head = n_head_kv
 7648        return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
 7649                .swapaxes(1, 2)
 7650                .reshape(weights.shape))
 7651
 7652    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
 7653        n_head = self.hparams["num_attention_heads"]
 7654        n_kv_head = self.hparams.get("num_key_value_heads")
 7655
 7656        if name.endswith(("q_proj.weight", "q_proj.bias")):
 7657            data_torch = DeepseekModel.permute(data_torch, n_head, n_head)
 7658        if name.endswith(("k_proj.weight", "k_proj.bias")):
 7659            data_torch = DeepseekModel.permute(data_torch, n_head, n_kv_head)
 7660
 7661        # process the experts separately
 7662        if name.find("mlp.experts") != -1:
 7663            n_experts = self.hparams["n_routed_experts"]
 7664            assert bid is not None
 7665
 7666            if self._experts is None:
 7667                self._experts = [{} for _ in range(self.block_count)]
 7668
 7669            self._experts[bid][name] = data_torch
 7670
 7671            if len(self._experts[bid]) >= n_experts * 3:
 7672                # merge the experts into a single 3d tensor
 7673                for w_name in ["down_proj", "gate_proj", "up_proj"]:
 7674                    datas: list[Tensor] = []
 7675
 7676                    for xid in range(n_experts):
 7677                        ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight"
 7678                        datas.append(self._experts[bid][ename])
 7679                        del self._experts[bid][ename]
 7680
 7681                    data_torch = torch.stack(datas, dim=0)
 7682
 7683                    merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight"
 7684
 7685                    yield from super().modify_tensors(data_torch, merged_name, bid)
 7686                return
 7687            else:
 7688                return
 7689
 7690        yield from super().modify_tensors(data_torch, name, bid)
 7691
 7692    def prepare_tensors(self):
 7693        super().prepare_tensors()
 7694
 7695        if self._experts is not None:
 7696            # flatten `list[dict[str, Tensor]]` into `list[str]`
 7697            experts = [k for d in self._experts for k in d.keys()]
 7698            if len(experts) > 0:
 7699                raise ValueError(f"Unprocessed experts: {experts}")
 7700
 7701
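# Tiny numeric illustration (hypothetical 1-head, head_dim=4 case) of the q/k permute
# used by DeepseekModel (and the Llama-style converters): the per-head rows
# [0, 1, 2, 3] are regrouped into [0, 2, 1, 3], converting between the two rotary
# row layouts while leaving the tensor shape unchanged.
def _permute_qk_sketch() -> Tensor:
    w = torch.arange(4).reshape(4, 1)                        # 4 rows, 1 column
    return DeepseekModel.permute(w, n_head=1, n_head_kv=1)   # rows reordered to [0, 2, 1, 3]
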
 7702@ModelBase.register(
 7703    "DeepseekV2ForCausalLM",
 7704    "DeepseekV3ForCausalLM",
 7705    "KimiVLForConditionalGeneration",
 7706    "KimiK25ForConditionalGeneration",
 7707    "YoutuForCausalLM",
 7708    "YoutuVLForConditionalGeneration",
 7709)
 7710class DeepseekV2Model(TextModel):
 7711    model_arch = gguf.MODEL_ARCH.DEEPSEEK2
 7712
 7713    def set_vocab(self):
 7714        try:
 7715            self._set_vocab_gpt2()
 7716            return
 7717        except Exception:
 7718            pass
 7719
 7720        from transformers import AutoTokenizer
 7721        tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True)
 7722        tokpre = self.get_vocab_base_pre(tokenizer)
 7723
 7724        if tokpre == "kimi-k2":
 7725            # Build the merges list using an approach similar to HunYuanMoE
 7726            merges = []
 7727            vocab = {}
 7728            mergeable_ranks = tokenizer.model._mergeable_ranks
 7729            for token, rank in mergeable_ranks.items():
 7730                vocab[QwenModel.token_bytes_to_string(token)] = rank
 7731                if len(token) == 1:
 7732                    continue
 7733                merged = QwenModel.bpe(mergeable_ranks, token, max_rank=rank)
 7734                if len(merged) == 2:
 7735                    merges.append(' '.join(map(QwenModel.token_bytes_to_string, merged)))
 7736
 7737            # Build token list
 7738            vocab_size = self.hparams["vocab_size"]
 7739            special_tokens = tokenizer.special_tokens
 7740            reverse_vocab = {id_ : encoded_tok for encoded_tok, id_ in {**vocab, **special_tokens}.items()}
 7741            tokens: list[str] = []
 7742            toktypes: list[int] = []
 7743
 7744            for i in range(vocab_size):
 7745                if i not in reverse_vocab:
 7746                    tokens.append(f"[PAD{i}]")
 7747                    toktypes.append(gguf.TokenType.UNUSED)
 7748                else:
 7749                    token = reverse_vocab[i]
 7750                    tokens.append(token)
 7751                    if i in special_tokens.values():
 7752                        toktypes.append(gguf.TokenType.CONTROL)
 7753                    else:
 7754                        toktypes.append(gguf.TokenType.NORMAL)
 7755
 7756            self.gguf_writer.add_tokenizer_model("gpt2")
 7757            self.gguf_writer.add_tokenizer_pre(tokpre)
 7758            self.gguf_writer.add_token_list(tokens)
 7759            self.gguf_writer.add_token_types(toktypes)
 7760            self.gguf_writer.add_token_merges(merges)
 7761
 7762            special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False)
 7763            special_vocab.add_to_gguf(self.gguf_writer)
 7764        else:
 7765            raise NotImplementedError(f"Deepseek pre-tokenizer {tokpre!r} is not supported yet!")
 7766
 7767    def set_gguf_parameters(self):
 7768
 7769        # note: deepseek2 with MLA is converted into MQA (i.e. GQA with 1 group)
 7770        self.hparams["num_key_value_heads"] = 1
 7771
 7772        super().set_gguf_parameters()
 7773        hparams = self.hparams
 7774
 7775        # first_k_dense_replace: number of leading layers using dense FFN instead of MoE
 7776        # For non-MoE models (like Youtu), set to n_layer to use dense FFN for all layers
 7777        # For MoE models (like DeepSeek-V2), this is the number of leading non-MoE layers
 7778        has_moe = hparams.get("n_routed_experts") is not None
 7779        first_k_dense_replace = hparams.get("first_k_dense_replace")
 7780        if first_k_dense_replace is None:
 7781            # Default: if no MoE, all layers are dense; if MoE, none are dense
 7782            first_k_dense_replace = hparams["num_hidden_layers"] if not has_moe else 0
 7783        self.gguf_writer.add_leading_dense_block_count(first_k_dense_replace)
 7784        self.gguf_writer.add_vocab_size(hparams["vocab_size"])
 7785        if "q_lora_rank" in hparams and hparams["q_lora_rank"] is not None:
 7786            self.gguf_writer.add_q_lora_rank(hparams["q_lora_rank"])
 7787        self.gguf_writer.add_kv_lora_rank(hparams["kv_lora_rank"])
 7788
 7789        # note: deepseek2 using MLA converts into MQA with larger heads, then decompresses to MHA
 7790        self.gguf_writer.add_key_length(hparams["kv_lora_rank"] + hparams["qk_rope_head_dim"])
 7791        self.gguf_writer.add_value_length(hparams["kv_lora_rank"])
 7792        self.gguf_writer.add_key_length_mla(hparams["qk_nope_head_dim"] + hparams["qk_rope_head_dim"])
 7793        self.gguf_writer.add_value_length_mla(hparams["v_head_dim"])
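        # A worked example with typical DeepSeek-V2-style dimensions (illustrative values, not read from
        # this checkpoint): kv_lora_rank=512 and qk_rope_head_dim=64 give a compressed key length of 576
        # and a value length of 512, while the *_mla variants use the per-head qk_nope/v head dims instead.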
 7794
 7795        # MoE parameters (required by C++ code for DEEPSEEK2 arch)
 7796        # For non-MoE models like Youtu, use intermediate_size as expert_feed_forward_length
 7797        moe_intermediate_size = self.find_hparam(["moe_intermediate_size", "intermediate_size"], optional=False)
 7798        self.gguf_writer.add_expert_feed_forward_length(moe_intermediate_size)
 7799
 7800        if (n_routed_experts := hparams.get("n_routed_experts")) is not None:
 7801            self.gguf_writer.add_expert_count(n_routed_experts)
 7802
 7803        # expert_shared_count is required by C++ code, default to 0 for non-MoE models
 7804        n_shared_experts = hparams.get("n_shared_experts", 0)
 7805        self.gguf_writer.add_expert_shared_count(n_shared_experts)
 7806
 7807        # When not set, C++ code will use scale_w = false to skip the no-op scaling
 7808        if (routed_scaling_factor := hparams.get("routed_scaling_factor")) is not None:
 7809            self.gguf_writer.add_expert_weights_scale(routed_scaling_factor)
 7810
 7811        if (norm_topk_prob := hparams.get("norm_topk_prob")) is not None and norm_topk_prob:
 7812            self.gguf_writer.add_expert_weights_norm(norm_topk_prob)
 7813
 7814        self.gguf_writer.add_rope_dimension_count(hparams["qk_rope_head_dim"])
 7815
 7816        if (rope_mscale_all := self.rope_parameters.get("mscale_all_dim")) is not None:
 7817            # [TAG_DEEPSEEK2_YARN_LOG_MUL_FIX]
 7818            # note: for legacy reasons, this is not consistent with the other usages of self.gguf_writer.add_rope_scaling_yarn_log_mul
 7819            # ref https://github.com/ggml-org/llama.cpp/pull/17945
 7820            self.gguf_writer.add_rope_scaling_yarn_log_mul(0.1 * rope_mscale_all)
 7821
 7822    _experts: list[dict[str, Tensor]] | None = None
 7823
 7824    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
 7825        # skip vision tensors and remove "language_model." for Kimi-VL and Kimi-K2.5
 7826        if "vision_tower" in name or "multi_modal_projector" in name or "mm_projector" in name:
 7827            return
 7828        if name.startswith("siglip2.") or name.startswith("merger."):
 7829            return
 7830        if name.startswith("language_model."):
 7831            name = name.replace("language_model.", "")
 7832
 7833        # skip lm_head.weight if tie_word_embeddings is True
 7834        if self.hparams.get("tie_word_embeddings", False):
 7835            if name == "lm_head.weight" or name == "model.lm_head.weight":
 7836                logger.info("Skipping tied output layer 'lm_head.weight' (will use token_embd.weight)")
 7837                return
 7838
 7839        # rename e_score_correction_bias tensors
 7840        if name.endswith("e_score_correction_bias"):
 7841            name = name.replace("e_score_correction_bias", "e_score_correction.bias")
 7842
 7843        # skip Multi-Token Prediction (MTP) layers
 7844        block_count = self.hparams["num_hidden_layers"]
 7845        match = re.match(r"model\.layers\.(\d+)", name)
 7846        if match and int(match.group(1)) >= block_count:
 7847            return
 7848
 7849        # process the experts separately
 7850        if name.find("mlp.experts") != -1:
 7851            n_experts = self.hparams["n_routed_experts"]
 7852            assert bid is not None
 7853
 7854            if self._experts is None:
 7855                self._experts = [{} for _ in range(self.block_count)]
 7856
 7857            self._experts[bid][name] = data_torch
 7858
 7859            if len(self._experts[bid]) >= n_experts * 3:
 7860                # merge the experts into a single 3d tensor
 7861                for w_name in ["down_proj", "gate_proj", "up_proj"]:
 7862                    datas: list[Tensor] = []
 7863
 7864                    for xid in range(n_experts):
 7865                        ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight"
 7866                        datas.append(self._experts[bid][ename])
 7867                        del self._experts[bid][ename]
 7868
 7869                    data_torch = torch.stack(datas, dim=0)
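                    # datas holds one 2D weight matrix per expert; stacking along a new leading dim
                    # yields the (n_expert, rows, cols) 3D layout used for the merged GGUF MoE tensors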
 7870
 7871                    merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight"
 7872
 7873                    yield from super().modify_tensors(data_torch, merged_name, bid)
 7874                return
 7875            else:
 7876                return
 7877
 7878        # note: MLA with the absorption optimization needs kv_b_proj split into k_b_proj and v_b_proj, with k_b_proj transposed
 7879        if name.endswith("kv_b_proj.weight"):
 7880            name_kb = name.replace("kv_b_proj", "k_b_proj")
 7881            name_vb = name.replace("kv_b_proj", "v_b_proj")
 7882
 7883            n_head_kv = self.hparams["num_key_value_heads"]
 7884            v_head_dim = self.hparams["v_head_dim"]
 7885            qk_nope_head_dim = self.hparams["qk_nope_head_dim"]
 7886
 7887            assert data_torch.shape[0] == n_head_kv * (v_head_dim + qk_nope_head_dim)
 7888
 7889            kv_b = data_torch.view(n_head_kv, v_head_dim + qk_nope_head_dim, data_torch.shape[-1])
 7890            k_b, v_b = torch.split(kv_b, [qk_nope_head_dim, v_head_dim], dim=1)
 7891            k_b = k_b.transpose(1, 2)
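            # shapes after the split: kv_b is (n_head_kv, qk_nope_head_dim + v_head_dim, data_torch.shape[-1]),
            # k_b is transposed to (n_head_kv, data_torch.shape[-1], qk_nope_head_dim) and
            # v_b keeps (n_head_kv, v_head_dim, data_torch.shape[-1])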
 7892
 7893            yield from super().modify_tensors(k_b, name_kb, bid)
 7894            yield from super().modify_tensors(v_b, name_vb, bid)
 7895            return
 7896
 7897        yield from super().modify_tensors(data_torch, name, bid)
 7898
 7899    def prepare_tensors(self):
 7900        super().prepare_tensors()
 7901
 7902        if self._experts is not None:
 7903            # flatten `list[dict[str, Tensor]]` into `list[str]`
 7904            experts = [k for d in self._experts for k in d.keys()]
 7905            if len(experts) > 0:
 7906                raise ValueError(f"Unprocessed experts: {experts}")
 7907
 7908
 7909@ModelBase.register("MiniMaxM2ForCausalLM")
 7910class MiniMaxM2Model(TextModel):
 7911    model_arch = gguf.MODEL_ARCH.MINIMAXM2
 7912    _experts_cache: dict[int, dict[str, Tensor]] = {}
 7913
 7914    def __init__(self, *args, **kwargs):
 7915        super().__init__(*args, **kwargs)
 7916        self.hparams["num_experts"] = self.hparams["num_local_experts"]
 7917
 7918    def set_gguf_parameters(self):
 7919        super().set_gguf_parameters()
 7920
 7921        self.gguf_writer.add_expert_feed_forward_length(self.find_hparam(["intermediate_size"]))
 7922        self.gguf_writer.add_rope_dimension_count(self.find_hparam(["rotary_dim"]))
 7923
 7924    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None):
 7925        if name.endswith("e_score_correction_bias"):
 7926            name = name.replace("e_score_correction_bias", "e_score_correction.bias")
 7927
 7928        # merge expert weights
 7929        if 'experts' in name:
 7930            n_experts = self.hparams["num_experts"]
 7931            assert bid is not None
 7932
 7933            expert_cache = self._experts_cache.setdefault(bid, {})
 7934            expert_cache[name] = data_torch
 7935            expert_weights = ["w1", "w2", "w3"]
 7936
 7937            # not enough expert weights to merge
 7938            if len(expert_cache) < n_experts * len(expert_weights):
 7939                return
 7940
 7941            for w_name in expert_weights:
 7942                datas: list[Tensor] = []
 7943
 7944                for xid in range(n_experts):
 7945                    ename = f"model.layers.{bid}.block_sparse_moe.experts.{xid}.{w_name}.weight"
 7946                    datas.append(expert_cache[ename])
 7947                    del expert_cache[ename]
 7948
 7949                data_torch = torch.stack(datas, dim=0)
 7950                merged_name = f"model.layers.{bid}.block_sparse_moe.experts.{w_name}.weight"
 7951                new_name = self.map_tensor_name(merged_name)
 7952                yield from super().modify_tensors(data_torch, new_name, bid)
 7953
 7954            del self._experts_cache[bid]
 7955            return
 7956
 7957        yield from super().modify_tensors(data_torch, name, bid)
 7958
 7959
 7960@ModelBase.register("MiMoV2FlashForCausalLM")
 7961class MimoV2Model(TextModel):
 7962    model_arch = gguf.MODEL_ARCH.MIMO2
 7963
 7964    def set_gguf_parameters(self):
 7965        super().set_gguf_parameters()
 7966
 7967        assert self.hparams["swa_head_dim"] == self.hparams["head_dim"]
 7968        assert self.hparams["swa_num_attention_heads"] == self.hparams["num_attention_heads"]
 7969        assert self.hparams["swa_v_head_dim"] == self.hparams["v_head_dim"]
 7970        assert self.hparams["topk_method"] == "noaux_tc"
 7971
 7972        n_head_kv = self.hparams["num_key_value_heads"]
 7973        n_head_kv_swa = self.hparams["swa_num_key_value_heads"]
 7974        n_head_kv_arr = [n_head_kv_swa if use_swa == 1 else n_head_kv for use_swa in self.hparams["hybrid_layer_pattern"]]
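        # e.g. a hypothetical hybrid_layer_pattern of [1, 1, 0, 1] would produce
        # [n_head_kv_swa, n_head_kv_swa, n_head_kv, n_head_kv_swa], i.e. one KV head count per layer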
 7975        self.gguf_writer.add_head_count_kv(n_head_kv_arr)
 7976
 7977        self.gguf_writer.add_sliding_window(self.hparams["sliding_window"])
 7978        self.gguf_writer.add_sliding_window_pattern(self.hparams["hybrid_layer_pattern"])
 7979        self.gguf_writer.add_value_length(self.hparams["v_head_dim"])
 7980        self.gguf_writer.add_expert_count(self.hparams["n_routed_experts"])
 7981        self.gguf_writer.add_expert_feed_forward_length(self.hparams["moe_intermediate_size"])
 7982
 7983        rope_dim = int(self.hparams["head_dim"] * self.hparams["partial_rotary_factor"])
 7984        self.gguf_writer.add_rope_dimension_count(rope_dim)
 7985
 7986        self.gguf_writer.add_layer_norm_rms_eps(self.hparams.get("layernorm_epsilon", 1e-5))
 7987
 7988    _experts: list[dict[str, Tensor]] | None = None
 7989
 7990    def modify_tensors(self, data_torch, name, bid):
 7991        if name.endswith("e_score_correction_bias"):
 7992            name = name.replace("e_score_correction_bias", "e_score_correction.bias")
 7993
 7994        if "attention_sink" in name and not name.endswith(".weight"):
 7995            name += ".weight"
 7996
 7997        # TODO: mimo v2 does not indicate the number of next-token-prediction layers, so we cannot handle them the same way as GLM4_MOE
 7998        if "model.mtp." in name:
 7999            return
 8000
 8001        # process the experts separately
 8002        if name.find("mlp.experts") != -1:
 8003            n_experts = self.hparams["n_routed_experts"]
 8004            assert bid is not None
 8005
 8006            if self._experts is None:
 8007                self._experts = [{} for _ in range(self.block_count)]
 8008
 8009            self._experts[bid][name] = data_torch
 8010
 8011            if len(self._experts[bid]) >= n_experts * 3:
 8012                # merge the experts into a single 3d tensor
 8013                for w_name in ["gate_proj", "up_proj", "down_proj"]:
 8014                    datas: list[Tensor] = []
 8015
 8016                    for xid in range(n_experts):
 8017                        ename_to_retrieve = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight"
 8018                        datas.append(self._experts[bid][ename_to_retrieve])
 8019                        del self._experts[bid][ename_to_retrieve]
 8020
 8021                    data_torch = torch.stack(datas, dim=0)
 8022                    merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight"
 8023
 8024                    yield from super().modify_tensors(data_torch, merged_name, bid)
 8025                return
 8026            else:
 8027                return
 8028        yield from super().modify_tensors(data_torch, name, bid)
 8029
 8030    def prepare_tensors(self):
 8031        super().prepare_tensors()
 8032
 8033        if self._experts is not None:
 8034            # flatten `list[dict[str, Tensor]]` into `list[str]`
 8035            experts = [k for d in self._experts for k in d.keys()]
 8036            if len(experts) > 0:
 8037                raise ValueError(f"Unprocessed experts: {experts}")
 8038
 8039
 8040@ModelBase.register("Step3p5ForCausalLM")
 8041class Step35Model(TextModel):
 8042    model_arch = gguf.MODEL_ARCH.STEP35
 8043
 8044    def set_gguf_parameters(self):
 8045        rope_theta = self.hparams.get("rope_theta")
 8046        if isinstance(rope_theta, list):
 8047            self.hparams["rope_theta"] = float(rope_theta[0])
 8048            self.hparams["local_rope_theta"] = float(rope_theta[1])
 8049            self.rope_parameters["rope_theta"] = self.hparams["rope_theta"]
 8050            self.rope_parameters["sliding_attention"] = {"rope_theta": self.hparams["local_rope_theta"]}
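            # e.g. a hypothetical rope_theta of [1000000.0, 10000.0] means theta=1e6 for full-attention
            # layers and theta=1e4 for sliding-window layers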
 8051
 8052        super().set_gguf_parameters()
 8053
 8054        layer_types = self.hparams.get("layer_types") or []
 8055        partial_rotary_factors = self.hparams.get("partial_rotary_factors") or []
 8056        attn_other = self.hparams.get("attention_other_setting") or {}
 8057
 8058        n_head_base = self.hparams["num_attention_heads"]
 8059        n_kv_base = self.hparams["num_attention_groups"]
 8060
 8061        n_head_swa = attn_other.get("num_attention_heads", n_head_base)
 8062        n_kv_swa = attn_other.get("num_attention_groups", n_kv_base)
 8063
 8064        layer_types = layer_types[: self.block_count]
 8065        partial_rotary_factors = partial_rotary_factors[: self.block_count]
 8066        assert [1.0 if lt == "sliding_attention" else 0.5 for lt in layer_types] == partial_rotary_factors
 8067        head_arr = [n_head_swa if lt == "sliding_attention" else n_head_base for lt in layer_types]
 8068        kv_arr = [n_kv_swa if lt == "sliding_attention" else n_kv_base for lt in layer_types]
 8069        swa_pat = [lt == "sliding_attention" for lt in layer_types]
 8070
 8071        self.gguf_writer.add_head_count(head_arr)
 8072        self.gguf_writer.add_head_count_kv(kv_arr)
 8073
 8074        self.gguf_writer.add_sliding_window(self.hparams["sliding_window"])
 8075        self.gguf_writer.add_sliding_window_pattern(swa_pat)
 8076
 8077        self.gguf_writer.add_value_length(self.hparams["head_dim"])
 8078
 8079        # MoE params
 8080        self.gguf_writer.add_expert_count(self.hparams["moe_num_experts"])
 8081        self.gguf_writer.add_expert_used_count(self.hparams["moe_top_k"])
 8082        self.gguf_writer.add_expert_feed_forward_length(self.hparams["moe_intermediate_size"])
 8083        self.gguf_writer.add_expert_shared_feed_forward_length(self.hparams["share_expert_dim"])
 8084
 8085        if (moe_router_scaling_factor := self.hparams.get("moe_router_scaling_factor")) is not None:
 8086            self.gguf_writer.add_expert_weights_scale(moe_router_scaling_factor)
 8087        if (norm_expert_weight := self.hparams.get("norm_expert_weight")) is not None:
 8088            self.gguf_writer.add_expert_weights_norm(norm_expert_weight)
 8089
 8090        # leading dense blocks
 8091        leading_dense = 0
 8092        moe_layers_enum = self.hparams.get("moe_layers_enum")
 8093        if isinstance(moe_layers_enum, str) and moe_layers_enum.strip():
 8094            moe_layers = sorted(int(i) for i in moe_layers_enum.strip().split(","))
 8095            if moe_layers:
 8096                leading_dense = max(0, moe_layers[0])
 8097        self.gguf_writer.add_leading_dense_block_count(leading_dense)
 8098        self.gguf_writer.add_moe_every_n_layers(int(self.hparams.get("moe_every_n_layer", 1)))
 8099
 8100        self.gguf_writer.add_layer_norm_rms_eps(self.hparams.get("rms_norm_eps", 1e-5))
 8101
 8102        # Optional per-layer SwiGLU clamps.
 8103        if (limits := self.hparams.get("swiglu_limits")) is not None:
 8104            limits_f = [0.0 if v is None else float(v) for v in limits[: self.block_count]]
 8105            self.gguf_writer.add_swiglu_clamp_exp(limits_f)
 8106        if (limits_shared := self.hparams.get("swiglu_limits_shared")) is not None:
 8107            limits_shared_f = [0.0 if v is None else float(v) for v in limits_shared[: self.block_count]]
 8108            self.gguf_writer.add_swiglu_clamp_shexp(limits_shared_f)
 8109
 8110    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None):
 8111        # remove mtp layers
 8112        if (m := re.match(r"model\.layers\.(\d+)\.", name)) is not None:
 8113            il = int(m.group(1))
 8114            n_main = int(self.hparams.get("num_hidden_layers", self.block_count))
 8115            if il >= n_main:
 8116                return
 8117        if name.endswith("norm.weight"):
 8118            data_torch += 1.0
 8119        # Map router bias (expert selection bias) to a GGUF bias tensor
 8120        if name.endswith(".moe.router_bias"):
 8121            name += ".bias"
 8122
 8123        if name.endswith((".self_attn.g_proj.weight", ".moe.gate.weight", ".moe.up_proj.weight", ".moe.gate_proj.weight", ".moe.down_proj.weight")):
 8124            data_torch = data_torch.squeeze().contiguous()
 8125
 8126        yield from super().modify_tensors(data_torch, name, bid)
 8127
 8128    def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
 8129        # Step35 can optionally use Llama-3 style RoPE scaling (HF: rope_scaling.rope_type == "llama3").
 8130        # llama.cpp represents this via a single extra tensor: "rope_freqs.weight" (aka MODEL_TENSOR.ROPE_FREQS).
 8131        rope_params = self.rope_parameters.get("full_attention", self.rope_parameters)
 8132        rope_type = rope_params.get("rope_type") or ""
 8133        if rope_type.lower() != "llama3":
 8134            return
 8135
 8136        # Step35 configs can carry per-layer rope_theta as a list; for llama3 rope factors we use the base value.
 8137        rope_theta = self.hparams.get("rope_theta", 10000.0)
 8138        if isinstance(rope_theta, list):
 8139            rope_theta = rope_theta[0]
 8140        base = float(rope_theta)
 8141        if (dim := self.hparams.get("head_dim")) is None:
 8142            dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
 8143        dim = int(dim)
 8144
 8145        freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))
 8146
 8147        factor = float(rope_params.get("factor", 8.0))
 8148        low_freq_factor = float(rope_params.get("low_freq_factor", 1.0))
 8149        high_freq_factor = float(rope_params.get("high_freq_factor", 4.0))
 8150        old_context_len = int(rope_params.get("original_max_position_embeddings", self.hparams.get("original_max_position_embeddings", 8192)))
 8151
 8152        low_freq_wavelen = old_context_len / low_freq_factor
 8153        high_freq_wavelen = old_context_len / high_freq_factor
 8154
 8155        rope_factors: list[float] = []
 8156        for freq in freqs:
 8157            wavelen = 2 * math.pi / float(freq)
 8158            if wavelen < high_freq_wavelen:
 8159                rope_factors.append(1.0)
 8160            elif wavelen > low_freq_wavelen:
 8161                rope_factors.append(factor)
 8162            else:
 8163                smooth = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor)
 8164                rope_factors.append(1.0 / ((1.0 - smooth) / factor + smooth))
 8165
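        # in summary (the same piecewise rule as Llama-3 style RoPE scaling): dims whose wavelength is
        # shorter than high_freq_wavelen keep factor 1.0, dims longer than low_freq_wavelen get the full
        # `factor`, and dims in between are smoothly interpolated between the two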
 8166        yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), torch.tensor(rope_factors, dtype=torch.float32))
 8167
 8168
 8169@ModelBase.register("PanguEmbeddedForCausalLM")
 8170class PanguEmbeddedModel(TextModel):
 8171    model_arch = gguf.MODEL_ARCH.PANGU_EMBED
 8172
 8173    def set_vocab(self):
 8174        self._set_vocab_sentencepiece()
 8175
 8176        tokenizer_config_file = self.dir_model / 'tokenizer_config.json'
 8177        if tokenizer_config_file.is_file():
 8178            with open(tokenizer_config_file, "r", encoding="utf-8") as f:
 8179                tokenizer_config_json = json.load(f)
 8180                if "add_prefix_space" in tokenizer_config_json:
 8181                    self.gguf_writer.add_add_space_prefix(tokenizer_config_json["add_prefix_space"])
 8182
 8183    def set_gguf_parameters(self):
 8184        super().set_gguf_parameters()
 8185        hparams = self.hparams
 8186        self.gguf_writer.add_vocab_size(hparams["vocab_size"])
 8187
 8188        # PanguEmbedded's config.json may not provide head_dim
 8189        if (rope_dim := hparams.get("head_dim")) is None:
 8190            rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]
 8191        self.gguf_writer.add_rope_dimension_count(rope_dim)
 8192
 8193        if hparams.get("head_dim") is None:
 8194            self.gguf_writer.add_key_length(rope_dim)
 8195            self.gguf_writer.add_value_length(rope_dim)
 8196
 8197    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
 8198        if name == "lm_head.weight":
 8199            if self.hparams.get("tie_word_embeddings", False):
 8200                logger.info("Skipping tied output layer 'lm_head.weight'")
 8201                return
 8202        yield from super().modify_tensors(data_torch, name, bid)
 8203
 8204
 8205@ModelBase.register("Dots1ForCausalLM")
 8206class Dots1Model(Qwen2MoeModel):
 8207    model_arch = gguf.MODEL_ARCH.DOTS1
 8208
 8209    def __init__(self, *args, **kwargs):
 8210        super().__init__(*args, **kwargs)
 8211        self.hparams["num_experts"] = self.hparams["n_routed_experts"]
 8212
 8213    def set_gguf_parameters(self):
 8214        super().set_gguf_parameters()
 8215        self.gguf_writer.add_leading_dense_block_count(self.hparams["first_k_dense_replace"])
 8216        self.gguf_writer.add_expert_shared_count(self.hparams["n_shared_experts"])
 8217        self.gguf_writer.add_expert_weights_scale(self.hparams["routed_scaling_factor"])
 8218        self.gguf_writer.add_expert_weights_norm(self.hparams["norm_topk_prob"])
 8219
 8220    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None):
 8221        if name.endswith("e_score_correction_bias"):
 8222            name = name.replace("e_score_correction_bias", "e_score_correction.bias")
 8223        if "shared_experts" in name:
 8224            yield from ModelBase.modify_tensors(self, data_torch, name, bid)
 8225        else:
 8226            yield from super().modify_tensors(data_torch, name, bid)
 8227
 8228
 8229@ModelBase.register("PLMForCausalLM")
 8230class PLMModel(TextModel):
 8231    model_arch = gguf.MODEL_ARCH.PLM
 8232
 8233    def set_vocab(self):
 8234        self._set_vocab_gpt2()
 8235
 8236    def set_gguf_parameters(self):
 8237        super().set_gguf_parameters()
 8238        hparams = self.hparams
 8239        self.gguf_writer.add_vocab_size(hparams["vocab_size"])
 8240        self.gguf_writer.add_kv_lora_rank(hparams["kv_lora_rank"])
 8241        self.gguf_writer.add_key_length(hparams["qk_nope_head_dim"] + hparams["qk_rope_head_dim"])
 8242        self.gguf_writer.add_value_length(hparams["v_head_dim"])
 8243        self.gguf_writer.add_rope_dimension_count(hparams["qk_rope_head_dim"])
 8244
 8245    def prepare_tensors(self):
 8246        super().prepare_tensors()
 8247
 8248
 8249@ModelBase.register("T5WithLMHeadModel")
 8250@ModelBase.register("T5ForConditionalGeneration")
 8251@ModelBase.register("MT5ForConditionalGeneration")
 8252@ModelBase.register("UMT5ForConditionalGeneration")
 8253@ModelBase.register("UMT5Model")
 8254class T5Model(TextModel):
 8255    model_arch = gguf.MODEL_ARCH.T5
 8256
 8257    def __init__(self, *args, **kwargs):
 8258        super().__init__(*args, **kwargs)
 8259        self.shared_token_embeddings_found = False
 8260
 8261    def set_vocab(self):
 8262        # to avoid TypeError: Descriptors cannot be created directly
 8263        # exception when importing sentencepiece_model_pb2
 8264        os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"
 8265        from sentencepiece import SentencePieceProcessor
 8266        from sentencepiece import sentencepiece_model_pb2 as model
 8267
 8268        tokenizer_path = self.dir_model / 'tokenizer.model'
 8269
 8270        # many older models use the spiece.model filename for the tokenizer model
 8271        if not tokenizer_path.is_file():
 8272            tokenizer_path = self.dir_model / 'spiece.model'
 8273
 8274        if not tokenizer_path.is_file():
 8275            raise FileNotFoundError(f"File not found: {tokenizer_path}")
 8276
 8277        sentencepiece_model = model.ModelProto()  # pyright: ignore[reportAttributeAccessIssue]
 8278        sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read())
 8279
 8280        # some models like Pile-T5 family use BPE tokenizer instead of Unigram
 8281        if sentencepiece_model.trainer_spec.model_type == 2:  # BPE
 8282            # ensure the tokenizer model file name is correct
 8283            assert tokenizer_path.name == 'tokenizer.model'
 8284            return self._set_vocab_sentencepiece()
 8285        else:
 8286            assert sentencepiece_model.trainer_spec.model_type == 1  # UNIGRAM
 8287
 8288        add_prefix = sentencepiece_model.normalizer_spec.add_dummy_prefix
 8289        remove_whitespaces = sentencepiece_model.normalizer_spec.remove_extra_whitespaces
 8290        precompiled_charsmap = sentencepiece_model.normalizer_spec.precompiled_charsmap
 8291
 8292        tokenizer = SentencePieceProcessor()
 8293        tokenizer.LoadFromFile(str(tokenizer_path))
 8294
 8295        vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size())
 8296
 8297        tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
 8298        scores: list[float] = [-10000.0] * vocab_size
 8299        toktypes: list[int] = [SentencePieceTokenTypes.UNUSED] * vocab_size
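        # the lists are pre-filled with [PAD{i}] / UNUSED placeholders so that any id not covered by the
        # SentencePiece model or added_tokens.json below simply remains a padding token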
 8300
 8301        for token_id in range(tokenizer.vocab_size()):
 8302            piece = tokenizer.IdToPiece(token_id)
 8303            text = piece.encode("utf-8")
 8304            score = tokenizer.GetScore(token_id)
 8305
 8306            toktype = SentencePieceTokenTypes.NORMAL
 8307            if tokenizer.IsUnknown(token_id):
 8308                toktype = SentencePieceTokenTypes.UNKNOWN
 8309            elif tokenizer.IsControl(token_id):
 8310                toktype = SentencePieceTokenTypes.CONTROL
 8311            elif tokenizer.IsUnused(token_id):
 8312                toktype = SentencePieceTokenTypes.UNUSED
 8313            elif tokenizer.IsByte(token_id):
 8314                toktype = SentencePieceTokenTypes.BYTE
 8315
 8316            tokens[token_id] = text
 8317            scores[token_id] = score
 8318            toktypes[token_id] = toktype
 8319
 8320        added_tokens_file = self.dir_model / 'added_tokens.json'
 8321        if added_tokens_file.is_file():
 8322            with open(added_tokens_file, "r", encoding="utf-8") as f:
 8323                added_tokens_json = json.load(f)
 8324                for key in added_tokens_json:
 8325                    token_id = added_tokens_json[key]
 8326                    if token_id >= vocab_size:
 8327                        logger.warning(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}')
 8328                        continue
 8329
 8330                    tokens[token_id] = key.encode("utf-8")
 8331                    scores[token_id] = -1000.0
 8332                    toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
 8333
 8334        if vocab_size > len(tokens):
 8335            pad_count = vocab_size - len(tokens)
 8336            logger.debug(f"Padding vocab with {pad_count} token(s) - [PAD1] through [PAD{pad_count}]")
 8337            for i in range(1, pad_count + 1):
 8338                tokens.append(bytes(f"[PAD{i}]", encoding="utf-8"))
 8339                scores.append(-1000.0)
 8340                toktypes.append(SentencePieceTokenTypes.UNUSED)
 8341
 8342        self.gguf_writer.add_tokenizer_model("t5")
 8343        self.gguf_writer.add_tokenizer_pre("default")
 8344        self.gguf_writer.add_token_list(tokens)
 8345        self.gguf_writer.add_token_scores(scores)
 8346        self.gguf_writer.add_token_types(toktypes)
 8347        self.gguf_writer.add_add_space_prefix(add_prefix)
 8348        self.gguf_writer.add_remove_extra_whitespaces(remove_whitespaces)
 8349        if precompiled_charsmap:
 8350            self.gguf_writer.add_precompiled_charsmap(precompiled_charsmap)
 8351
 8352        special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
 8353        special_vocab.add_to_gguf(self.gguf_writer)
 8354
 8355    def set_gguf_parameters(self):
 8356        if (n_ctx := self.find_hparam(["n_positions"], optional=True)) is None:
 8357            logger.warning("Couldn't find context length in config.json, assuming default value of 512")
 8358            n_ctx = 512
 8359        self.gguf_writer.add_context_length(n_ctx)
 8360        self.gguf_writer.add_embedding_length(self.hparams["d_model"])
 8361        self.gguf_writer.add_feed_forward_length(self.hparams["d_ff"])
 8362        self.gguf_writer.add_block_count(self.block_count)
 8363        if (dec_n_layer := self.hparams.get("num_decoder_layers")) is not None:
 8364            self.gguf_writer.add_decoder_block_count(dec_n_layer)
 8365        self.gguf_writer.add_head_count(self.hparams["num_heads"])
 8366        self.gguf_writer.add_key_length(self.hparams["d_kv"])
 8367        self.gguf_writer.add_value_length(self.hparams["d_kv"])
 8368        self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"])
 8369        self.gguf_writer.add_relative_attn_buckets_count(self.hparams["relative_attention_num_buckets"])
 8370        self.gguf_writer.add_layer_norm_rms_eps(self.hparams["layer_norm_epsilon"])
 8371        self.gguf_writer.add_decoder_start_token_id(self.hparams["decoder_start_token_id"])
 8372        self.gguf_writer.add_file_type(self.ftype)
 8373
 8374    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
 8375        # T5 based models contain shared token embeddings tensors saved randomly as either "encoder.embed_tokens.weight",
 8376        # "decoder.embed_tokens.weight" or "shared.weight" tensor. In some models there are even multiple of them stored
 8377        # in the safetensors files. We use the first tensor from these three as the token embeddings for both encoder
 8378        # and decoder and ignore the remaining ones.
 8379        if name in ["decoder.embed_tokens.weight", "encoder.embed_tokens.weight", "shared.weight"]:
 8380            if not self.shared_token_embeddings_found:
 8381                name = "shared.weight"
 8382                self.shared_token_embeddings_found = True
 8383            else:
 8384                logger.debug(f"Skipping shared tensor {name!r} in safetensors so that convert can end normally.")
 8385                return
 8386
 8387        yield from super().modify_tensors(data_torch, name, bid)
 8388
 8389
 8390@ModelBase.register("T5EncoderModel")
 8391class T5EncoderModel(TextModel):
 8392    model_arch = gguf.MODEL_ARCH.T5ENCODER
 8393
 8394    def __init__(self, *args, **kwargs):
 8395        super().__init__(*args, **kwargs)
 8396        self.shared_token_embeddings_found = False
 8397
 8398    def set_vocab(self):
 8399        # to avoid TypeError: Descriptors cannot be created directly
 8400        # exception when importing sentencepiece_model_pb2
 8401        os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"
 8402        from sentencepiece import SentencePieceProcessor
 8403        from sentencepiece import sentencepiece_model_pb2 as model
 8404
 8405        tokenizer_path = self.dir_model / 'tokenizer.model'
 8406
 8407        # many older models use the spiece.model filename for the tokenizer model
 8408        if not tokenizer_path.is_file():
 8409            tokenizer_path = self.dir_model / 'spiece.model'
 8410
 8411        if not tokenizer_path.is_file():
 8412            raise FileNotFoundError(f"File not found: {tokenizer_path}")
 8413
 8414        sentencepiece_model = model.ModelProto()  # pyright: ignore[reportAttributeAccessIssue]
 8415        sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read())
 8416
 8417        # some models like Pile-T5 family use BPE tokenizer instead of Unigram
 8418        if sentencepiece_model.trainer_spec.model_type == 2:  # BPE
 8419            # ensure the tokenizer model file name is correct
 8420            assert tokenizer_path.name == 'tokenizer.model'
 8421            return self._set_vocab_sentencepiece()
 8422        else:
 8423            assert sentencepiece_model.trainer_spec.model_type == 1  # UNIGRAM
 8424
 8425        add_prefix = sentencepiece_model.normalizer_spec.add_dummy_prefix
 8426        remove_whitespaces = sentencepiece_model.normalizer_spec.remove_extra_whitespaces
 8427        precompiled_charsmap = sentencepiece_model.normalizer_spec.precompiled_charsmap
 8428
 8429        tokenizer = SentencePieceProcessor()
 8430        tokenizer.LoadFromFile(str(tokenizer_path))
 8431
 8432        vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size())
 8433
 8434        tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
 8435        scores: list[float] = [-10000.0] * vocab_size
 8436        toktypes: list[int] = [SentencePieceTokenTypes.UNUSED] * vocab_size
 8437
 8438        for token_id in range(tokenizer.vocab_size()):
 8439            piece = tokenizer.IdToPiece(token_id)
 8440            text = piece.encode("utf-8")
 8441            score = tokenizer.GetScore(token_id)
 8442
 8443            toktype = SentencePieceTokenTypes.NORMAL
 8444            if tokenizer.IsUnknown(token_id):
 8445                toktype = SentencePieceTokenTypes.UNKNOWN
 8446            elif tokenizer.IsControl(token_id):
 8447                toktype = SentencePieceTokenTypes.CONTROL
 8448            elif tokenizer.IsUnused(token_id):
 8449                toktype = SentencePieceTokenTypes.UNUSED
 8450            elif tokenizer.IsByte(token_id):
 8451                toktype = SentencePieceTokenTypes.BYTE
 8452
 8453            tokens[token_id] = text
 8454            scores[token_id] = score
 8455            toktypes[token_id] = toktype
 8456
 8457        added_tokens_file = self.dir_model / 'added_tokens.json'
 8458        if added_tokens_file.is_file():
 8459            with open(added_tokens_file, "r", encoding="utf-8") as f:
 8460                added_tokens_json = json.load(f)
 8461                for key in added_tokens_json:
 8462                    token_id = added_tokens_json[key]
 8463                    if token_id >= vocab_size:
 8464                        logger.warning(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}')
 8465                        continue
 8466
 8467                    tokens[token_id] = key.encode("utf-8")
 8468                    scores[token_id] = -1000.0
 8469                    toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
 8470
 8471        if vocab_size > len(tokens):
 8472            pad_count = vocab_size - len(tokens)
 8473            logger.debug(f"Padding vocab with {pad_count} token(s) - [PAD1] through [PAD{pad_count}]")
 8474            for i in range(1, pad_count + 1):
 8475                tokens.append(bytes(f"[PAD{i}]", encoding="utf-8"))
 8476                scores.append(-1000.0)
 8477                toktypes.append(SentencePieceTokenTypes.UNUSED)
 8478
 8479        self.gguf_writer.add_tokenizer_model("t5")
 8480        self.gguf_writer.add_tokenizer_pre("default")
 8481        self.gguf_writer.add_token_list(tokens)
 8482        self.gguf_writer.add_token_scores(scores)
 8483        self.gguf_writer.add_token_types(toktypes)
 8484        self.gguf_writer.add_add_space_prefix(add_prefix)
 8485        self.gguf_writer.add_remove_extra_whitespaces(remove_whitespaces)
 8486        if precompiled_charsmap:
 8487            self.gguf_writer.add_precompiled_charsmap(precompiled_charsmap)
 8488
 8489        special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
 8490        special_vocab.add_to_gguf(self.gguf_writer)
 8491
 8492    def set_gguf_parameters(self):
 8493        if (n_ctx := self.find_hparam(["n_positions"], optional=True)) is None:
 8494            logger.warning("Couldn't find context length in config.json, assuming default value of 512")
 8495            n_ctx = 512
 8496        self.gguf_writer.add_context_length(n_ctx)
 8497        self.gguf_writer.add_embedding_length(self.hparams["d_model"])
 8498        self.gguf_writer.add_feed_forward_length(self.hparams["d_ff"])
 8499        self.gguf_writer.add_block_count(self.block_count)
 8500        self.gguf_writer.add_head_count(self.hparams["num_heads"])
 8501        self.gguf_writer.add_key_length(self.hparams["d_kv"])
 8502        self.gguf_writer.add_value_length(self.hparams["d_kv"])
 8503        self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"])
 8504        self.gguf_writer.add_relative_attn_buckets_count(self.hparams["relative_attention_num_buckets"])
 8505        self.gguf_writer.add_layer_norm_rms_eps(self.hparams["layer_norm_epsilon"])
 8506        self.gguf_writer.add_file_type(self.ftype)
 8507
 8508    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
 8509        # T5 based models contain shared token embeddings tensors saved randomly as either "encoder.embed_tokens.weight",
 8510        # "decoder.embed_tokens.weight" or "shared.weight" tensor. In some models there are even multiple of them stored
 8511        # in the safetensors files. We use the first tensor from these three as the token embeddings for both encoder
 8512        # and decoder and ignore the remaining ones.
 8513        if name in ["decoder.embed_tokens.weight", "encoder.embed_tokens.weight", "shared.weight"]:
 8514            if not self.shared_token_embeddings_found:
 8515                name = "shared.weight"
 8516                self.shared_token_embeddings_found = True
 8517            else:
 8518                logger.debug(f"Skipping shared tensor {name!r} in safetensors so that convert can end normally.")
 8519                return
 8520
 8521        yield from super().modify_tensors(data_torch, name, bid)
 8522
 8523
 8524@ModelBase.register("JAISLMHeadModel")
 8525class JaisModel(TextModel):
 8526    model_arch = gguf.MODEL_ARCH.JAIS
 8527
 8528    def __init__(self, *args, **kwargs):
 8529        super().__init__(*args, **kwargs)
 8530
 8531        # SwiGLU activation
 8532        assert self.hparams["activation_function"] == "swiglu"
 8533        # ALiBi position embedding
 8534        assert self.hparams["position_embedding_type"] == "alibi"
 8535
 8536        # Embeddings scale
 8537        self.embeddings_scale = 1.0
 8538        if 'mup_embeddings_scale' in self.hparams:
 8539            self.embeddings_scale = self.hparams['mup_embeddings_scale']
 8540        elif 'embeddings_scale' in self.hparams:
 8541            self.embeddings_scale = self.hparams['embeddings_scale']
 8542        else:
 8543            assert False
 8544
 8545        self.width_scale = 1.0
 8546        if 'mup_output_alpha' in self.hparams:
 8547            assert 'mup_width_scale' in self.hparams
 8548            self.width_scale = self.hparams['mup_output_alpha'] * self.hparams['mup_width_scale']
 8549        elif 'width_scale' in self.hparams:
 8550            self.width_scale = self.hparams['width_scale']
 8551        else:
 8552            assert False
 8553
 8554        self.max_alibi_bias = 8.0
 8555
 8556    def set_vocab(self):
 8557        self._set_vocab_gpt2()
 8558
 8559    def set_gguf_parameters(self):
 8560        self.gguf_writer.add_block_count(self.block_count)
 8561        self.gguf_writer.add_context_length(self.hparams["n_positions"])
 8562        self.gguf_writer.add_embedding_length(self.hparams["n_embd"])
 8563        self.gguf_writer.add_feed_forward_length(self.hparams["n_inner"])
 8564        self.gguf_writer.add_head_count(self.hparams["n_head"])
 8565        self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"])
 8566        self.gguf_writer.add_file_type(self.ftype)
 8567
 8568    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
 8569        # we don't need these
 8570        if name.endswith(".attn.bias"):
 8571            return
 8572
 8573        if name.endswith("relative_pe.slopes"):
 8574            # Calculate max ALiBi bias (this is the inverse of the ALiBi calculation)
 8575            # Some other models have max_alibi_bias spelled out explicitly in the hyperparams,
 8576            # but Jais's PyTorch model simply precalculates the slope values and places them
 8577            # in relative_pe.slopes
 8578            n_head_closest_log2 = 2 ** math.floor(math.log2(self.hparams["n_head"]))
 8579            first_val = float(data_torch[0].item())
 8580            self.max_alibi_bias = -round(math.log2(first_val) * n_head_closest_log2)
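            # worked example with hypothetical values: n_head=8 gives n_head_closest_log2=8, and a first
            # slope of 0.5 yields max_alibi_bias = -log2(0.5) * 8 = 8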
 8581
 8582            return
 8583
 8584        if name.endswith((".c_attn.weight", ".c_proj.weight", ".c_fc.weight", ".c_fc2.weight")):
 8585            data_torch = data_torch.transpose(1, 0)
 8586
 8587        new_name = self.map_tensor_name(name)
 8588
 8589        if new_name == self.format_tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD):
 8590            yield from super().modify_tensors(data_torch * self.embeddings_scale, new_name, bid)
 8591        elif new_name == self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT):
 8592            yield from super().modify_tensors(data_torch * self.width_scale, new_name, bid)
 8593        else:
 8594            yield from super().modify_tensors(data_torch, new_name, bid)
 8595
 8596    def prepare_tensors(self):
 8597        super().prepare_tensors()
 8598        self.gguf_writer.add_max_alibi_bias(self.max_alibi_bias)
 8599
 8600
 8601@ModelBase.register("Glm4ForCausalLM", "Glm4vForConditionalGeneration")
 8602class Glm4Model(TextModel):
 8603    model_arch = gguf.MODEL_ARCH.GLM4
 8604    use_mrope = False
 8605    partial_rotary_factor = 0.5
 8606
 8607    def __init__(self, *args, **kwargs):
 8608        super().__init__(*args, **kwargs)
 8609        self.partial_rotary_factor = self.rope_parameters.get("partial_rotary_factor", 0.5)
 8610        if "mrope_section" in self.rope_parameters:
 8611            self.use_mrope = True
 8612            logger.info("Q/K weight will need to be permuted for M-RoPE")
 8613
 8614    def set_vocab(self):
 8615        from transformers import AutoTokenizer
 8616        tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True)
 8617        special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True)
 8618        tokens, toktypes, tokpre = self.get_vocab_base()
 8619        self.gguf_writer.add_tokenizer_model("gpt2")
 8620        self.gguf_writer.add_tokenizer_pre(tokpre)
 8621        self.gguf_writer.add_token_list(tokens)
 8622        self.gguf_writer.add_token_types(toktypes)
 8623        special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True)
 8624        special_vocab._set_special_token("eos", tokenizer.get_added_vocab()["<|endoftext|>"])
 8625        special_vocab._set_special_token("eot", tokenizer.get_added_vocab()["<|user|>"])
 8626        special_vocab._set_special_token("unk", tokenizer.get_added_vocab()["<|endoftext|>"])
 8627        special_vocab._set_special_token("bos", tokenizer.get_added_vocab()["<|endoftext|>"])
 8628        special_vocab.add_to_gguf(self.gguf_writer)
 8629
 8630    def set_gguf_parameters(self):
 8631        super().set_gguf_parameters()
 8632        if (rope_dim := self.hparams.get("head_dim")) is None:
 8633            rope_dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
 8634        self.gguf_writer.add_rope_dimension_count(int(rope_dim * self.partial_rotary_factor))
 8635
 8636    @staticmethod
 8637    def normal_to_neox(weights: Tensor, n_head: int, n_head_kv: int, head_dim: int, partial_rotary_factor: float) -> Tensor:
 8638        orig_shape = weights.shape
 8639        if len(orig_shape) == 1:
 8640            weights = weights.unsqueeze(1)  # [out_dim, 1]
 8641        if len(weights.shape) != 2:
 8642            raise ValueError("Only 1D and 2D tensors are supported.")
 8643        n_effective_heads = weights.shape[0] // head_dim
 8644        if n_head_kv is not None and n_effective_heads != n_head:
 8645            if n_effective_heads != n_head_kv:
 8646                raise AssertionError(f"Mismatch in effective heads: computed {n_effective_heads}, expected {n_head} or {n_head_kv}")
 8647        rotary_dim = int(head_dim * partial_rotary_factor)
 8648        if rotary_dim % 2 != 0:
 8649            raise ValueError("rotary_dim must be even.")
 8650        reshaped = weights.reshape(n_effective_heads, head_dim, -1)
 8651        rot_part = reshaped[:, :rotary_dim, :]
 8652        non_rot_part = reshaped[:, rotary_dim:, :]
 8653        permuted_rot = torch.cat((rot_part[:, ::2, :], rot_part[:, 1::2, :]), dim=1)
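        # e.g. with rotary_dim=4 the per-head row order [0, 1, 2, 3] becomes [0, 2, 1, 3]:
        # the interleaved ("normal") pair layout is regrouped into the half-split Neox layout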
 8654        combined = torch.cat((permuted_rot, non_rot_part), dim=1)
 8655        result = combined.reshape(weights.shape)
 8656        return result if len(orig_shape) != 1 else result.squeeze(1)
 8657
 8658    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
 8659        if name.startswith("model.visual."): # ignore visual part of Glm4v
 8660            return
 8661        elif name.startswith("model.language_model."):
 8662            name = name.replace("language_model.", "") # for Glm4v
 8663        if self.use_mrope:
 8664            n_head = self.hparams["num_attention_heads"]
 8665            n_kv_head = self.hparams["num_key_value_heads"]
 8666            n_embd = self.hparams["hidden_size"]
 8667            head_dim = n_embd // n_head
 8668            # because llama.cpp M-RoPE kernel only supports Neox ordering, we have to permute the weights here
 8669            if name.endswith(("q_proj.weight", "q_proj.bias")):
 8670                data_torch = Glm4Model.normal_to_neox(data_torch, n_head, n_head, head_dim, self.partial_rotary_factor)
 8671            if name.endswith(("k_proj.weight", "k_proj.bias")):
 8672                data_torch = Glm4Model.normal_to_neox(data_torch, n_head, n_kv_head, head_dim, self.partial_rotary_factor)
 8673        yield from super().modify_tensors(data_torch, name, bid)
 8674
 8675
 8676@ModelBase.register("Glm4MoeForCausalLM", "Glm4vMoeForConditionalGeneration")
 8677class Glm4MoeModel(TextModel):
 8678    model_arch = gguf.MODEL_ARCH.GLM4_MOE
 8679
 8680    def __init__(self, *args, **kwargs):
 8681        super().__init__(*args, **kwargs)
 8682        # GLM4_MOE has num_hidden_layers + num_nextn_predict_layers actual layers (including the NextN/MTP layers)
 8683        self.block_count = self.hparams["num_hidden_layers"] + self.hparams.get("num_nextn_predict_layers", 0)
 8684        self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)
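        # e.g. (hypothetical values) num_hidden_layers=40 with num_nextn_predict_layers=1 gives
        # block_count=41, so the NextN/MTP tensors of layer 40 are still mapped instead of being dropped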
 8685
 8686    def set_vocab(self):
 8687        from transformers import AutoTokenizer
 8688
 8689        tokenizer = AutoTokenizer.from_pretrained(self.dir_model)
 8690        special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True)
 8691        tokens, toktypes, tokpre = self.get_vocab_base()
 8692        self.gguf_writer.add_tokenizer_model("gpt2")
 8693        self.gguf_writer.add_tokenizer_pre(tokpre)
 8694        self.gguf_writer.add_token_list(tokens)
 8695        self.gguf_writer.add_token_types(toktypes)
 8696
 8697        # Special tokens
 8698        # Note: Using <|endoftext|> (151329) for eot causes endless generation
 8699        special_vocab._set_special_token("bos", tokenizer.get_added_vocab()["[gMASK]"])  # 151331
 8700        special_vocab._set_special_token("eot", tokenizer.get_added_vocab()["<|user|>"])  # 151336
 8701        special_vocab._set_special_token("unk", tokenizer.get_added_vocab()["<|endoftext|>"]) # 151329
 8702        special_vocab._set_special_token("eom", tokenizer.get_added_vocab()["<|observation|>"])  # 151338
 8703
 8704        special_vocab.add_to_gguf(self.gguf_writer)
 8705
 8706    def set_gguf_parameters(self):
 8707        super().set_gguf_parameters()
 8708        if (rope_dim := self.hparams.get("head_dim")) is None:
 8709            rope_dim = (
 8710                self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
 8711            )
 8712        self.gguf_writer.add_rope_dimension_count(
 8713            int(rope_dim * self.hparams.get("partial_rotary_factor", 0.5))
 8714        )
 8715
 8716        # MoE parameters - Use only routed expert count (shared experts handled separately)
 8717        if (n_routed_experts := self.hparams.get("n_routed_experts")) is not None:
 8718            self.gguf_writer.add_expert_count(n_routed_experts)
 8719        if (moe_intermediate_size := self.hparams.get("moe_intermediate_size")) is not None:
 8720            self.gguf_writer.add_expert_feed_forward_length(moe_intermediate_size)
 8721        if (n_shared_experts := self.hparams.get("n_shared_experts")) is not None:
 8722            self.gguf_writer.add_expert_shared_count(n_shared_experts)
 8723        if (first_k_dense_replace := self.hparams.get("first_k_dense_replace")) is not None:
 8724            self.gguf_writer.add_leading_dense_block_count(first_k_dense_replace)
 8725
 8726        # Expert gating function (sigmoid for GLM4_MOE)
 8727        self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SIGMOID)
 8728
 8729        # Routed scaling factor
 8730        if (routed_scaling_factor := self.hparams.get("routed_scaling_factor")) is not None:
 8731            self.gguf_writer.add_expert_weights_scale(routed_scaling_factor)
 8732
 8733        # Normalise topk probabilities
 8734        if (norm_topk_prob := self.hparams.get("norm_topk_prob")) is not None:
 8735            self.gguf_writer.add_expert_weights_norm(norm_topk_prob)
 8736
 8737        # NextN/MTP prediction layers
 8738        if (num_nextn_predict_layers := self.hparams.get("num_nextn_predict_layers")) is not None:
 8739            self.gguf_writer.add_nextn_predict_layers(num_nextn_predict_layers)
 8740
 8741    _experts: list[dict[str, Tensor]] | None = None
 8742
 8743    # note: unlike GLM4V non-MoE, we don't need to permute Q/K here since GLM4V_MOE uses Neox ordering already
 8744    def modify_tensors(
 8745        self, data_torch: Tensor, name: str, bid: int | None
 8746    ) -> Iterable[tuple[str, Tensor]]:
 8747        if name.startswith("model.visual."):  # ignore visual part
 8748            return
 8749        elif name.startswith("model.language_model."):
 8750            name = name.replace("language_model.", "")  # for multimodal variants
 8751
 8752        # Handle main token embedding (but not layer-specific NextN embeddings)
 8753        if name == "model.embed_tokens.weight" and ".layers." not in name:
 8754            yield from super().modify_tensors(data_torch, "token_embd.weight", bid)
 8755            return
 8756
 8757        # Handle routed experts
 8758        if name.find("mlp.experts") != -1:
 8759            n_experts = self.hparams["n_routed_experts"]
 8760            assert bid is not None
 8761
 8762            if self._experts is None:
 8763                self._experts = [{} for _ in range(self.block_count)]
 8764
 8765            self._experts[bid][name] = data_torch
 8766
 8767            if len(self._experts[bid]) >= n_experts * 3:
 8768                # merge the experts into a single 3d tensor
 8769                for w_name in ["down_proj", "gate_proj", "up_proj"]:
 8770                    datas: list[Tensor] = []
 8771
 8772                    for xid in range(n_experts):
 8773                        ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight"
 8774                        datas.append(self._experts[bid][ename])
 8775                        del self._experts[bid][ename]
 8776
 8777                    data_torch = torch.stack(datas, dim=0)
 8778
 8779                    merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight"
 8780
 8781                    yield from super().modify_tensors(data_torch, merged_name, bid)
 8782                return
 8783            else:
 8784                return
 8785
 8786        if name.endswith("e_score_correction_bias"):
 8787            name = name.replace("e_score_correction_bias", "e_score_correction.bias")
 8788
 8789        yield from super().modify_tensors(data_torch, name, bid)
 8790
 8791    def prepare_tensors(self):
 8792        super().prepare_tensors()
 8793        if self._experts is not None:
 8794            # flatten `list[dict[str, Tensor]]` into `list[str]`
 8795            experts = [k for d in self._experts for k in d.keys()]
 8796            if len(experts) > 0:
 8797                raise ValueError(f"Unprocessed experts: {experts}")
 8798
 8799
 8800@ModelBase.register("Glm4MoeLiteForCausalLM")
 8801class Glm4MoeLiteModel(DeepseekV2Model):
 8802    model_arch = gguf.MODEL_ARCH.DEEPSEEK2
 8803
 8804    # copied from Glm4MoeModel
 8805    def set_vocab(self):
 8806        from transformers import AutoTokenizer
 8807
 8808        tokenizer = AutoTokenizer.from_pretrained(self.dir_model)
 8809        special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True)
 8810        tokens, toktypes, tokpre = self.get_vocab_base()
 8811        self.gguf_writer.add_tokenizer_model("gpt2")
 8812        self.gguf_writer.add_tokenizer_pre(tokpre)
 8813        self.gguf_writer.add_token_list(tokens)
 8814        self.gguf_writer.add_token_types(toktypes)
 8815
 8816        # Special tokens
 8817        # Note: Using <|endoftext|> (151329) for eot causes endless generation
 8818        special_vocab._set_special_token("bos", tokenizer.get_added_vocab()["[gMASK]"])  # 151331
 8819        special_vocab._set_special_token("eot", tokenizer.get_added_vocab()["<|user|>"])  # 151336
 8820        special_vocab._set_special_token("unk", tokenizer.get_added_vocab()["<|endoftext|>"]) # 151329
 8821        special_vocab._set_special_token("eom", tokenizer.get_added_vocab()["<|observation|>"])  # 151338
 8822
 8823        special_vocab.add_to_gguf(self.gguf_writer)
 8824
 8825
 8826@ModelBase.register("GlmForCausalLM", "ChatGLMModel", "ChatGLMForConditionalGeneration")
 8827class ChatGLMModel(TextModel):
 8828    model_arch = gguf.MODEL_ARCH.CHATGLM
 8829
 8830    def set_vocab_chatglm3(self):
 8831        dir_model = self.dir_model
 8832        hparams = self.hparams
 8833        tokens: list[bytes] = []
 8834        toktypes: list[int] = []
 8835        scores: list[float] = []
 8836
 8837        from transformers import AutoTokenizer
 8838        tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True)
 8839        vocab_size = hparams.get("padded_vocab_size", len(tokenizer.get_vocab()))
 8840        assert max(tokenizer.get_vocab().values()) < vocab_size
 8841        role_special_tokens = ["<|system|>", "<|user|>", "<|assistant|>", "<|observation|>"]
 8842        special_tokens = ["[MASK]", "[gMASK]", "[sMASK]", "sop", "eop"] + role_special_tokens
 8843        for token_id in range(vocab_size):
 8844            piece = tokenizer._convert_id_to_token(token_id)
 8845            if token_id == 0:
 8846                piece = "<unk>"
 8847            elif token_id == 1:
 8848                piece = "<bos>"
 8849            elif token_id == 2:
 8850                piece = "<eos>"
 8851
 8852            text = piece.encode("utf-8")
 8853            score = 0.0
 8854            # Referencing the tokenizer's Python implementation (https://huggingface.co/THUDM/chatglm3-6b/blob/main/tokenization_chatglm.py),
 8855            # a score is only valid if the token id is less than tokenizer.tokenizer.sp_model.vocab_size()
 8856            if len(piece) != 0 and token_id < tokenizer.tokenizer.sp_model.vocab_size():
 8857                score = tokenizer.tokenizer.sp_model.get_score(token_id)
 8858
 8859            if token_id >= tokenizer.tokenizer.sp_model.vocab_size():
 8860                if piece in special_tokens:
 8861                    toktype = SentencePieceTokenTypes.CONTROL
 8862                elif len(piece) == 0:
 8863                    text = f"[PAD{token_id}]".encode("utf-8")
 8864                    toktype = SentencePieceTokenTypes.UNUSED
 8865                else:
 8866                    toktype = SentencePieceTokenTypes.USER_DEFINED
 8867                tokens.append(text)
 8868                scores.append(score)
 8869                toktypes.append(toktype)
 8870                continue
 8871
 8872            toktype = SentencePieceTokenTypes.NORMAL
 8873            if tokenizer.tokenizer.sp_model.is_unknown(token_id):
 8874                toktype = SentencePieceTokenTypes.UNKNOWN
 8875            elif tokenizer.tokenizer.sp_model.is_control(token_id):
 8876                toktype = SentencePieceTokenTypes.CONTROL
 8877            elif tokenizer.tokenizer.sp_model.is_unused(token_id):
 8878                toktype = SentencePieceTokenTypes.UNUSED
 8879            elif tokenizer.tokenizer.sp_model.is_byte(token_id):
 8880                toktype = SentencePieceTokenTypes.BYTE
 8881
 8882            tokens.append(text)
 8883            scores.append(score)
 8884            toktypes.append(toktype)
 8885
 8886        self.gguf_writer.add_tokenizer_model("llama")
 8887        # glm3 needs prefix and suffix formatted as:
 8888        # prompt = "[gMASK]sop<|user|>\n" + prompt + "<|assistant|>"
 8889        self.gguf_writer.add_tokenizer_pre("chatglm-spm")
 8890        self.gguf_writer.add_token_list(tokens)
 8891        self.gguf_writer.add_token_scores(scores)
 8892        self.gguf_writer.add_token_types(toktypes)
 8893
 8894        special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
 8895        special_vocab.add_to_gguf(self.gguf_writer)
 8896
 8897    @staticmethod
 8898    def token_bytes_to_string(b):
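             # bytes_to_unicode() is the GPT-2 byte-to-printable-character table, so raw bytes
             # become the visible characters used in BPE merge entries
             # (e.g. the space byte 0x20 maps to "Ġ" and the newline byte 0x0A to "Ċ").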
 8899        from transformers.models.gpt2.tokenization_gpt2 import bytes_to_unicode
 8900        byte_encoder = bytes_to_unicode()
 8901        return ''.join([byte_encoder[ord(char)] for char in b.decode('latin-1')])
 8902
 8903    @staticmethod
 8904    def bpe(mergeable_ranks: dict[bytes, int], token: bytes, max_rank: int | None = None) -> list[bytes]:
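             # Greedy BPE: repeatedly merge the adjacent pair with the lowest rank until no
             # mergeable pair (with rank below max_rank) remains. Illustrative, with made-up ranks:
             #   bpe({b"ab": 0, b"abc": 1}, b"abc") -> [b"a", b"b", b"c"] -> [b"ab", b"c"] -> [b"abc"]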
 8905        parts = [bytes([b]) for b in token]
 8906        while True:
 8907            min_idx = None
 8908            min_rank = None
 8909            for i, pair in enumerate(zip(parts[:-1], parts[1:])):
 8910                rank = mergeable_ranks.get(pair[0] + pair[1])
 8911                if rank is not None and (min_rank is None or rank < min_rank):
 8912                    min_idx = i
 8913                    min_rank = rank
 8914            if min_rank is None or (max_rank is not None and min_rank >= max_rank):
 8915                break
 8916            assert min_idx is not None
 8917            parts = parts[:min_idx] + [parts[min_idx] + parts[min_idx + 1]] + parts[min_idx + 2:]
 8918        return parts
 8919
 8920    def set_vocab(self):
 8921        if "THUDM/chatglm3-6b" in self.hparams.get("_name_or_path", ""):
 8922            self.set_vocab_chatglm3()
 8923            return
 8924
 8925        dir_model = self.dir_model
 8926        hparams = self.hparams
 8927        tokens: list[str] = []
 8928        toktypes: list[int] = []
 8929
 8930        from transformers import AutoTokenizer
 8931        tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True)
 8932        vocab_size = hparams.get("padded_vocab_size", hparams["vocab_size"])
 8933        assert max(tokenizer.get_vocab().values()) < vocab_size
 8934
 8935        tokens, toktypes, tokpre = self.get_vocab_base()
 8936        self.gguf_writer.add_tokenizer_model("gpt2")
 8937        self.gguf_writer.add_tokenizer_pre(tokpre)
 8938        self.gguf_writer.add_token_list(tokens)
 8939        self.gguf_writer.add_token_types(toktypes)
 8940        special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True)
 8941        # only add special tokens when they were not already loaded from config.json
 8942        special_vocab._set_special_token("eos", tokenizer.get_added_vocab()["<|endoftext|>"])
 8943        special_vocab._set_special_token("eot", tokenizer.get_added_vocab()["<|user|>"])
 8944        # this one is usually not in config.json anyway
 8945        special_vocab._set_special_token("unk", tokenizer.get_added_vocab()["<|endoftext|>"])
 8946        special_vocab.add_to_gguf(self.gguf_writer)
 8947
 8948    def set_gguf_parameters(self):
 8949        n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed"))
 8950        n_head = self.hparams.get("n_head", self.hparams.get("num_attention_heads"))
 8951        n_head_kv = self.hparams.get("multi_query_group_num", self.hparams.get("num_key_value_heads", n_head))
 8952        self.gguf_writer.add_context_length(self.hparams.get("seq_length", n_embed))
 8953        self.gguf_writer.add_embedding_length(n_embed)
 8954        self.gguf_writer.add_feed_forward_length(self.hparams.get("ffn_hidden_size", self.hparams.get("intermediate_size", 4 * n_embed)))
 8955        self.gguf_writer.add_block_count(self.block_count)
 8956        self.gguf_writer.add_head_count(n_head)
 8957        self.gguf_writer.add_head_count_kv(n_head_kv)
 8958        self.gguf_writer.add_layer_norm_rms_eps(self.hparams.get("layernorm_epsilon", 1e-5))
 8959        self.gguf_writer.add_file_type(self.ftype)
 8960        if "attention_dim" in self.hparams:
 8961            rope_dim = self.hparams["attention_dim"]
 8962        else:
 8963            rope_dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
 8964        self.gguf_writer.add_rope_dimension_count(int(rope_dim * self.hparams.get("partial_rotary_factor", 0.5)))
 8965        self.gguf_writer.add_add_bos_token(False)
 8966        rope_freq = 10000
 8967        if "rope_ratio" in self.hparams:
 8968            rope_freq = rope_freq * self.hparams["rope_ratio"]
 8969        self.gguf_writer.add_rope_freq_base(rope_freq)
 8970
 8971    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
 8972        if name.endswith(".rotary_pos_emb.inv_freq") or name.startswith("model.vision."):
 8973            return
 8974
 8975        name = name.removeprefix("transformer.")
 8976        yield from super().modify_tensors(data_torch, name, bid)
 8977
 8978
 8979@ModelBase.register("NemotronForCausalLM")
 8980class NemotronModel(TextModel):
 8981    model_arch = gguf.MODEL_ARCH.NEMOTRON
 8982
 8983    def set_vocab(self):
 8984        self._set_vocab_sentencepiece()
 8985        self.gguf_writer.add_pad_token_id(0)
 8986        self.gguf_writer.add_unk_token_id(1)
 8987
 8988    def set_gguf_parameters(self):
 8989        super().set_gguf_parameters()
 8990        hparams = self.hparams
 8991        self.gguf_writer.add_vocab_size(hparams["vocab_size"])
 8992
 8993        f_norm_eps = self.find_hparam(["layer_norm_eps", "layer_norm_epsilon", "norm_epsilon", "norm_eps"])
 8994        self.gguf_writer.add_layer_norm_eps(f_norm_eps)
 8995
 8996        # * Partial RoPE
 8997        rot_pct = self.find_hparam(["partial_rotary_factor", "rope_pct", "rope_percent"])
 8998        n_embd = self.find_hparam(["hidden_size", "n_embd"])
 8999        n_head = self.find_hparam(["num_attention_heads", "n_head"])
 9000        self.gguf_writer.add_rope_dimension_count(int(rot_pct * n_embd) // n_head)
 9001
 9002        # * RopeScaling for Nemotron
 9003        if "rope_scaling" not in self.hparams or self.hparams["rope_scaling"] is None:
 9004            self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
 9005        else:
 9006            self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
 9007            self.gguf_writer.add_rope_scaling_factor(self.hparams["factor"])
 9008
 9009    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
 9010        # * Adding +1 to LayerNorm's weights here to implement layernorm1p w/o changing anything on the GGML engine side
 9011        #   model.layers.{l}.input_layernorm.weight
 9012        #   model.layers.{l}.post_attention_layernorm.weight
 9013        #   model.norm.weight
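             # layernorm1p computes (1 + weight) * norm(x) + bias, so folding the +1 into the
             # stored weight lets the stock layer-norm kernel reproduce it unchanged.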
 9014        if name.endswith("norm.weight"):
 9015            data_torch = data_torch + 1
 9016
 9017        yield from super().modify_tensors(data_torch, name, bid)
 9018
 9019
 9020@ModelBase.register("ExaoneForCausalLM")
 9021class ExaoneModel(TextModel):
 9022    model_arch = gguf.MODEL_ARCH.EXAONE
 9023
 9024    def set_gguf_parameters(self):
 9025        super().set_gguf_parameters()
 9026        hparams = self.hparams
 9027
 9028        assert (hparams["activation_function"] == "silu")
 9029
 9030        rotary_factor = self.find_hparam(["partial_rotary_factor", "rope_pct"], optional=True)
 9031        rotary_factor = rotary_factor if rotary_factor is not None else 1.0
 9032        self.gguf_writer.add_rope_dimension_count(int(rotary_factor * (hparams["hidden_size"] // hparams["num_attention_heads"])))
 9033
 9034    def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
 9035        if rope_params := self.rope_parameters.get("full_attention", self.rope_parameters):
 9036            if rope_params.get("rope_type", '').lower() == "llama3":
 9037                base = self.rope_parameters.get("rope_theta", 10000.0)
 9038                if (dim := self.hparams.get("head_dim")) is None:
 9039                    dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
 9040                freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))
 9041
 9042                factor = rope_params.get("factor", 8.0)
 9043                low_freq_factor = rope_params.get("low_freq_factor", 1.0)
 9044                high_freq_factor = rope_params.get("high_freq_factor", 4.0)
 9045                old_context_len = self.hparams.get("original_max_position_embeddings", 8192)
 9046
 9047                low_freq_wavelen = old_context_len / low_freq_factor
 9048                high_freq_wavelen = old_context_len / high_freq_factor
 9049                assert low_freq_wavelen != high_freq_wavelen
 9050
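                     # Per-dimension llama3-style scaling: wavelengths below
                     # old_context_len / high_freq_factor keep a factor of 1, wavelengths above
                     # old_context_len / low_freq_factor get the full `factor`, and the band in
                     # between is smoothly interpolated.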
 9051                rope_factors = []
 9052                for freq in freqs:
 9053                    wavelen = 2 * math.pi / freq
 9054                    if wavelen < high_freq_wavelen:
 9055                        rope_factors.append(1)
 9056                    elif wavelen > low_freq_wavelen:
 9057                        rope_factors.append(factor)
 9058                    else:
 9059                        smooth = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor)
 9060                        rope_factors.append(1 / ((1 - smooth) / factor + smooth))
 9061
 9062                yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), torch.tensor(rope_factors, dtype=torch.float32))
 9063
 9064
 9065@ModelBase.register("Exaone4ForCausalLM")
 9066class Exaone4Model(TextModel):
 9067    model_arch = gguf.MODEL_ARCH.EXAONE4
 9068
 9069    def set_vocab(self):
 9070        tokens, toktypes, tokpre = self.get_vocab_base()
 9071        self.gguf_writer.add_tokenizer_model("gpt2")
 9072        self.gguf_writer.add_tokenizer_pre(tokpre)
 9073        self.gguf_writer.add_token_list(tokens)
 9074        self.gguf_writer.add_token_types(toktypes)
 9075
 9076        special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True)
 9077        special_vocab.add_to_gguf(self.gguf_writer)
 9078
 9079    def set_gguf_parameters(self):
 9080        super().set_gguf_parameters()
 9081        hparams = self.hparams
 9082        self.gguf_writer.add_vocab_size(hparams["vocab_size"])
 9083
 9084        if hparams.get("sliding_window") is not None:
 9085            self.gguf_writer.add_sliding_window(hparams["sliding_window"])
 9086            if "layer_types" in hparams:
 9087                self.gguf_writer.add_sliding_window_pattern([t == "sliding_attention" for t in hparams["layer_types"]])
 9088            elif "sliding_window_pattern" in hparams:
 9089                sliding_window_pattern = []
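                     # Two encodings are handled below: a string pattern repeats over the layers
                     # (an illustrative "LLLG" with 8 layers -> [T, T, T, F, T, T, T, F], "L"
                     # marking sliding attention), and an integer N makes every N-th layer full
                     # attention (N = 4 with 8 layers gives the same [T, T, T, F, T, T, T, F]).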
 9090                if isinstance(hparams["sliding_window_pattern"], str):  # e.g. LLLG
 9091                    for i in range(hparams["num_hidden_layers"]):
 9092                        sliding_window_pattern.append(hparams["sliding_window_pattern"][i % len(hparams["sliding_window_pattern"])] == "L")
 9093                if isinstance(hparams["sliding_window_pattern"], int):  # e.g. 4
 9094                    for i in range(hparams["num_hidden_layers"]):
 9095                        sliding_window_pattern.append((i + 1) % hparams["sliding_window_pattern"] != 0)
 9096                if len(sliding_window_pattern) == hparams["num_hidden_layers"]:
 9097                    self.gguf_writer.add_sliding_window_pattern(sliding_window_pattern)
 9098
 9099    def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
 9100        if rope_params := self.rope_parameters.get("full_attention", self.rope_parameters):
 9101            if rope_params.get("rope_type", '').lower() == "llama3":
 9102                base = rope_params.get("rope_theta", 10_000.0)
 9103                if (dim := self.hparams.get("head_dim")) is None:
 9104                    dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
 9105                freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))
 9106
 9107                factor = rope_params.get("factor", 16.0)
 9108                low_freq_factor = rope_params.get("low_freq_factor", 1.0)
 9109                high_freq_factor = rope_params.get("high_freq_factor", 4.0)
 9110                old_context_len = self.hparams.get("original_max_position_embeddings", 8192)
 9111
 9112                low_freq_wavelen = old_context_len / low_freq_factor
 9113                high_freq_wavelen = old_context_len / high_freq_factor
 9114
 9115                rope_factors = []
 9116                for freq in freqs:
 9117                    wavelen = 2 * math.pi / freq
 9118                    if wavelen < high_freq_wavelen:
 9119                        rope_factors.append(1)
 9120                    elif wavelen > low_freq_wavelen:
 9121                        rope_factors.append(factor)
 9122                    else:
 9123                        smooth = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor)
 9124                        rope_factors.append(1 / ((1 - smooth) / factor + smooth))
 9125
 9126                yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), torch.tensor(rope_factors, dtype=torch.float32))
 9127
 9128
 9129@ModelBase.register("ExaoneMoEForCausalLM")
 9130class ExaoneMoEModel(Exaone4Model):
 9131    model_arch = gguf.MODEL_ARCH.EXAONE_MOE
 9132
 9133    def __init__(self, *args, **kwargs):
 9134        super().__init__(*args, **kwargs)
 9135        self.block_count = self.hparams["num_hidden_layers"] + self.hparams.get("num_nextn_predict_layers", 0)
 9136        self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)
 9137
 9138    def set_gguf_parameters(self):
 9139        super().set_gguf_parameters()
 9140        self.gguf_writer.add_expert_count(self.hparams["num_experts"])
 9141        moe_intermediate_size = self.hparams["moe_intermediate_size"]
 9142        num_shared_experts = self.hparams["num_shared_experts"]
 9143        self.gguf_writer.add_expert_feed_forward_length(moe_intermediate_size)
 9144        self.gguf_writer.add_expert_shared_count(num_shared_experts)
 9145        self.gguf_writer.add_expert_shared_feed_forward_length(moe_intermediate_size * num_shared_experts)
 9146        self.gguf_writer.add_expert_weights_scale(self.hparams["routed_scaling_factor"])
 9147        self.gguf_writer.add_expert_weights_norm(self.hparams["norm_topk_prob"])
 9148        n_dense_layer = self.hparams.get("first_k_dense_replace", self.hparams.get("first_last_k_dense_replace", 0))
 9149        self.gguf_writer.add_leading_dense_block_count(n_dense_layer)
 9150        self.gguf_writer.add_nextn_predict_layers(self.hparams.get("num_nextn_predict_layers", 0))
 9151
 9152        self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
 9153
 9154    _experts: list[dict[str, Tensor]] | None = None
 9155
 9156    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
 9157        if name.startswith("mtp."):
 9158            if name.find("layers.") != -1:
 9159                # `mtp.layers.0.[module_name]` format
 9160                name = name.replace(f"mtp.layers.{bid}", f"model.layers.{bid + self.hparams['num_hidden_layers']}")
 9161            else:
 9162                # mtp fc/norm weights
 9163                remapper = {
 9164                    "mtp.fc": "model.layers.{bid}.eh_proj",
 9165                    "mtp.pre_fc_norm_embedding": "model.layers.{bid}.enorm",
 9166                    "mtp.pre_fc_norm_hidden": "model.layers.{bid}.hnorm",
 9167                    "mtp.norm": "model.layers.{bid}.shared_head.norm",
 9168                }
 9169                _n = Path(name)
 9170                new_name = remapper[_n.stem] + _n.suffix
 9171
 9172                # set shared weights for all NextN/MTP layers
 9173                for bid in range(self.hparams['num_hidden_layers'], self.block_count):
 9174                    yield from super().modify_tensors(data_torch, new_name.format(bid=bid), bid)
 9175                return
 9176
 9177        if name.endswith("e_score_correction_bias"):
 9178            name = name.replace("e_score_correction_bias", "e_score_correction.bias")
 9179
 9180        if name.find("mlp.experts") != -1:
 9181            n_experts = self.hparams["num_experts"]
 9182            assert bid is not None
 9183
 9184            if self._experts is None:
 9185                self._experts = [{} for _ in range(self.block_count)]
 9186
 9187            self._experts[bid][name] = data_torch
 9188
 9189            if len(self._experts[bid]) >= n_experts * 3:
 9190                # merge the experts into a single 3d tensor
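                     # (each expert's [n_out, n_in] matrix is stacked along dim 0, giving one
                     # [n_experts, n_out, n_in] tensor per projection)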
 9191                for w_name in ["down_proj", "gate_proj", "up_proj"]:
 9192                    datas: list[Tensor] = []
 9193
 9194                    for xid in range(n_experts):
 9195                        ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight"
 9196                        datas.append(self._experts[bid][ename])
 9197                        del self._experts[bid][ename]
 9198
 9199                    data_torch = torch.stack(datas, dim=0)
 9200
 9201                    merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight"
 9202
 9203                    new_name = self.map_tensor_name(merged_name)
 9204
 9205                    yield from super().modify_tensors(data_torch, new_name, bid)
 9206                return
 9207            else:
 9208                return
 9209
 9210        yield from super().modify_tensors(data_torch, name, bid)
 9211
 9212    def prepare_tensors(self):
 9213        super().prepare_tensors()
 9214        if self._experts is not None:
 9215            # flatten `list[dict[str, Tensor]]` into `list[str]`
 9216            experts = [k for d in self._experts for k in d.keys()]
 9217            if len(experts) > 0:
 9218                raise ValueError(f"Unprocessed experts: {experts}")
 9219
 9220
 9221@ModelBase.register("GraniteForCausalLM")
 9222class GraniteModel(LlamaModel):
 9223    """Conversion for IBM's GraniteForCausalLM"""
 9224    model_arch = gguf.MODEL_ARCH.GRANITE
 9225
 9226    def set_gguf_parameters(self):
 9227        """Granite uses standard llama parameters with the following differences:
 9228
 9229        - No head_dim support
 9230        - New multiplier params:
 9231            - attention_scale
 9232            - embedding_scale
 9233            - residual_scale
 9234        - logits_scaling
 9235        """
 9236        if head_dim := self.hparams.pop("head_dim", None):
 9237            logger.warning("Ignoring head_dim (%s) from config for Granite", head_dim)
 9238        super().set_gguf_parameters()
 9239        # NOTE: Convert _multiplier params to _scale params for naming
 9240        #   consistency
 9241        if attention_scale := self.hparams.get("attention_multiplier"):
 9242            self.gguf_writer.add_attention_scale(attention_scale)
 9243            logger.info("gguf: (granite) attention_scale = %s", attention_scale)
 9244        if embedding_scale := self.hparams.get("embedding_multiplier"):
 9245            self.gguf_writer.add_embedding_scale(embedding_scale)
 9246            logger.info("gguf: (granite) embedding_scale = %s", embedding_scale)
 9247        if residual_scale := self.hparams.get("residual_multiplier"):
 9248            self.gguf_writer.add_residual_scale(residual_scale)
 9249            logger.info("gguf: (granite) residual_scale = %s", residual_scale)
 9250        if logits_scale := self.hparams.get("logits_scaling"):
 9251            self.gguf_writer.add_logit_scale(logits_scale)
 9252            logger.info("gguf: (granite) logits_scale = %s", logits_scale)
 9253
 9254
 9255@ModelBase.register("GraniteMoeForCausalLM", "GraniteMoeSharedForCausalLM")
 9256class GraniteMoeModel(GraniteModel):
 9257    """Conversion for IBM's GraniteMoeForCausalLM"""
 9258    model_arch = gguf.MODEL_ARCH.GRANITE_MOE
 9259
 9260    def set_gguf_parameters(self):
 9261        """GraniteMoeShared uses GraniteMoe parameters plus the following:
 9262        - shared_intermediate_size
 9263        """
 9264        super().set_gguf_parameters()
 9265        if shared_feed_forward_length := self.hparams.get("shared_intermediate_size"):
 9266            self.gguf_writer.add_expert_shared_feed_forward_length(shared_feed_forward_length)
 9267            logger.info("gguf: (granitemoeshared) shared_feed_forward_length = %s", shared_feed_forward_length)
 9268
 9269    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
 9270        """In modeling_granitemoe, the JetMoe implementation of parallel experts
 9271        is used. This essentially merges w1 and w3 into a single tensor with 2x
 9272        the hidden size that is then split during forward. To keep compatibility
 9273        with existing mixtral support, we pull them apart here.
 9274        """
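             # Shape sketch: input_linear.weight ends in [2 * intermediate_size, hidden_size]
             # (with a leading experts dimension); split(ffn_dim, dim=-2) takes the first
             # ffn_dim rows as the gate (w1) projection and the remaining rows as up (w3).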
 9275
 9276        if name.endswith("block_sparse_moe.input_linear.weight"):
 9277            ffn_dim = self.hparams["intermediate_size"]
 9278            assert data_torch.shape[-2] == 2 * ffn_dim, "Merged FFN tensor size must be 2 * intermediate_size"
 9279            gate, up = data_torch.split(ffn_dim, dim=-2)
 9280            yield from ModelBase.modify_tensors(self, gate, self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE_EXP, bid), bid)
 9281            yield from ModelBase.modify_tensors(self, up, self.format_tensor_name(gguf.MODEL_TENSOR.FFN_UP_EXP, bid), bid)
 9282            return
 9283
 9284        has_experts = bool(self.hparams.get('num_local_experts'))
 9285
 9286        if name.endswith("shared_mlp.input_linear.weight"):
 9287            ffn_dim = self.hparams["shared_intermediate_size"]
 9288            assert data_torch.shape[-2] == 2 * ffn_dim, "Merged FFN tensor size must be 2 * shared_intermediate_size"
 9289            gate, up = data_torch.split(ffn_dim, dim=-2)
 9290            if has_experts:
 9291                yield from ModelBase.modify_tensors(self, gate, self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE_SHEXP, bid), bid)
 9292                yield from ModelBase.modify_tensors(self, up, self.format_tensor_name(gguf.MODEL_TENSOR.FFN_UP_SHEXP, bid), bid)
 9293                return
 9294            yield from ModelBase.modify_tensors(self, gate, self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE, bid), bid)
 9295            yield from ModelBase.modify_tensors(self, up, self.format_tensor_name(gguf.MODEL_TENSOR.FFN_UP, bid), bid)
 9296            return
 9297
 9298        if not has_experts and name.endswith("shared_mlp.output_linear.weight"):
 9299            yield from ModelBase.modify_tensors(self, data_torch, self.format_tensor_name(gguf.MODEL_TENSOR.FFN_DOWN, bid), bid)
 9300            return
 9301
 9302        yield from super().modify_tensors(data_torch, name, bid)
 9303
 9304
 9305@ModelBase.register("GraniteMoeHybridForCausalLM", "BambaForCausalLM")
 9306class GraniteHybridModel(Mamba2Model, GraniteMoeModel):
 9307    """GraniteHybrid is a hybrid SSM + Attention model that uses Mamba2 SSM
 9308    layers and optionally uses MoE w/ a shared expert"""
 9309    model_arch = gguf.MODEL_ARCH.GRANITE_HYBRID
 9310    undo_permute = True
 9311
 9312    def __init__(self, *args, **kwargs):
 9313
 9314        # Hybrid mamba models use a prefix for the mamba-specific params.
 9315        # TODO: Extend this if the prefix(es) need to be configurable
 9316        self.hparam_prefixes = ["mamba"]
 9317
 9318        super().__init__(*args, **kwargs)
 9319
 9320        # Lists of which layers use ssm vs attention
 9321        self._attn_layers = self.get_attn_layers()
 9322        self._ssm_layers = [
 9323            i for i in range(self.block_count)
 9324            if i not in self._attn_layers
 9325        ]
 9326
 9327        # There are some models in this family that are non-hybrid, but keep the
 9328        # same parent class by setting all layers to "attention." If this is the
 9329        # case, the model architecture needs to be updated to a standard
 9330        # "granite" or "granitemoe" model
 9331        if not self._ssm_layers:
 9332            has_experts = self.find_hparam(["num_experts_per_tok"], optional=True)
 9333            new_arch = (
 9334                gguf.MODEL_ARCH.GRANITE_MOE
 9335                if has_experts else
 9336                gguf.MODEL_ARCH.GRANITE
 9337            )
 9338            self.model_arch = new_arch
 9339            self.gguf_writer.arch = gguf.MODEL_ARCH_NAMES[new_arch]
 9340            self.gguf_writer.add_architecture()
 9341
 9342        # n_group and d_inner are used during reshape_tensors for mamba2
 9343        # NOTE: Explicitly include the hparam prefix for d_model to
 9344        #   disambiguate with top-level head_dim
 9345        # NOTE 2: If needed for future models, this can be isolated in a method
 9346        #   to separate the prefix setting and the keys used
 9347        self.d_model = self.find_hparam([f"{self.hparam_prefixes[0]}_head_dim", "hidden_size", "d_model"])
 9348        self.n_group = self.find_hparam(["n_groups", "num_groups"])
 9349        self.d_inner = self.find_hparam(["expand", "num_heads"]) * self.d_model
 9350
 9351    def get_attn_layers(self):
 9352        # Explicit list of layer type names
 9353        if layer_types := self.hparams.get("layer_types"):
 9354            return [
 9355                i for i, typ in enumerate(layer_types)
 9356                if typ == "attention"
 9357            ]
 9358
 9359        # Layer types indicated by index or period
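             # (illustrative: attn_layer_period = 8 with attn_layer_offset = 7 would mark
             #  layers 7, 15, 23, ... as attention layers)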
 9360        attn_layers = self.hparams.get("attn_layer_indices", [])
 9361        if not attn_layers:
 9362            attn_period = self.hparams.get("attn_layer_period")
 9363            assert attn_period, "Didn't find attn_layer_indices or attn_layer_period"
 9364            attn_offset = self.hparams.get("attn_layer_offset")
 9365            assert attn_offset is not None, "No attention layer offset set with attn_layer_period"
 9366            attn_layers = [
 9367                i for i in range(self.block_count)
 9368                if i % attn_period == attn_offset
 9369            ]
 9370        return attn_layers
 9371
 9372    def find_hparam(self, keys: Iterable[str], *args, **kwargs) -> Any:
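             # Search both the plain keys and their prefixed variants, plain keys first:
             # e.g. ["head_dim"] with the "mamba" prefix looks up ["head_dim", "mamba_head_dim"].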
 9373        prefixed = []
 9374        for pfx in self.hparam_prefixes:
 9375            prefixed.extend(
 9376                "_".join([pfx, k])
 9377                for k in keys
 9378            )
 9379        keys = list(keys) + prefixed
 9380        return Mamba2Model.find_hparam(self, keys, *args, **kwargs)
 9381
 9382    def modify_tensors(
 9383        self, data_torch: Tensor, name: str, bid: int | None
 9384    ) -> Iterable[tuple[str, Tensor]]:
 9385        if (
 9386            name.endswith("block_sparse_moe.input_linear.weight")
 9387            or "shared_mlp" in name
 9388        ):
 9389            yield from GraniteMoeModel.modify_tensors(self, data_torch, name, bid)
 9390            return
 9391
 9392        # Determine whether this is a mamba layer or an attention layer
 9393        if bid in self._ssm_layers:
 9394            yield from Mamba2Model.modify_tensors(self, data_torch, name, bid)
 9395            return
 9396        elif bid in self._attn_layers:
 9397            yield from GraniteMoeModel.modify_tensors(self, data_torch, name, bid)
 9398            return
 9399        yield from ModelBase.modify_tensors(self, data_torch, name, bid)
 9400
 9401    def set_gguf_parameters(self):
 9402        """This method merges params from both parents and some that are
 9403        specific to this model. The result is some duplication of how the params
 9404        get set. The following warnings are expected during conversion:
 9405
 9406        WARNING:Duplicated key name 'granitehybrid.attention.head_count_kv'
 9407        WARNING:Duplicated key name 'granitehybrid.context_length'
 9408        """
 9409        GraniteMoeModel.set_gguf_parameters(self)
 9410
 9411        ## Mamba mixer params ##
 9412        self.gguf_writer.add_ssm_conv_kernel(self.find_hparam(["conv_kernel", "d_conv"]))
 9413        self.gguf_writer.add_ssm_state_size(self.find_hparam(["state_size", "d_state", "state_dim", "ssm_state_size"]))
 9414        self.gguf_writer.add_ssm_group_count(self.n_group)
 9415        self.gguf_writer.add_ssm_inner_size(self.d_inner)
 9416        # NOTE: The mamba_dt_rank is _not_ the right field for how this is used
 9417        #   in llama.cpp
 9418        self.gguf_writer.add_ssm_time_step_rank(self.find_hparam(["n_heads", "num_heads"]))
 9419
 9420        ## Attention params ##
 9421        head_count_kv = self.find_hparam(["num_key_value_heads", "n_head_kv"])
 9422        head_count_kv_vec = [
 9423            head_count_kv if i in self._attn_layers else 0 for i in range(self.block_count)
 9424        ]
 9425        if rope_dim := self.hparams.get("attn_rotary_emb"):
 9426            self.gguf_writer.add_rope_dimension_count(rope_dim)
 9427        self.gguf_writer.add_head_count_kv(head_count_kv_vec)
 9428
 9429        ## If Bamba or non-hybrid, use rope, otherwise don't
 9430        use_rope = (
 9431            "BambaForCausalLM" in self.hparams["architectures"]
 9432            or not self._ssm_layers
 9433        )
 9434        self.gguf_writer.add_rope_scaling_finetuned(use_rope)
 9435        if not use_rope:
 9436            self.gguf_writer.add_context_length(2**20)
 9437
 9438        ## Validation ##
 9439        d_head = self.find_hparam(["d_head"], optional=True) or 64
 9440        assert self.hparams.get("hidden_act") in [None, "silu"], "Only SILU activation supported"
 9441        assert self.d_inner % d_head == 0, f"SSM inner size {self.d_inner} not a multiple of head dim {d_head}"
 9442
 9443    def set_vocab(self):
 9444        self.hparams["pad_vocab_size_multiple"] = 8
 9445        Mamba2Model.set_vocab(self)
 9446
 9447
 9448@ModelBase.register("NemotronHForCausalLM")
 9449class NemotronHModel(GraniteHybridModel):
 9450    """Hybrid mamba2/attention model from NVIDIA"""
 9451    model_arch = gguf.MODEL_ARCH.NEMOTRON_H
 9452    is_moe: bool = False
 9453
 9454    def __init__(self, *args, **kwargs):
 9455        # We have to determine the correct model architecture (MoE vs non-MoE) before
 9456        # calling the parent __init__. This is because the parent constructor
 9457        # uses self.model_arch to build the tensor name map, and all MoE-specific
 9458        # mappings would be missed if it were called with the default non-MoE arch.
 9459        hparams = ModelBase.load_hparams(args[0], self.is_mistral_format)
 9460        if "num_experts_per_tok" in hparams:
 9461            self.model_arch = gguf.MODEL_ARCH.NEMOTRON_H_MOE
 9462            self.is_moe = True
 9463
 9464        super().__init__(*args, **kwargs)
 9465
 9466        # Save the top-level head_dim for later
 9467        self.head_dim = self.hparams.get("head_dim", self.hparams.get("attention_head_dim"))
 9468        assert self.head_dim is not None, "Could not find the attention head dim in config"
 9469
 9470        # Don't use expand to calculate d_inner
 9471        self.d_inner = self.find_hparam(["num_heads"]) * self.d_model
 9472
 9473        # Update the ssm / attn / mlp layers
 9474        # Non-MoE: M: Mamba2, *: Attention, -: MLP
 9475        # MoE:     M: Mamba2, *: Attention, E: Expert
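             # (illustrative: a pattern "M*M-", or "M*ME" for MoE, maps layers 0 and 2 to
             #  Mamba2, layer 1 to attention, and layer 3 to an MLP resp. expert block)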
 9477        hybrid_override_pattern = self.hparams["hybrid_override_pattern"]
 9478        self._ssm_layers = [i for i, val in enumerate(hybrid_override_pattern) if val == "M"]
 9479        self._mlp_layers = [i for i, val in enumerate(hybrid_override_pattern) if val == ("E" if self.is_moe else "-")]
 9480
 9481    def get_attn_layers(self):
 9482        hybrid_override_pattern = self.hparams["hybrid_override_pattern"]
 9483        assert len(hybrid_override_pattern) == self.block_count, "Mismatch between hybrid override and num_hidden_layers!"
 9484        return [i for i, val in enumerate(hybrid_override_pattern) if val == "*"]
 9485
 9486    def set_gguf_parameters(self):
 9487        super().set_gguf_parameters()
 9488
 9489        self.gguf_writer.add_key_length(self.head_dim)
 9490        self.gguf_writer.add_value_length(self.head_dim)
 9491
 9492        # Set feed_forward_length
 9493        # NOTE: This will trigger an override warning. This is preferable to
 9494        #   duplicating all the parent logic
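             # The value written below is a per-layer list: MLP (or expert) layers get their
             # intermediate size, Mamba and attention layers get 0.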
 9495        if not self.is_moe:
 9496            n_ff = self.find_hparam(["intermediate_size", "n_inner", "hidden_dim"])
 9497            self.gguf_writer.add_feed_forward_length([
 9498                n_ff if i in self._mlp_layers else 0 for i in range(self.block_count)
 9499            ])
 9500        else:
 9501            moe_intermediate_size = self.hparams["moe_intermediate_size"]
 9502            self.gguf_writer.add_feed_forward_length([
 9503                moe_intermediate_size if i in self._mlp_layers else 0 for i in range(self.block_count)
 9504            ])
 9505            self.gguf_writer.add_expert_used_count(self.hparams["num_experts_per_tok"])
 9506            self.gguf_writer.add_expert_feed_forward_length(self.hparams["moe_intermediate_size"])
 9507            self.gguf_writer.add_expert_shared_feed_forward_length(self.hparams["moe_shared_expert_intermediate_size"])
 9508            self.gguf_writer.add_expert_count(self.hparams["n_routed_experts"])
 9509            self.gguf_writer.add_expert_shared_count(self.hparams["n_shared_experts"])
 9510            self.gguf_writer.add_expert_weights_norm(self.hparams["norm_topk_prob"])
 9511            self.gguf_writer.add_expert_weights_scale(self.hparams["routed_scaling_factor"])
 9512            self.gguf_writer.add_expert_group_count(self.hparams["n_group"])
 9513
 9514            # number of experts used per token (top-k)
 9515            if (n_experts_used := self.hparams.get("num_experts_per_tok")) is not None:
 9516                self.gguf_writer.add_expert_used_count(n_experts_used)
 9517
 9518    def set_vocab(self):
 9519        super().set_vocab()
 9520
 9521        # The tokenizer _does_ add a BOS token (via post_processor type
 9522        # TemplateProcessing) but does not set add_bos_token to true in the
 9523        # config, so we need to explicitly override it here.
 9524        if not self.is_moe:
 9525            self.gguf_writer.add_add_bos_token(True)
 9526
 9527    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
 9528        if self.is_moe and bid is not None:
 9529            if name.endswith("mixer.gate.e_score_correction_bias"):
 9530                new_name = name.replace("e_score_correction_bias", "e_score_correction.bias")
 9531                yield from ModelBase.modify_tensors(self, data_torch, new_name, bid)
 9532                return
 9533
 9534            if name.endswith("mixer.dt_bias"):
 9535                new_name = name.replace("dt_bias", "dt.bias")
 9536                yield from ModelBase.modify_tensors(self, data_torch, new_name, bid)
 9537                return
 9538
 9539            if name.endswith("mixer.conv1d.weight"):
 9540                squeezed_data = data_torch.squeeze()
 9541                yield from ModelBase.modify_tensors(self, squeezed_data, name, bid)
 9542                return
 9543
 9544            if name.endswith("mixer.A_log"):
 9545                transformed_data = -torch.exp(data_torch)
 9546                reshaped_data = transformed_data.squeeze().reshape(-1, 1)
 9547                yield from ModelBase.modify_tensors(self, reshaped_data, name, bid)
 9548                return
 9549
 9550            if name.endswith("mixer.D"):
 9551                reshaped_data = data_torch.squeeze().reshape(-1, 1)
 9552                yield from ModelBase.modify_tensors(self, reshaped_data, name, bid)
 9553                return
 9554
 9555            if name.endswith("mixer.norm.weight"):
 9556                reshaped_data = data_torch.reshape(self.n_group, -1)
 9557                yield from ModelBase.modify_tensors(self, reshaped_data, name, bid)
 9558                return
 9559
 9560            if name.find("mixer.experts") != -1:
 9561                n_experts = self.hparams["n_routed_experts"]
 9562                assert bid is not None
 9563
 9564                if self._experts is None:
 9565                    self._experts = [{} for _ in range(self.block_count)]
 9566
 9567                self._experts[bid][name] = data_torch
 9568
 9569                if len(self._experts[bid]) >= n_experts * 2:
 9570                    # merge the experts into a single tensor
 9571                    for w_name in ["down_proj", "up_proj"]:
 9572                        datas: list[Tensor] = []
 9573
 9574                        for xid in range(n_experts):
 9575                            ename = f"backbone.layers.{bid}.mixer.experts.{xid}.{w_name}.weight"
 9576                            datas.append(self._experts[bid][ename])
 9577                            del self._experts[bid][ename]
 9578
 9579                        data_torch = torch.stack(datas, dim=0)
 9580                        merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight"
 9581
 9582                        yield from ModelBase.modify_tensors(self, data_torch, merged_name, bid)
 9583                    return
 9584                else:
 9585                    return
 9586
 9587        yield from super().modify_tensors(data_torch, name, bid)
 9588
 9589    def prepare_tensors(self):
 9590        super().prepare_tensors()
 9591
 9592        if self._experts is not None:
 9593            # flatten `list[dict[str, Tensor]]` into `list[str]`
 9594            experts = [k for d in self._experts for k in d.keys()]
 9595            if len(experts) > 0:
 9596                raise ValueError(f"Unprocessed experts: {experts}")
 9597
 9598
 9599@ModelBase.register("LlamaBidirectionalModel")
 9600class LlamaEmbedNemotronModel(LlamaModel):
 9601    model_arch = gguf.MODEL_ARCH.LLAMA_EMBED
 9602
 9603
 9604@ModelBase.register("BailingMoeForCausalLM")
 9605class BailingMoeModel(TextModel):
 9606    model_arch = gguf.MODEL_ARCH.BAILINGMOE
 9607
 9608    def set_vocab(self):
 9609        self._set_vocab_gpt2()
 9610
 9611    def set_gguf_parameters(self):
 9612        super().set_gguf_parameters()
 9613        hparams = self.hparams
 9614        if (rope_dim := hparams.get("head_dim")) is None:
 9615            rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]
 9616
 9617        self.gguf_writer.add_rope_dimension_count(rope_dim)
 9618        self.gguf_writer.add_leading_dense_block_count(hparams["first_k_dense_replace"])
 9619        self.gguf_writer.add_vocab_size(hparams["vocab_size"])
 9620        self.gguf_writer.add_expert_feed_forward_length(hparams["moe_intermediate_size"])
 9621        self.gguf_writer.add_expert_weights_scale(1.0)
 9622        self.gguf_writer.add_expert_count(hparams["num_experts"])
 9623        self.gguf_writer.add_expert_shared_count(hparams["num_shared_experts"])
 9624        self.gguf_writer.add_expert_weights_norm(hparams["norm_topk_prob"])
 9625
 9626    _experts: list[dict[str, Tensor]] | None = None
 9627
 9628    @staticmethod
 9629    def permute(weights: Tensor, n_head: int, n_head_kv: int | None):
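             # Reshape to (n_head, 2, head_dim/2, ...), swap the middle axes and flatten back,
             # interleaving the two rotary halves of each head (the same HF -> GGUF
             # re-ordering as LlamaModel.permute).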
 9630        if n_head_kv is not None and n_head != n_head_kv:
 9631            n_head = n_head_kv
 9632        return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
 9633                .swapaxes(1, 2)
 9634                .reshape(weights.shape))
 9635
 9636    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
 9637        n_head = self.hparams["num_attention_heads"]
 9638        n_kv_head = self.hparams.get("num_key_value_heads")
 9639        n_embd = self.hparams["hidden_size"]
 9640        if (head_dim := self.hparams.get("head_dim")) is None:
 9641            head_dim = n_embd // n_head
 9642
 9643        output_name = self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT)
 9644
 9645        if name.endswith("attention.dense.weight"):
 9646            yield from super().modify_tensors(data_torch, self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_OUT, bid), bid)
 9647            return
 9648        elif name.endswith("query_key_value.weight"):
 9649            q, k, v = data_torch.split([n_head * head_dim, n_kv_head * head_dim, n_kv_head * head_dim], dim=-2)
 9650
 9651            yield from super().modify_tensors(BailingMoeModel.permute(q, n_head, n_head), self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_Q, bid), bid)
 9652            yield from super().modify_tensors(BailingMoeModel.permute(k, n_head, n_kv_head), self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_K, bid), bid)
 9653            yield from super().modify_tensors(v, self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_V, bid), bid)
 9654            return
 9655        elif name.find("mlp.experts") != -1:
 9656            n_experts = self.hparams["num_experts"]
 9657            assert bid is not None
 9658
 9659            if self._experts is None:
 9660                self._experts = [{} for _ in range(self.block_count)]
 9661
 9662            self._experts[bid][name] = data_torch
 9663
 9664            if len(self._experts[bid]) >= n_experts * 3:
 9665                # merge the experts into a single 3d tensor
 9666                for w_name in ["down_proj", "gate_proj", "up_proj"]:
 9667                    datas: list[Tensor] = []
 9668
 9669                    for xid in range(n_experts):
 9670                        ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight"
 9671                        datas.append(self._experts[bid][ename])
 9672                        del self._experts[bid][ename]
 9673
 9674                    data_torch = torch.stack(datas, dim=0)
 9675
 9676                    merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight"
 9677
 9678                    new_name = self.map_tensor_name(merged_name)
 9679
 9680                    yield from super().modify_tensors(data_torch, new_name, bid)
 9681
 9682            return
 9683
 9684        new_name = self.map_tensor_name(name)
 9685
 9686        if new_name == output_name and self.hparams.get("norm_head"):
 9687            data_torch = data_torch.float()
 9688            data_torch /= torch.norm(data_torch, p=2, dim=0, keepdim=True) + 1e-7
 9689
 9690        yield from super().modify_tensors(data_torch, new_name, bid)
 9691
 9692    def prepare_tensors(self):
 9693        super().prepare_tensors()
 9694
 9695        if self._experts is not None:
 9696            # flatten `list[dict[str, Tensor]]` into `list[str]`
 9697            experts = [k for d in self._experts for k in d.keys()]
 9698            if len(experts) > 0:
 9699                raise ValueError(f"Unprocessed experts: {experts}")
 9700
 9701
 9702@ModelBase.register("BailingMoeV2ForCausalLM")
 9703class BailingMoeV2Model(TextModel):
 9704    model_arch = gguf.MODEL_ARCH.BAILINGMOE2
 9705
 9706    def __init__(self, *args, **kwargs):
 9707        super().__init__(*args, **kwargs)
 9708        if nextn_layers := self.hparams.get("num_nextn_predict_layers", 0):
 9709            self.block_count = self.hparams["num_hidden_layers"] + nextn_layers
 9710            self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)
 9711
 9712    def set_vocab(self):
 9713        self._set_vocab_gpt2()
 9714
 9715    def set_gguf_parameters(self):
 9716        super().set_gguf_parameters()
 9717        hparams = self.hparams
 9718        if (rope_dim := hparams.get("head_dim")) is None:
 9719            rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]
 9720
 9721        self.gguf_writer.add_rope_dimension_count(int(rope_dim * self.hparams.get("partial_rotary_factor", 0.5)))
 9722        self.gguf_writer.add_leading_dense_block_count(hparams["first_k_dense_replace"])
 9723        self.gguf_writer.add_vocab_size(hparams["vocab_size"])
 9724        self.gguf_writer.add_expert_feed_forward_length(hparams["moe_intermediate_size"])
 9725        self.gguf_writer.add_expert_shared_feed_forward_length(hparams.get("moe_shared_expert_intermediate_size", hparams["moe_intermediate_size"] * hparams["num_shared_experts"]))
 9726        self.gguf_writer.add_expert_weights_scale(hparams["routed_scaling_factor"])
 9727        self.gguf_writer.add_expert_count(hparams["num_experts"])
 9728        self.gguf_writer.add_expert_shared_count(hparams["num_shared_experts"])
 9729        self.gguf_writer.add_expert_weights_norm(hparams["norm_topk_prob"])
 9730
 9731        if (nextn_layers := self.hparams.get("num_nextn_predict_layers")) is not None:
 9732            self.gguf_writer.add_nextn_predict_layers(nextn_layers)
 9733
 9734    _experts: list[dict[str, Tensor]] | None = None
 9735
 9736    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
 9737        if "mlp.experts" in name:
 9738            n_experts = self.hparams["num_experts"]
 9739            assert bid is not None
 9740
 9741            if self._experts is None:
 9742                self._experts = [{} for _ in range(self.block_count)]
 9743
 9744            self._experts[bid][name] = data_torch
 9745
 9746            if len(self._experts[bid]) >= n_experts * 3:
 9747                # merge the experts into a single 3d tensor
 9748                for w_name in ["down_proj", "gate_proj", "up_proj"]:
 9749                    datas: list[Tensor] = []
 9750
 9751                    for xid in range(n_experts):
 9752                        ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight"
 9753                        datas.append(self._experts[bid][ename])
 9754                        del self._experts[bid][ename]
 9755
 9756                    data_torch = torch.stack(datas, dim=0)
 9757
 9758                    merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight"
 9759
 9760                    yield from super().modify_tensors(data_torch, merged_name, bid)
 9761            return
 9762
 9763        if name.endswith(".expert_bias"):
 9764            name = name.replace(".expert_bias", ".expert_bias.bias")
 9765
 9766        yield from super().modify_tensors(data_torch, name, bid)
 9767
 9768    def prepare_tensors(self):
 9769        super().prepare_tensors()
 9770
 9771        if self._experts is not None:
 9772            # flatten `list[dict[str, Tensor]]` into `list[str]`
 9773            experts = [k for d in self._experts for k in d.keys()]
 9774            if len(experts) > 0:
 9775                raise ValueError(f"Unprocessed experts: {experts}")
 9776
 9777
 9778@ModelBase.register("GroveMoeForCausalLM", "modeling_grove_moe.GroveMoeForCausalLM")
 9779class GroveMoeModel(TextModel):
 9780    model_arch = gguf.MODEL_ARCH.GROVEMOE
 9781
 9782    def set_gguf_parameters(self):
 9783        super().set_gguf_parameters()
 9784        if (n_experts := self.hparams.get("num_experts")) is not None:
 9785            self.gguf_writer.add_expert_count(n_experts)
 9786        if (moe_intermediate_size := self.hparams.get("moe_intermediate_size")) is not None:
 9787            self.gguf_writer.add_expert_feed_forward_length(moe_intermediate_size)
 9788            logger.info(f"gguf: expert feed forward length = {moe_intermediate_size}")
 9789        # FIXME?: Hardcoded https://huggingface.co/inclusionAI/GroveMoE-Inst/blob/c4c69e5970d18907b5e6ddccdfd55176fe292df1/modeling_grove_moe.py#L299
 9790        self.gguf_writer.add_expert_chunk_feed_forward_length(self.hparams.get("head_dim") or 128)
 9791        # FIXME?: Hardcoded https://huggingface.co/inclusionAI/GroveMoE-Inst/blob/c4c69e5970d18907b5e6ddccdfd55176fe292df1/modeling_grove_moe.py#L298
 9792        self.gguf_writer.add_experts_per_group(2)
 9793        # FIXME?: Hardcoded https://huggingface.co/inclusionAI/GroveMoE-Inst/blob/c4c69e5970d18907b5e6ddccdfd55176fe292df1/modeling_grove_moe.py#L376
 9794        self.gguf_writer.add_expert_group_scale(0.05)
 9795
 9796    _experts: list[dict[str, Tensor]] | None = None
 9797    _chunk_experts: list[dict[str, Tensor]] | None = None
 9798
 9799    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
 9800        if name.endswith(".expert_bias"):
 9801            # FIXME?: Unused https://huggingface.co/inclusionAI/GroveMoE-Inst/blob/c4c69e5970d18907b5e6ddccdfd55176fe292df1/modeling_grove_moe.py#L303
 9802            return
 9803
 9804        # process the experts separately
 9805        if name.find("chunk_experts") != -1:
 9806            n_experts = self.hparams["num_experts"] // 2 # see add_experts_per_group
 9807            assert bid is not None
 9808
 9809            if self._chunk_experts is None:
 9810                self._chunk_experts = [{} for _ in range(self.block_count)]
 9811
 9812            self._chunk_experts[bid][name] = data_torch
 9813
 9814            if len(self._chunk_experts[bid]) >= n_experts * 3:
 9815                # merge the experts into a single 3d tensor
 9816                for w_name in ["down_proj", "gate_proj", "up_proj"]:
 9817                    datas: list[Tensor] = []
 9818
 9819                    for xid in range(n_experts):
 9820                        ename = f"model.layers.{bid}.mlp.chunk_experts.{xid}.{w_name}.weight"
 9821                        datas.append(self._chunk_experts[bid][ename])
 9822                        del self._chunk_experts[bid][ename]
 9823
 9824                    data_torch = torch.stack(datas, dim=0)
 9825
 9826                    merged_name = f"model.layers.{bid}.mlp.chunk_experts.{w_name}.weight"
 9827
 9828                    yield from super().modify_tensors(data_torch, merged_name, bid)
 9829                return
 9830            else:
 9831                return
 9832        elif name.find("experts") != -1:
 9833            n_experts = self.hparams["num_experts"]
 9834            assert bid is not None
 9835
 9836            if self._experts is None:
 9837                self._experts = [{} for _ in range(self.block_count)]
 9838
 9839            self._experts[bid][name] = data_torch
 9840
 9841            if len(self._experts[bid]) >= n_experts * 3:
 9842                # merge the experts into a single 3d tensor
 9843                for w_name in ["down_proj", "gate_proj", "up_proj"]:
 9844                    datas: list[Tensor] = []
 9845
 9846                    for xid in range(n_experts):
 9847                        ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight"
 9848                        datas.append(self._experts[bid][ename])
 9849                        del self._experts[bid][ename]
 9850
 9851                    data_torch = torch.stack(datas, dim=0)
 9852
 9853                    merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight"
 9854
 9855                    yield from super().modify_tensors(data_torch, merged_name, bid)
 9856                return
 9857            else:
 9858                return
 9859
 9860        yield from super().modify_tensors(data_torch, name, bid)
 9861
 9862    def prepare_tensors(self):
 9863        super().prepare_tensors()
 9864
 9865        if self._chunk_experts is not None:
 9866            # flatten `list[dict[str, Tensor]]` into `list[str]`
 9867            chunk_experts = [k for d in self._chunk_experts for k in d.keys()]
 9868            if len(chunk_experts) > 0:
 9869                raise ValueError(f"Unprocessed adjugate experts: {chunk_experts}")
 9870
 9871        if self._experts is not None:
 9872            # flatten `list[dict[str, Tensor]]` into `list[str]`
 9873            experts = [k for d in self._experts for k in d.keys()]
 9874            if len(experts) > 0:
 9875                raise ValueError(f"Unprocessed experts: {experts}")
 9876
 9877
 9878@ModelBase.register("ChameleonForConditionalGeneration")
 9879@ModelBase.register("ChameleonForCausalLM")  # obsolete
 9880class ChameleonModel(TextModel):
 9881    model_arch = gguf.MODEL_ARCH.CHAMELEON
 9882
 9883    def set_gguf_parameters(self):
 9884        super().set_gguf_parameters()
 9885        self.gguf_writer.add_swin_norm(self.hparams.get("swin_norm", False))
 9886
 9887    def set_vocab(self):
 9888        self._set_vocab_gpt2()
 9889
 9890    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
 9891        # ignore image tokenizer for now
 9892        # TODO: remove this once image support is implemented for Chameleon
 9893        if name.startswith("model.vqmodel"):
 9894            return
 9895
 9896        n_head = self.hparams["num_attention_heads"]
 9897        n_kv_head = self.hparams.get("num_key_value_heads")
 9898        hidden_dim = self.hparams.get("hidden_size")
 9899
 9900        if name.endswith(("q_proj.weight", "q_proj.bias")):
 9901            data_torch = LlamaModel.permute(data_torch, n_head, n_head)
 9902        if name.endswith(("k_proj.weight", "k_proj.bias")):
 9903            data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head)
 9904        if name.endswith(("q_norm.weight", "q_norm.bias")):
 9905            data_torch = ChameleonModel._reverse_hf_permute(data_torch, n_head, hidden_dim)
 9906        if name.endswith(("k_norm.weight", "k_norm.bias")):
 9907            data_torch = ChameleonModel._reverse_hf_permute(data_torch, n_kv_head, hidden_dim)
 9908
 9909        yield from super().modify_tensors(data_torch, name, bid)
 9910
 9911    # see: https://github.com/huggingface/transformers/blob/72fb02c47dbbe1999ae105319f24631cad6e2e00/src/transformers/models/chameleon/convert_chameleon_weights_to_hf.py#L176-L203
 9912    @staticmethod
 9913    def _reverse_hf_permute(data_torch, n_heads, hidden_dim):
 9914        head_dim = hidden_dim // n_heads
 9915        data_torch = data_torch[0].view(2, head_dim // 2).t().reshape(1, -1)
 9916        data_torch = data_torch.repeat_interleave(n_heads, 0)
 9917        return data_torch
 9918
 9919
 9920@ModelBase.register("UltravoxModel")
 9921class UltravoxModel(TextModel):
 9922    model_arch = gguf.MODEL_ARCH.LLAMA # dummy
 9923
 9924    def __init__(self, *args, **kwargs):
 9925        super().__init__(*args, **kwargs)
 9926        raise NotImplementedError("Ultravox does not have text decoder. Instead, it uses Llama or other models for text. If you want to get the audio encoder, please use --mmproj argument")
 9927
 9928
 9929@ModelBase.register("GlmasrModel")
 9930class GlmASRWhisperEncoderModel(MmprojModel):
 9931    has_vision_encoder = False
 9932    has_audio_encoder = True
 9933
 9934    def __init__(self, *args, **kwargs):
 9935        super().__init__(*args, **kwargs)
 9936        if "hidden_size" not in self.hparams and "intermediate_size" not in self.hparams:
 9937            self.hparams["hidden_size"] = self.hparams["d_model"]
 9938            self.hparams["intermediate_size"] = self.hparams["encoder_ffn_dim"]
 9939            self.hparams["num_attention_heads"] = self.hparams["encoder_attention_heads"]
 9940
 9941    def set_gguf_parameters(self):
 9942        super().set_gguf_parameters()
 9943        self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.GLMA)
 9944        self.gguf_writer.add_audio_num_mel_bins(self.hparams["num_mel_bins"])
 9945        self.gguf_writer.add_audio_attention_layernorm_eps(self.hparams.get("layer_norm_eps", 1e-5))
 9946        self.gguf_writer.add_audio_stack_factor(self.global_config["merge_factor"])
 9947
 9948    def tensor_force_quant(self, name, new_name, bid, n_dims):
 9949        if ".conv" in name and ".weight" in name:
 9950            return gguf.GGMLQuantizationType.F16
 9951        return super().tensor_force_quant(name, new_name, bid, n_dims)
 9952
 9953    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
 9954        if name.startswith("model.") or name.startswith("lm_head."):
 9955            # skip language model tensors
 9956            return
 9957
 9958        if name.startswith("audio_encoder.whisper."):
 9959            name = name.replace("audio_encoder.whisper.", "audio_tower.")
 9960        if "audio_encoder.layer_norm." in name or "audio_encoder.proj." in name:
 9961            name = name.replace("audio_encoder.", "audio_encoder.adapting.")
 9962
 9963        if name.startswith("audio_encoder.audio_bos_eos_token."):
 9964            yield from super().modify_tensors(data_torch[0], "model.vision.boi", bid)
 9965            yield from super().modify_tensors(data_torch[1], "model.vision.eoi", bid)
 9966            return
 9967
 9968        if name.startswith("audio_encoder.adapting."):
 9969            name = name.replace("audio_encoder.adapting.", "audio.multi_modal_projector.")
 9970            if ".layer_norm." in name:
 9971                name = name.replace(".layer_norm.", ".ln_pre.")
 9972            if ".0." in name:
 9973                name = name.replace(".0.", ".linear_1.")
 9974            if ".2." in name:
 9975                name = name.replace(".2.", ".linear_2.")
 9976            if ".proj." in name:
 9977                return
 9978
 9979        if "conv1.bias" in name or "conv2.bias" in name:
 9980            # reshape conv1 and conv2 bias to 2D by adding a trailing dimension
 9981            data_torch = data_torch.unsqueeze(-1)
 9982
 9983        yield from super().modify_tensors(data_torch, name, bid)
 9984
 9985
 9986@ModelBase.register("Qwen2AudioForConditionalGeneration")
 9987class WhisperEncoderModel(MmprojModel):
 9988    has_vision_encoder = False # no vision encoder
 9989    has_audio_encoder = True
 9990
 9991    def __init__(self, *args, **kwargs):
 9992        super().__init__(*args, **kwargs)
 9993        if "hidden_size" not in self.hparams and "intermediate_size" not in self.hparams:
 9994            self.hparams["hidden_size"] = self.hparams["d_model"]
 9995            self.hparams["intermediate_size"] = self.hparams["encoder_ffn_dim"]
 9996            self.hparams["num_attention_heads"] = self.hparams["encoder_attention_heads"]
 9997
 9998    def set_gguf_parameters(self):
 9999        super().set_gguf_parameters()
10000        self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.QWEN2A)
10001        self.gguf_writer.add_audio_num_mel_bins(self.hparams["num_mel_bins"])
10002        self.gguf_writer.add_audio_attention_layernorm_eps(self.hparams.get("layer_norm_eps", 1e-5))
10003
10004    def tensor_force_quant(self, name, new_name, bid, n_dims):
10005        if ".conv" in name and ".weight" in name:
10006            return gguf.GGMLQuantizationType.F16
10007        return super().tensor_force_quant(name, new_name, bid, n_dims)
10008
10009    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
10010        if name.startswith("language_model."):
10011            # skip language model tensors
10012            return
10013
10014        # prevent naming clash with vision tensors
10015        if name.startswith("multi_modal_projector"):
10016            name = "audio." + name
10017
10018        if "conv1.bias" in name or "conv2.bias" in name:
10019            # reshape conv1 and conv2 bias to 2D by adding a trailing dimension
10020            data_torch = data_torch.unsqueeze(-1)
10021
10022        yield from super().modify_tensors(data_torch, name, bid)
10023
10024
10025@ModelBase.register("UltravoxModel")
10026class UltravoxWhisperEncoderModel(WhisperEncoderModel):
10027    has_vision_encoder = False # no vision encoder
10028    has_audio_encoder = True
10029
10030    def set_gguf_parameters(self):
10031        super().set_gguf_parameters()
10032        self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.ULTRAVOX)
10033        self.gguf_writer.add_audio_stack_factor(self.global_config["stack_factor"])
10034
10035
10036@ModelBase.register("VoxtralForConditionalGeneration")
10037class VoxtralWhisperEncoderModel(WhisperEncoderModel):
10038    has_vision_encoder = False # no vision encoder
10039    has_audio_encoder = True
10040
10041    def set_gguf_parameters(self):
10042        super().set_gguf_parameters()
10043        self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.VOXTRAL)
10044        self.gguf_writer.add_audio_stack_factor(4) # == intermediate_size // hidden_size
10045
10046
10047@ModelBase.register("AudioFlamingo3ForConditionalGeneration")
10048class AudioFlamingo3WhisperEncoderModel(WhisperEncoderModel):
10049    def set_gguf_parameters(self):
10050        super().set_gguf_parameters()
10051        self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.MUSIC_FLAMINGO)
10052
10053    def tensor_force_quant(self, name, new_name, bid, n_dims):
10054        if ".conv" in name and ".weight" in name:
10055            # The encoder was trained in BF16; to be safe, avoid quantizing it to F16
10056            return gguf.GGMLQuantizationType.F32
10057        return super().tensor_force_quant(name, new_name, bid, n_dims)
10058
10059
10060@ModelBase.register("FalconH1ForCausalLM")
10061class FalconH1Model(Mamba2Model):
10062    model_arch = gguf.MODEL_ARCH.FALCON_H1
10063
10064    def __init__(self, *args, **kwargs):
10065        # Set the hparam prefixes for Falcon Mamba2
10066        self.hparam_prefixes = ["mamba"]
10067
10068        # Initialize the base Mamba2Model
10069        super().__init__(*args, **kwargs)
10070
10071        # Use Llama conversion for attention
10072        self._transformer_model_class = LlamaModel
10073
10074        # n_group and d_inner are used during reshape_tensors for mamba2
10075        self.n_group = self.find_hparam(["n_groups"])
10076        self.d_inner = self.find_hparam(["mamba_d_ssm"])
10077        self.d_head = self.find_hparam(["d_head"])
10078
10079        # Initialize any Falcon Mamba2 specific attributes
10080        self.has_attention = True  # Falcon Mamba2 has attention components
10081
10082        # Load Falcon-H1 multipliers from hyperparameters
10083        self.attention_in_multiplier = self.find_hparam(["attention_in_multiplier"], optional=True)
10084        self.attention_out_multiplier = self.find_hparam(["attention_out_multiplier"], optional=True)
10085        self.ssm_in_multiplier = self.find_hparam(["ssm_in_multiplier"], optional=True)
10086        self.ssm_out_multiplier = self.find_hparam(["ssm_out_multiplier"], optional=True)
10087        self.mlp_multipliers = self.find_hparam(["mlp_multipliers"], optional=True)
10088        self.ssm_multipliers = self.find_hparam(["ssm_multipliers"], optional=True)
10089        self.intermediate_size = self.find_hparam(["intermediate_size"])
10090        self.key_multiplier = self.find_hparam(["key_multiplier"], optional=True)
10091
10092    def find_hparam(self, keys: Iterable[str], *args, **kwargs) -> Any:
10093        prefixed = []
10094        for pfx in self.hparam_prefixes:
10095            prefixed.extend(
10096                "_".join([pfx, k])
10097                for k in keys
10098            )
10099        keys = list(keys) + prefixed
10100        return super().find_hparam(keys, *args, **kwargs)
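         # Illustrative lookup (a sketch, no extra functionality): with
         # hparam_prefixes = ["mamba"], self.find_hparam(["n_groups"]) searches the
         # config for both "n_groups" and "mamba_n_groups", so checkpoints that
         # prefix their SSM keys still resolve through the shared Mamba2 base class.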
10101
10102    def set_vocab(self):
10103        self._set_vocab_gpt2()
10104
10105    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
10106        tensors = list(super().modify_tensors(data_torch, name, bid))
10107        tensor = tensors[0][1]
10108
10109        if "down_proj" in name:
10110            tensor = tensor * self.mlp_multipliers[1]
10111        elif "gate_proj" in name:
10112            tensor = tensor * self.mlp_multipliers[0]
10113        elif "k_proj" in name:
10114            tensor = tensor * self.key_multiplier * self.attention_in_multiplier
10115        elif "q_proj" in name:
10116            tensor = tensor * self.attention_in_multiplier
10117        elif "v_proj" in name:
10118            tensor = tensor * self.attention_in_multiplier
10119        elif "o_proj" in name:
10120            tensor = tensor * self.attention_out_multiplier
10121        elif "out_proj" in name:
10122            tensor = tensor * self.ssm_out_multiplier
10123        elif "in_proj" in name:
10124            tensor = tensor * self.ssm_in_multiplier
10125            zxbcdt_multipliers = self.hparams["ssm_multipliers"]
10126            intermediate_size = self.hparams["mamba_d_ssm"]
10127            groups_time_state_size = self.hparams["mamba_n_groups"] * self.hparams["mamba_d_state"]
10128            tensor[:intermediate_size, :] *= zxbcdt_multipliers[0]
10129            tensor[intermediate_size:2 * intermediate_size, :] *= zxbcdt_multipliers[1]
10130            tensor[2 * intermediate_size:2 * intermediate_size + groups_time_state_size, :] *= zxbcdt_multipliers[2]
10131            tensor[2 * intermediate_size + groups_time_state_size:2 * intermediate_size + 2 * groups_time_state_size, :] *= zxbcdt_multipliers[3]
10132            tensor[2 * intermediate_size + 2 * groups_time_state_size:, :] *= zxbcdt_multipliers[4]
10133        elif "lm_head" in name:
10134            tensor = tensor * self.hparams["lm_head_multiplier"]
10135        elif "embed_tokens" in name:
10136            tensor = tensor * self.hparams["embedding_multiplier"]
10137        elif "mamba.norm" in name:
10138            tensor = tensor.reshape(self.n_group, self.d_inner // self.n_group)
10139
10140        tensors = [(tensors[0][0], tensor)]
10141        return tensors
10142
10143    def set_gguf_parameters(self):
10144        super().set_gguf_parameters()
10145
10146        ## General Params ##
10147        self.gguf_writer.add_vocab_size(self.hparams["vocab_size"])
10148        # Override some Mamba2 defaults
10149        self.gguf_writer.add_block_count(self.block_count)
10150        self.gguf_writer.add_context_length(self.hparams.get("max_position_embeddings", 0))
10151        self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])
10152
10153        ## Attention params ##
10154        self.gguf_writer.add_head_count(self.hparams["num_attention_heads"]) # Override value 0 from Mamba2
10155        self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"])
10156        self.gguf_writer.add_key_length(self.hparams["head_dim"])
10157        self.gguf_writer.add_value_length(self.hparams["head_dim"])
10158
10159        ## Validation ##
10160        assert self.hparams.get("hidden_act") in [None, "silu"], "Only SiLU activation is supported"
10161        assert self.d_inner % self.d_head == 0, f"SSM inner size {self.d_inner} not a multiple of head dim {self.d_head}"
10162
10163        # Add any other Falcon Mamba2 specific configuration
10164        self.gguf_writer.add_rope_freq_base(self.rope_parameters["rope_theta"])
10165
10166
10167@ModelBase.register("HunYuanMoEV1ForCausalLM")
10168class HunYuanMoEModel(TextModel):
10169    model_arch = gguf.MODEL_ARCH.HUNYUAN_MOE
10170
10171    def set_vocab(self):
10172        from transformers import AutoTokenizer
10173        tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True)
10174
10175        # 1. Get the pre-tokenizer identifier hash
10176        tokpre = self.get_vocab_base_pre(tokenizer)
10177
10178        # 2. Reverse-engineer the merges list from mergeable_ranks
10179        merges = []
10180        vocab = {}
10181        mergeable_ranks = tokenizer.mergeable_ranks
10182        for token, rank in mergeable_ranks.items():
10183            vocab[QwenModel.token_bytes_to_string(token)] = rank
10184            if len(token) == 1:
10185                continue
10186            merged = QwenModel.bpe(mergeable_ranks, token, max_rank=rank)
10187            if len(merged) == 2: # todo this is an assert in Qwen, why?
10188                merges.append(' '.join(map(QwenModel.token_bytes_to_string, merged)))
10189
10190        # 3. Generate the tokens and toktypes lists
10191        vocab_size = self.hparams["vocab_size"]
10192        assert tokenizer.vocab_size == vocab_size
10193        special_tokens = tokenizer.special_tokens
10194        reverse_vocab = {id_ : encoded_tok for encoded_tok, id_ in {**vocab, **special_tokens}.items()}
10195        tokens: list[str] = []
10196        toktypes: list[int] = []
10197        for i in range(vocab_size):
10198            if i not in reverse_vocab:
10199                tokens.append(f"[PAD{i}]")
10200                toktypes.append(gguf.TokenType.UNUSED)
10201            else:
10202                token = reverse_vocab[i]
10203                tokens.append(token)
10204                if i in special_tokens.values():
10205                    toktypes.append(gguf.TokenType.CONTROL)
10206                else:
10207                    toktypes.append(gguf.TokenType.NORMAL)
10208
10209        # 4. Write all vocab-related fields to the GGUF writer
10210        self.gguf_writer.add_tokenizer_model("gpt2")
10211        self.gguf_writer.add_tokenizer_pre(tokpre)
10212        self.gguf_writer.add_token_list(tokens)
10213        self.gguf_writer.add_token_types(toktypes)
10214        self.gguf_writer.add_token_merges(merges)
10215
10216        # 5. Add special tokens and chat templates
10217        special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False)
10218        special_vocab.add_to_gguf(self.gguf_writer)
10219        # FIX for BOS token: Overwrite incorrect id read from config.json
10220        self.gguf_writer.add_bos_token_id(127959) # <|bos|>
10221
10222    def set_gguf_parameters(self):
10223        super().set_gguf_parameters()
10224        hparams = self.hparams
10225
10226        self.gguf_writer.add_expert_count(hparams["num_experts"])
10227        self.gguf_writer.add_expert_shared_feed_forward_length(hparams["intermediate_size"])
10228
10229        moe_intermediate_size = hparams["moe_intermediate_size"]
10230        assert all(n == moe_intermediate_size[0] for n in moe_intermediate_size)
10231        self.gguf_writer.add_expert_feed_forward_length(moe_intermediate_size[0])
10232
10233        moe_topk = hparams["moe_topk"]
10234        assert all(topk == moe_topk[0] for topk in moe_topk)
10235        self.gguf_writer.add_expert_used_count(moe_topk[0])
10236
10237        moe_shared_expert = hparams["num_shared_expert"]
10238        assert all(n == moe_shared_expert[0] for n in moe_shared_expert)
10239        self.gguf_writer.add_expert_shared_count(moe_shared_expert[0])
10240
10241        # Rope
10242        if self.rope_parameters.get("rope_type") == "dynamic":
10243            # HunYuan uses NTK Aware Alpha based scaling. Original implementation: https://www.reddit.com/r/LocalLLaMA/comments/14lz7j5/ntkaware_scaled_rope_allows_llama_models_to_have/
10244            # 1000 corresponds to a usable context length of 256k (https://github.com/Tencent-Hunyuan/Hunyuan-A13B/blob/main/report/Hunyuan_A13B_Technical_Report.pdf)
10245            alpha = self.rope_parameters.get("alpha", 1000)
10246            base = self.rope_parameters.get("rope_theta", 10000.0)
10247            dim = (hparams["hidden_size"] // hparams["num_attention_heads"]) # 128
10248            scaled_base = base * (alpha ** (dim / (dim - 2))) # 10000 * (1000 ** (128 / 126)) = 11158839.9251
10249            self.gguf_writer.add_rope_freq_base(scaled_base)
10250            self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
10251            self.gguf_writer.add_rope_scaling_factor(1)
10252            # There is no consistent way to calculate ctx from alpha, and the config is incorrectly set to 32k
10253            self.gguf_writer.add_rope_scaling_orig_ctx_len(256 * 1024) # 256k context length
10254            self.gguf_writer.add_context_length(256 * 1024) # 256k context length
10255
10256            # if any of our assumptions about the values are wrong, something has changed and this may need to be updated
10257            assert alpha == 1000 and base == 10000.0 and dim == 128 and self.hparams["max_position_embeddings"] in [32 * 1024, 256 * 1024], \
10258                "HunYuan dynamic RoPE scaling assumptions changed, please update the logic or context length manually"
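             # Worked example of the NTK-aware alpha scaling above (a sketch using the
             # default values asserted above): with alpha = 1000, base = 10000.0 and
             # dim = 128,
             #   scaled_base = 10000.0 * 1000 ** (128 / 126) ~= 11158839.93
             # i.e. instead of rescaling positions, the RoPE frequency base is inflated
             # by alpha ** (dim / (dim - 2)), which is why the scaling type is NONE.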
10259
10260    _experts: list[dict[str, Tensor]] | None = None
10261
10262    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
10263        if name == "lm_head.weight":
10264            if self.hparams.get("tie_word_embeddings", False):
10265                logger.info("Skipping tied output layer 'lm_head.weight'")
10266                return
10267
10268        if name.find("mlp.experts") != -1:
10269            n_experts = self.hparams["num_experts"]
10270            assert bid is not None
10271
10272            if self._experts is None:
10273                self._experts = [{} for _ in range(self.block_count)]
10274
10275            self._experts[bid][name] = data_torch
10276
10277            if len(self._experts[bid]) >= n_experts * 3:
10278                # merge the experts into a single 3d tensor
10279                for w_name in ["down_proj", "gate_proj", "up_proj"]:
10280                    datas: list[Tensor] = []
10281
10282                    for xid in range(n_experts):
10283                        ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight"
10284                        datas.append(self._experts[bid][ename])
10285                        del self._experts[bid][ename]
10286
10287                    data_torch = torch.stack(datas, dim=0)
10288                    merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight"
10289
10290                    yield from super().modify_tensors(data_torch, merged_name, bid)
10291                return
10292            else:
10293                return
10294
10295        yield from super().modify_tensors(data_torch, name, bid)
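         # Sketch of the expert merge above (shapes illustrative, not from a specific
         # checkpoint): with n_experts = 64, the per-expert down_proj weights of shape
         # (hidden_size, moe_intermediate_size) collected for layer `bid` are stacked
         # with torch.stack(datas, dim=0) into a single
         # (64, hidden_size, moe_intermediate_size) tensor named
         # "model.layers.{bid}.mlp.experts.down_proj.weight" before being mapped to
         # the ggml 3D expert layout.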
10296
10297    def prepare_tensors(self):
10298        super().prepare_tensors()
10299        if self._experts is not None:
10300            experts = [k for d in self._experts for k in d.keys()]
10301            if len(experts) > 0:
10302                raise ValueError(f"Unprocessed experts: {experts}")
10303
10304
10305@ModelBase.register("LLaDAMoEModel", "LLaDAMoEModelLM")
10306class LLaDAMoEModel(TextModel):
10307    model_arch = gguf.MODEL_ARCH.LLADA_MOE
10308
10309    def set_gguf_parameters(self):
10310        super().set_gguf_parameters()
10311        if (n_experts := self.hparams.get("num_experts")) is not None:
10312            self.gguf_writer.add_expert_count(n_experts)
10313
10314        if (expert_intermediate_size := self.hparams.get("expert_intermediate_size")) is not None:
10315            self.gguf_writer.add_expert_feed_forward_length(expert_intermediate_size)
10316
10317        # number of experts used per token (top-k)
10318        if (n_experts_used := self.hparams.get("num_experts_per_tok")) is not None:
10319            self.gguf_writer.add_expert_used_count(n_experts_used)
10320
10321        self.gguf_writer.add_mask_token_id(156895)
10322        self.gguf_writer.add_causal_attention(False)
10323        self.gguf_writer.add_diffusion_shift_logits(False)
10324
10325    _experts: list[dict[str, Tensor]] | None = None
10326
10327    # Copied from: Qwen2MoeModel
10328    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
10329        # process the experts separately
10330        if name.find("experts") != -1:
10331            n_experts = self.hparams["num_experts"]
10332            assert bid is not None
10333
10334            if self._experts is None:
10335                self._experts = [{} for _ in range(self.block_count)]
10336
10337            self._experts[bid][name] = data_torch
10338
10339            if len(self._experts[bid]) >= n_experts * 3:
10340                # merge the experts into a single 3d tensor
10341                for w_name in ["down_proj", "gate_proj", "up_proj"]:
10342                    datas: list[Tensor] = []
10343
10344                    for xid in range(n_experts):
10345                        ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight"
10346                        datas.append(self._experts[bid][ename])
10347                        del self._experts[bid][ename]
10348
10349                    data_torch = torch.stack(datas, dim=0)
10350
10351                    merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight"
10352
10353                    yield from super().modify_tensors(data_torch, merged_name, bid)
10354                return
10355            else:
10356                return
10357
10358        yield from super().modify_tensors(data_torch, name, bid)
10359
10360    # Copied from: Qwen2MoeModel
10361    def prepare_tensors(self):
10362        super().prepare_tensors()
10363
10364        if self._experts is not None:
10365            # flatten `list[dict[str, Tensor]]` into `list[str]`
10366            experts = [k for d in self._experts for k in d.keys()]
10367            if len(experts) > 0:
10368                raise ValueError(f"Unprocessed experts: {experts}")
10369
10370
10371@ModelBase.register("HunYuanDenseV1ForCausalLM")
10372class HunYuanModel(TextModel):
10373    model_arch = gguf.MODEL_ARCH.HUNYUAN_DENSE
10374
10375    def set_vocab(self):
10376        if (self.dir_model / "tokenizer.json").is_file():
10377            self._set_vocab_gpt2()
10378        else:
10379            from transformers import AutoTokenizer
10380            tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True)
10381
10382            # 1. Get the pre-tokenizer identifier hash
10383            tokpre = self.get_vocab_base_pre(tokenizer)
10384
10385            # 2. Reverse-engineer the merges list from mergeable_ranks
10386            merges = []
10387            vocab = {}
10388            mergeable_ranks = tokenizer.mergeable_ranks
10389            for token, rank in mergeable_ranks.items():
10390                vocab[QwenModel.token_bytes_to_string(token)] = rank
10391                if len(token) == 1:
10392                    continue
10393                merged = QwenModel.bpe(mergeable_ranks, token, max_rank=rank)
10394                if len(merged) == 2:
10395                    merges.append(' '.join(map(QwenModel.token_bytes_to_string, merged)))
10396
10397            # 3. Generate the tokens and toktypes lists
10398            vocab_size = self.hparams["vocab_size"]
10399            assert tokenizer.vocab_size == vocab_size
10400            special_tokens = tokenizer.special_tokens
10401            reverse_vocab = {id_ : encoded_tok for encoded_tok, id_ in {**vocab, **special_tokens}.items()}
10402            tokens: list[str] = []
10403            toktypes: list[int] = []
10404            for i in range(vocab_size):
10405                if i not in reverse_vocab:
10406                    tokens.append(f"[PAD{i}]")
10407                    toktypes.append(gguf.TokenType.UNUSED)
10408                else:
10409                    token = reverse_vocab[i]
10410                    tokens.append(token)
10411                    if i in special_tokens.values():
10412                        toktypes.append(gguf.TokenType.CONTROL)
10413                    else:
10414                        toktypes.append(gguf.TokenType.NORMAL)
10415
10416            # 4. Write all vocab-related fields to the GGUF writer
10417            self.gguf_writer.add_tokenizer_model("gpt2")
10418            self.gguf_writer.add_tokenizer_pre(tokpre)
10419            self.gguf_writer.add_token_list(tokens)
10420            self.gguf_writer.add_token_types(toktypes)
10421            self.gguf_writer.add_token_merges(merges)
10422
10423            # 5. Add special tokens and chat templates
10424            special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False)
10425            special_vocab.add_to_gguf(self.gguf_writer)
10426            # FIX for BOS token: Overwrite incorrect id read from config.json
10427            if self.hparams['hidden_size'] == 4096:
10428                self.gguf_writer.add_bos_token_id(127958) # only for 7b dense, fix <|bos|> token
10429
10430    def set_gguf_parameters(self):
10431        super().set_gguf_parameters()
10432        hparams = self.hparams
10433
10434        # Rope
10435        if self.rope_parameters.get("rope_type") == "dynamic":
10436            # HunYuan uses NTK Aware Alpha based scaling. Original implementation: https://www.reddit.com/r/LocalLLaMA/comments/14lz7j5/ntkaware_scaled_rope_allows_llama_models_to_have/
10437            # 1000 corresponds to a usable context length of 256k (https://github.com/Tencent-Hunyuan/Hunyuan-A13B/blob/main/report/Hunyuan_A13B_Technical_Report.pdf)
10438            alpha = self.rope_parameters.get("alpha", 50)
10439            base = self.rope_parameters.get("rope_theta", 10000.0)
10440            dim = hparams["head_dim"]
10441            scaled_base = base * (alpha ** (dim / (dim - 2)))
10442            self.gguf_writer.add_rope_freq_base(scaled_base)
10443            self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
10444            self.gguf_writer.add_rope_scaling_factor(1)
10445            # There is no consistent way to calculate ctx from alpha, and the config is incorrectly set to 32k
10446            self.gguf_writer.add_rope_scaling_orig_ctx_len(256 * 1024) # 256k context length
10447            self.gguf_writer.add_context_length(256 * 1024) # 256k context length
10448
10449            # if any of our assumptions about the values are wrong, something has changed and this may need to be updated
10450            assert base == 10000.0 and self.hparams["max_position_embeddings"] in [32 * 1024, 256 * 1024], \
10451                "HunYuan dynamic RoPE scaling assumptions changed, please update the logic or context length manually"
10452
10453    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
10454        if name == "lm_head.weight":
10455            if self.hparams.get("tie_word_embeddings", False):
10456                logger.info("Skipping tied output layer 'lm_head.weight'")
10457                return
10458
10459        yield from super().modify_tensors(data_torch, name, bid)
10460
10461
10462@ModelBase.register("SmolLM3ForCausalLM")
10463class SmolLM3Model(LlamaModel):
10464    model_arch = gguf.MODEL_ARCH.SMOLLM3
10465
10466
10467@ModelBase.register("GptOssForCausalLM")
10468class GptOssModel(TextModel):
10469    model_arch = gguf.MODEL_ARCH.GPT_OSS
10470
10471    # TODO: remove once MXFP4 is supported more generally
10472    def dequant_model(self):
10473        quant_config = self.hparams.get("quantization_config")
10474        if quant_config is not None and quant_config.get("quant_method") == "mxfp4":
10475            return
10476        return super().dequant_model()
10477
10478    def transform_nibble_layout(self, tensor):
10479        assert tensor.dtype == torch.uint8
10480        assert tensor.shape[-1] == 16
10481        # swap nibbles
10482        t_lo = tensor & 0x0F
10483        t_hi = tensor & 0xF0
10484        t_swapped = (t_lo << 4) | (t_hi >> 4)
10485        tensor = t_swapped
10486        # transform aaaa...bbbb... to abababab...
10487        blk_a, blk_b = tensor.chunk(2, dim=-1)
10488        # place the 'a' nibbles into the high nibble of each output byte (a_)
10489        blk_a0 = (blk_a & 0xF0).view(-1, 1)
10490        blk_a1 = (blk_a << 4).view(-1, 1)
10491        blk_a = torch.stack((blk_a0, blk_a1), dim=2).view(tensor.shape)
10492        # place the 'b' nibbles into the low nibble of each output byte (_b)
10493        blk_b0 = (blk_b >> 4).view(-1, 1)
10494        blk_b1 = (blk_b & 0x0F).view(-1, 1)
10495        blk_b = torch.stack((blk_b0, blk_b1), dim=2).view(tensor.shape)
10496        # swap once more
10497        out = blk_a | blk_b
10498        out_h = out & 0xF0
10499        out_l = out & 0x0F
10500        out = (out_h >> 4) | (out_l << 4)
10501        return out
10502
10503    def repack_mxfp4(self, new_name: str, blocks: Tensor, scales: Tensor):
10504        assert blocks.dtype == torch.uint8
10505        assert scales.dtype == torch.uint8
10506        scales = scales.unsqueeze(-1)
10507        assert len(blocks.shape) == 4
10508        assert len(scales.shape) == 4
10509        blocks = self.transform_nibble_layout(blocks)
10510        new_data = torch.concat((scales, blocks), dim=-1)
10511        new_shape = [new_data.shape[0], new_data.shape[1], new_data.shape[2] * 32]
10512        logger.info(f"Repacked {new_name} with shape {new_shape} and quantization MXFP4")
10513        # flatten last dim
10514        new_data = new_data.view(new_data.shape[0], new_data.shape[1], new_data.shape[2] * new_data.shape[3])
10515        new_data = new_data.numpy()
10516        self.gguf_writer.add_tensor(new_name, new_data, raw_dtype=gguf.GGMLQuantizationType.MXFP4)
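         # Shape bookkeeping for the repack above (illustrative; assumes the usual
         # MXFP4 block size of 32 elements): each block is 1 uint8 scale followed by
         # 16 uint8 bytes holding 32 packed FP4 nibbles, i.e. 17 bytes per block.
         # blocks of shape (..., n_blk, 16) and scales of shape (..., n_blk) are
         # concatenated to (..., n_blk, 17) and flattened to (..., n_blk * 17) raw
         # bytes, while the logical shape logged above is (..., n_blk * 32) elements.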
10517
10518    def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
10519        blocks0: Tensor = torch.zeros(1)
10520        blocks1: Tensor = torch.zeros(1)
10521        # we assume that tensors are loaded in the correct order
10522        for name, data_torch in self.get_tensors():
10523            if "mlp.experts.down_proj_blocks" in name:
10524                blocks0 = data_torch
10525            elif "mlp.experts.down_proj_scales" in name:
10526                new_name = self.map_tensor_name(name.replace("_scales", ".weight"))
10527                self.repack_mxfp4(new_name, blocks0, data_torch)
10528            elif "mlp.experts.gate_up_proj_blocks" in name:
10529                blocks0, blocks1 = data_torch[:, ::2, :, :], data_torch[:, 1::2, :, :]
10530            elif "mlp.experts.gate_up_proj_scales" in name:
10531                scales0, scales1 = data_torch[:, ::2, :], data_torch[:, 1::2, :]
10532                new_name_gate = self.map_tensor_name(name.replace("gate_up_proj_scales", "gate_proj.weight"))
10533                new_name_up = self.map_tensor_name(name.replace("gate_up_proj_scales", "up_proj.weight"))
10534                self.repack_mxfp4(new_name_gate, blocks0, scales0)
10535                self.repack_mxfp4(new_name_up, blocks1, scales1)
10536        return []
10537
10538    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
10539        if "sinks" in name:
10540            name += ".weight"
10541
10542        # correct naming for down_proj
10543        if "down_proj" in name:
10544            if name.endswith("_bias"):
10545                name = name.replace("down_proj_bias", "down_proj.bias")
10546            elif "_blocks" not in name and "_scales" not in name:
10547                logger.warning(f"{name} is not in MXFP4, performance may be degraded")
10548                name = name.replace("down_proj", "down_proj.weight")
10549                data_torch = data_torch.transpose(-1, -2)
10550            else:
10551                # otherwise, it should already be repacked to ggml MXFP4 format
10552                return
10553
10554        # split the gate_up into gate and up
10555        if "gate_up_proj" in name:
10556            if name.endswith("_bias"):
10557                name_up = name.replace("gate_up_proj_bias", "up_proj.bias")
10558                name_gate = name.replace("gate_up_proj_bias", "gate_proj.bias")
10559                gate_proj_bias, up_proj_bias = data_torch[..., ::2], data_torch[..., 1::2]
10560                yield from super().modify_tensors(gate_proj_bias, name_gate, bid)
10561                yield from super().modify_tensors(up_proj_bias, name_up, bid)
10562            elif "_blocks" not in name and "_scales" not in name:
10563                logger.warning(f"{name} is not in MXFP4, performance may be degraded")
10564                name_up = name.replace("gate_up_proj", "up_proj.weight")
10565                name_gate = name.replace("gate_up_proj", "gate_proj.weight")
10566                data_torch = data_torch.transpose(-1, -2)
10567                gate_proj_weight, up_proj_weight = data_torch[:, ::2, :], data_torch[:, 1::2, :]
10568                yield from super().modify_tensors(gate_proj_weight, name_gate, bid)
10569                yield from super().modify_tensors(up_proj_weight, name_up, bid)
10570        else:
10571            yield from super().modify_tensors(data_torch, name, bid)
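         # Sketch of the fused gate_up split above (illustrative): the HF checkpoint
         # appears to interleave gate and up columns, so after transposing the weight
         # to (n_expert, 2 * n_ff, n_embd) the even rows [:, ::2, :] become
         # gate_proj.weight and the odd rows [:, 1::2, :] become up_proj.weight; the
         # fused bias is split the same way along its last dimension.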
10572
10573    def set_vocab(self):
10574        self._set_vocab_gpt2()
10575
10576    def set_gguf_parameters(self):
10577        super().set_gguf_parameters()
10578        self.gguf_writer.add_sliding_window(self.hparams["sliding_window"])
10579        self.gguf_writer.add_expert_feed_forward_length(self.hparams["intermediate_size"])
10580
10581
10582@ModelBase.register("Lfm2ForCausalLM", "LFM2ForCausalLM")
10583class LFM2Model(TextModel):
10584    model_arch = gguf.MODEL_ARCH.LFM2
10585
10586    def _add_feed_forward_length(self):
10587        ff_dim = self.hparams["block_ff_dim"]
10588
10589        auto_adjust_ff_dim = self.hparams["block_auto_adjust_ff_dim"]
10591        ffn_dim_multiplier = self.hparams["block_ffn_dim_multiplier"]
10592        multiple_of = self.hparams["block_multiple_of"]
10593
10594        if auto_adjust_ff_dim:
10595            ff_dim = int(2 * ff_dim / 3)
10596            # custom dim factor multiplier
10597            if ffn_dim_multiplier is not None:
10598                ff_dim = int(ffn_dim_multiplier * ff_dim)
10599            ff_dim = multiple_of * ((ff_dim + multiple_of - 1) // multiple_of)
10600
10601        self.gguf_writer.add_feed_forward_length(ff_dim)
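         # Worked example of the auto-adjusted FFN size above (numbers invented, not
         # from a released LFM2 config): with block_ff_dim = 12288,
         # block_ffn_dim_multiplier = 1.0 and block_multiple_of = 256:
         #   ff_dim = int(2 * 12288 / 3)          = 8192
         #   ff_dim = int(1.0 * 8192)             = 8192
         #   ff_dim = 256 * ((8192 + 255) // 256) = 8192  (already a multiple of 256)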
10602
10603    def set_gguf_parameters(self):
10604        # set num_key_value_heads only for attention layers
10605        self.hparams["num_key_value_heads"] = [
10606            self.hparams["num_key_value_heads"] if layer_type == "full_attention" else 0
10607            for layer_type in self.hparams["layer_types"]
10608        ]
10609
10610        super().set_gguf_parameters()
10611        self.gguf_writer.add_vocab_size(self.hparams["vocab_size"])
10612        self.gguf_writer.add_shortconv_l_cache(self.hparams["conv_L_cache"])
10613        self.gguf_writer.add_layer_norm_rms_eps(self.hparams["norm_eps"])
10614        self._add_feed_forward_length()
10615
10616    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
10617        if self._is_vision_tensor(name) or ConformerAudioModel.is_audio_tensor(name):
10618            # skip multimodal tensors
10619            return
10620
10621        name = name.replace("language_model.", "") # vision
10622        name = name.replace("lfm.", "model.")      # audio
10623
10624        # conv op requires 2d tensor
10625        if 'conv.conv' in name:
10626            data_torch = data_torch.squeeze(1)
10627
10628        yield from super().modify_tensors(data_torch, name, bid)
10629
10630    def _is_vision_tensor(self, name: str) -> bool:
10631        return "vision_tower" in name or "multi_modal_projector" in name
10632
10633
10634@ModelBase.register("Lfm2Model")
10635class LFM2ColBertModel(LFM2Model):
10636    model_arch = gguf.MODEL_ARCH.LFM2
10637    dense_tensor_name = "dense_2"
10638
10639    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
10640        if not name.startswith(self.dense_tensor_name):
10641            name = "model." + name
10642
10643        yield from super().modify_tensors(data_torch, name, bid)
10644
10645    def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
10646        # dense tensor is stored in a separate safetensors file
10647        from safetensors.torch import load_file
10648        tensors_file = self.dir_model / "1_Dense" / "model.safetensors"
10649        assert tensors_file.is_file()
10650        tensor = load_file(tensors_file)["linear.weight"]
10651        self.gguf_writer.add_embedding_length_out(tensor.shape[0])
10652        yield f"{self.dense_tensor_name}.weight", tensor.clone()
10653
10654
10655@ModelBase.register("Lfm2MoeForCausalLM")
10656class LFM2MoeModel(TextModel):
10657    model_arch = gguf.MODEL_ARCH.LFM2MOE
10658
10659    def set_gguf_parameters(self):
10660        # set num_key_value_heads only for attention layers
10661        self.hparams["num_key_value_heads"] = [
10662            self.hparams["num_key_value_heads"] if layer_type == "full_attention" else 0
10663            for layer_type in self.hparams["layer_types"]
10664        ]
10665
10666        super().set_gguf_parameters()
10667
10668        self.gguf_writer.add_expert_count(self.hparams["num_experts"])
10669        self.gguf_writer.add_expert_feed_forward_length(self.hparams["moe_intermediate_size"])
10670        self.gguf_writer.add_leading_dense_block_count(self.hparams["num_dense_layers"])
10671        self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SIGMOID)
10672
10673        self.gguf_writer.add_vocab_size(self.hparams["vocab_size"])
10674        self.gguf_writer.add_shortconv_l_cache(self.hparams["conv_L_cache"])
10675
10676    # cache of expert weights, used for merging
10677    _experts_cache: dict[int, dict[str, Tensor]] = {}
10678
10679    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
10680        # conv op requires 2d tensor
10681        if 'conv.conv' in name:
10682            data_torch = data_torch.squeeze(1)
10683
10684        if name.endswith(".expert_bias"):
10685            name = name.replace(".expert_bias", ".expert_bias.bias")
10686
10687        # merge expert weights
10688        if 'experts' in name:
10689            n_experts = self.hparams["num_experts"]
10690            assert bid is not None
10691
10692            expert_cache = self._experts_cache.setdefault(bid, {})
10693            expert_cache[name] = data_torch
10694            expert_weights = ["w1", "w2", "w3"]
10695
10696            # not enough expert weights to merge
10697            if len(expert_cache) < n_experts * len(expert_weights):
10698                return
10699
10700            for w_name in expert_weights:
10701                datas: list[Tensor] = []
10702
10703                for xid in range(n_experts):
10704                    ename = f"model.layers.{bid}.feed_forward.experts.{xid}.{w_name}.weight"
10705                    datas.append(expert_cache[ename])
10706                    del expert_cache[ename]
10707
10708                data_torch = torch.stack(datas, dim=0)
10709                merged_name = f"layers.{bid}.feed_forward.experts.{w_name}.weight"
10710
10711                yield from super().modify_tensors(data_torch, merged_name, bid)
10712
10713            del self._experts_cache[bid]
10714            return
10715
10716        yield from super().modify_tensors(data_torch, name, bid)
10717
10718    def prepare_tensors(self):
10719        super().prepare_tensors()
10720        assert not self._experts_cache
10721
10722
10723@ModelBase.register("Lfm2VlForConditionalGeneration")
10724class LFM2VLModel(MmprojModel):
10725    def __init__(self, *args, **kwargs):
10726        super().__init__(*args, **kwargs)
10727        assert self.hparams_vision is not None
10728        # TODO(tarek): for dynamic resolution, image_size is not specified; set it here for compatibility
10729        self.hparams_vision["image_size"] = 256
10730
10731    def set_gguf_parameters(self):
10732        super().set_gguf_parameters()
10733        self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.LFM2)
10734        self.gguf_writer.add_vision_attention_layernorm_eps(self.find_vparam(["layer_norm_eps"]))
10735        self.gguf_writer.add_vision_projector_scale_factor(self.global_config.get("downsample_factor", 2))
10736        self.gguf_writer.add_vision_use_gelu(True)
10737        # python notation, e.g. for vision_feature_layer == -1, we pick last layer -> vision_feature_layers_to_drop = 0
10738        vision_feature_layers_to_drop = -(self.global_config.get("vision_feature_layer", -1) + 1)
10739        self.gguf_writer.add_vision_block_count(self.find_vparam(self.n_block_keys) - vision_feature_layers_to_drop)
10740
10741    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
10742        is_vision_tensor = "vision_tower" in name or "multi_modal_projector" in name
10743
10744        if is_vision_tensor:
10745            # remove "model." prefix
10746            name = name.replace("model.vision_tower.", "vision_tower.")
10747            name = name.replace("model.multi_modal_projector.", "multi_modal_projector.")
10748
10749            if "patch_embedding.weight" in name:
10750                data_torch = data_torch.view(data_torch.shape[0], 16, 16, 3).permute(0, 3, 1, 2)
10751
10752            yield from super().modify_tensors(data_torch, name, bid)
10753            return
10754
10755        return # skip other tensors
10756
10757
10758@ModelBase.register("Lfm2AudioForConditionalGeneration")
10759class LFM2AudioModel(ConformerAudioModel):
10760    has_vision_encoder = False
10761    has_audio_encoder = True
10762    model_name = "Lfm2AudioEncoder"
10763
10764    def get_audio_config(self) -> dict[str, Any] | None:
10765        return self.global_config.get("encoder")
10766
10767    def set_gguf_parameters(self):
10768        assert self.hparams_audio is not None
10769        self.hparams_audio["hidden_size"] = self.hparams_audio["d_model"]
10770        self.hparams_audio["intermediate_size"] = self.hparams_audio["d_model"]
10771        self.hparams_audio["num_attention_heads"] = self.hparams_audio["n_heads"]
10772        super().set_gguf_parameters()
10773        self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.LFM2A)
10774        self.gguf_writer.add_audio_num_mel_bins(self.hparams_audio["feat_in"])
10775        self.gguf_writer.add_audio_attention_layernorm_eps(1e-5)
10776
10777    def modify_tensors(self, data_torch, name, bid):
10778        # skip language model tensors
10779        if name.startswith("lfm."):
10780            return
10781
10782        # for training only
10783        if any(p in name for p in ["audio_loss_weight"]):
10784            return
10785
10786        # for audio output
10787        if any(p in name for p in ["codebook_offsets", "depth_embeddings", "depth_linear", "depthformer"]):
10788            return
10789
10790        yield from super().modify_tensors(data_torch, name, bid)
10791
10792
10793@ModelBase.register("SmallThinkerForCausalLM")
10794class SmallThinkerModel(TextModel):
10795    model_arch = gguf.MODEL_ARCH.SMALLTHINKER
10796
10797    def set_gguf_parameters(self):
10798        super().set_gguf_parameters()
10799        if (n_experts := self.hparams.get("num_experts", self.hparams.get("moe_num_primary_experts"))) is not None:
10800            self.gguf_writer.add_expert_count(n_experts)
10801        if (n_experts_used := self.hparams.get("num_experts_per_tok", self.hparams.get("moe_num_active_primary_experts"))) is not None:
10802            self.gguf_writer.add_expert_used_count(n_experts_used)
10803        if (moe_intermediate_size := self.hparams.get("moe_ffn_hidden_size")) is not None:
10804            self.gguf_writer.add_expert_feed_forward_length(moe_intermediate_size)
10805            self.gguf_writer.add_feed_forward_length(moe_intermediate_size)
10806            logger.info(f"gguf: expert feed forward length = {moe_intermediate_size}")
10807        if self.hparams.get("moe_primary_router_apply_softmax"):
10808            self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SOFTMAX)
10809        else:
10810            self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SIGMOID)
10811
10812        sliding_window_layout = self.hparams.get("sliding_window_layout")
10813        if sliding_window_layout:
10814            for i in sliding_window_layout:
10815                if i != 0:
10816                    sliding_window = self.hparams.get("sliding_window_size")
10817                    if sliding_window:
10818                        self.gguf_writer.add_sliding_window(sliding_window)
10819                    break
10820
10821    _experts: list[dict[str, Tensor]] | None = None
10822
10823    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
10824        # process the experts separately
10825        if name.find("experts") != -1:
10826            n_experts = self.hparams.get("num_experts", self.hparams.get("moe_num_primary_experts"))
10827            assert bid is not None
10828
10829            if self._experts is None:
10830                self._experts = [{} for _ in range(self.block_count)]
10831
10832            self._experts[bid][name] = data_torch
10833
10834            if len(self._experts[bid]) >= n_experts * 3:
10835                # merge the experts into a single 3d tensor
10836                for w_name in ["down", "gate", "up"]:
10837                    datas: list[Tensor] = []
10838
10839                    for xid in range(n_experts):
10840                        ename = f"model.layers.{bid}.block_sparse_moe.experts.{xid}.{w_name}.weight"
10841                        datas.append(self._experts[bid][ename])
10842                        del self._experts[bid][ename]
10843
10844                    data_torch = torch.stack(datas, dim=0)
10845
10846                    merged_name = f"model.layers.{bid}.block_sparse_moe.experts.{w_name}.weight"
10847
10848                    yield from super().modify_tensors(data_torch, merged_name, bid)
10849                return
10850            else:
10851                return
10852
10853        yield from super().modify_tensors(data_torch, name, bid)
10854
10855    def prepare_tensors(self):
10856        super().prepare_tensors()
10857
10858        if self._experts is not None:
10859            # flatten `list[dict[str, Tensor]]` into `list[str]`
10860            experts = [k for d in self._experts for k in d.keys()]
10861            if len(experts) > 0:
10862                raise ValueError(f"Unprocessed experts: {experts}")
10863
10864
10865@ModelBase.register("ModernBertModel", "ModernBertForMaskedLM", "ModernBertForSequenceClassification")
10866class ModernBertModel(BertModel):
10867    model_arch = gguf.MODEL_ARCH.MODERN_BERT
10868
10869    def set_vocab(self):
10870        self.gguf_writer.add_add_bos_token(True)
10871        self.gguf_writer.add_add_eos_token(True)
10872        self.gguf_writer.add_add_sep_token(True)
10873        self._set_vocab_gpt2()
10874
10875    def set_gguf_parameters(self):
10876        super().set_gguf_parameters()
10877        self.gguf_writer.add_sliding_window(self.hparams["local_attention"])
10878        if (sliding_window_pattern := self.hparams.get("global_attn_every_n_layers")) is not None:
10879            self.gguf_writer.add_sliding_window_pattern(sliding_window_pattern)
10880        self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
10881        self.gguf_writer.add_vocab_size(self.hparams["vocab_size"])
10882
10883    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
10884        # these layers act as the MLM head, so we don't need them
10885        if name.startswith("decoder."):
10886            return
10887
10888        if name.startswith("model."):
10889            name = name[6:]
10890
10891        yield from super().modify_tensors(data_torch, name, bid)
10892
10893
10894@ModelBase.register("ApertusForCausalLM")
10895class ApertusModel(LlamaModel):
10896    model_arch = gguf.MODEL_ARCH.APERTUS
10897    undo_permute = False
10898
10899    _alpha_n = {}
10900    _alpha_p = {}
10901    _beta = {}
10902    _eps = {}
10903
10904    def modify_tensors(self, data_torch, name, bid):
10905        # Handle xIELU activation parameters
10906        n_layers = self.hparams["num_hidden_layers"]
10907        if name.endswith(".act_fn.alpha_n"):
10908            self._alpha_n[bid] = data_torch.to("cpu").float().item()
10909            if len(self._alpha_n) == n_layers:
10910                self.gguf_writer.add_xielu_alpha_n([self._alpha_n[k] for k in sorted(self._alpha_n)])
10911            return
10912        if name.endswith(".act_fn.alpha_p"):
10913            self._alpha_p[bid] = data_torch.to("cpu").float().item()
10914            if len(self._alpha_p) == n_layers:
10915                self.gguf_writer.add_xielu_alpha_p([self._alpha_p[k] for k in sorted(self._alpha_p)])
10916            return
10917        if name.endswith(".act_fn.beta"):
10918            self._beta[bid] = data_torch.to("cpu").float().item()
10919            if len(self._beta) == n_layers:
10920                self.gguf_writer.add_xielu_beta([self._beta[k] for k in sorted(self._beta)])
10921            return
10922        if name.endswith(".act_fn.eps"):
10923            self._eps[bid] = data_torch.to("cpu").float().item()
10924            if len(self._eps) == n_layers:
10925                self.gguf_writer.add_xielu_eps([self._eps[k] for k in sorted(self._eps)])
10926            return
10927
10928        yield from super().modify_tensors(data_torch, name, bid)
10929
10930
10931class MistralModel(LlamaModel):
10932    model_arch = gguf.MODEL_ARCH.MISTRAL3
10933    model_name = "Mistral"
10934    hf_arch = ""
10935    is_mistral_format = True
10936    undo_permute = False
10937
10938    def __init__(self, *args, **kwargs):
10939        super().__init__(*args, **kwargs)
10940        # for compatibility, we use LLAMA arch for older models
10941        # TODO: remove this once everyone migrates to newer version of llama.cpp
10942        if "llama_4_scaling" not in self.hparams:
10943            self.model_arch = gguf.MODEL_ARCH.LLAMA
10944            self.gguf_writer.arch = gguf.MODEL_ARCH_NAMES[self.model_arch]
10945            self.gguf_writer.add_architecture()
10946            self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)
10947
10948    def dequant_model(self):
10949        # transform quantization config into HF format
10950        quant_config = self.hparams.get("quantization")
10951        if quant_config is not None:
10952            assert quant_config["qformat_weight"] == "fp8_e4m3"
10953            self.hparams["quantization_config"] = {
10954                "activation_scheme": "static",
10955                "quant_method": "fp8",
10956                "weight_block_size": None,
10957            }
10958        return super().dequant_model()
10959
10960    @staticmethod
10961    def get_community_chat_template(vocab: MistralVocab, templates_dir: Path, is_mistral_format: bool):
10962        assert TokenizerVersion is not None and Tekkenizer is not None and SentencePieceTokenizer is not None, _mistral_import_error_msg
10963        assert isinstance(vocab.tokenizer, (Tekkenizer, SentencePieceTokenizer)), (
10964            f"Expected Tekkenizer or SentencePieceTokenizer, got {type(vocab.tokenizer)}"
10965        )
10966
10967        if vocab.tokenizer.version == TokenizerVersion.v1:
10968            return "mistral-v1"
10969        elif vocab.tokenizer.version == TokenizerVersion.v3 and vocab.tokenizer_type == MistralTokenizerType.spm:
10970            return "mistral-v3"
10971        elif vocab.tokenizer.version == TokenizerVersion.v3 and vocab.tokenizer_type == MistralTokenizerType.tekken:
10972            return "mistral-v3-tekken"
10973        elif vocab.tokenizer.version == TokenizerVersion.v7 and vocab.tokenizer_type == MistralTokenizerType.spm:
10974            return "mistral-v7"
10975        elif vocab.tokenizer.version == TokenizerVersion.v7 and vocab.tokenizer_type == MistralTokenizerType.tekken:
10976            return "mistral-v7-tekken"
10977        elif vocab.tokenizer.version == TokenizerVersion.v11:
10978            template_file = "Mistral-Small-3.2-24B-Instruct-2506.jinja"
10979        elif vocab.tokenizer.version == TokenizerVersion.v13:
10980            template_file = "unsloth-mistral-Devstral-Small-2507.jinja"
10981        else:
10982            err_message = f"Unknown tokenizer type: {vocab.tokenizer_type} and version {vocab.tokenizer.version}"
10983            if is_mistral_format:
10984                err_message += (
10985                    ". Please pass the --disable-mistral-community-chat-template argument to the CLI "
10986                    "if you want to skip this error and use the official Mistral `mistral-common` pre-processing library instead."
10987                )
10988            raise ValueError(err_message)
10989
10990        template_path = templates_dir / template_file
10991        if not template_path.exists():
10992            raise FileNotFoundError(f"Template file not found: {template_path}")
10993
10994        with open(template_path, "r", encoding="utf-8") as f:
10995            template = f.read()
10996
10997        return template
10998
10999    def set_gguf_parameters(self):
11000        super().set_gguf_parameters()
11001        MistralModel.set_mistral_config(self.gguf_writer, self.hparams)
11002
11003    @staticmethod
11004    def set_mistral_config(gguf_writer: gguf.GGUFWriter, hparams: dict):
11005        if "yarn" in hparams:
11006            yarn_params = hparams["yarn"]
11007            gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
11008            gguf_writer.add_rope_scaling_factor(yarn_params["factor"])
11009            gguf_writer.add_rope_scaling_yarn_beta_fast(yarn_params["beta"])
11010            gguf_writer.add_rope_scaling_yarn_beta_slow(yarn_params["alpha"])
11011            gguf_writer.add_rope_scaling_yarn_log_mul(1.0) # mscale_all_dim
11012            gguf_writer.add_rope_scaling_orig_ctx_len(yarn_params["original_max_position_embeddings"])
11013
11014        if "llama_4_scaling" in hparams:
11015            gguf_writer.add_attn_temperature_scale(hparams["llama_4_scaling"]["beta"])
11016
11017
11018class MistralMoeModel(DeepseekV2Model):
11019    model_arch = gguf.MODEL_ARCH.DEEPSEEK2
11020    model_name = "Mistral"
11021    hf_arch = ""
11022    is_mistral_format = True
11023
11024    def __init__(self, *args, **kwargs):
11025        super().__init__(*args, **kwargs)
11026        logger.info("Using MistralMoeModel")
11027        # remap hparams from Mistral MoE format to DeepseekV2 format
11028        # we do it this way to be able to reuse the DeepseekV2Model set_gguf_parameters logic
11029        # ref: https://github.com/vllm-project/vllm/blob/b294e28db2c5dee61bc25157664edcada8b90b31/vllm/transformers_utils/configs/mistral.py
11030        config = self.hparams
11031        # Mistral key -> HF key
11032        config_mapping = {
11033            "dim": "hidden_size",
11034            "norm_eps": "rms_norm_eps",
11035            "n_kv_heads": "num_key_value_heads",
11036            "n_layers": "num_hidden_layers",
11037            "n_heads": "num_attention_heads",
11038            "hidden_dim": "intermediate_size",
11039        }
11040        # HF key -> (Mistral key, default value)
11041        top_level_mapping_with_default = {
11042            "model_type": ("model_type", "transformer"),
11043            "hidden_act": ("activation", "silu"),
11044            "tie_word_embeddings": ("tied_embeddings", False),
11045            "max_seq_len": ("max_seq_len", config.get("max_position_embeddings", 128_000)),
11046            "max_position_embeddings": ("max_position_embeddings", 128_000),
11047        }
11048        # mapping top-level keys
11049        for key, new_key in config_mapping.items():
11050            if key in config:
11051                config[new_key] = config[key]
11052        for new_key, (key, default_value) in top_level_mapping_with_default.items():
11053            config[new_key] = config.get(key, default_value)
11054        # mapping MoE-specific keys
11055        moe_config_map = {
11056            "route_every_n": "moe_layer_freq",
11057            "first_k_dense_replace": "first_k_dense_replace",
11058            "num_experts_per_tok": "num_experts_per_tok",
11059            "num_experts": "n_routed_experts",
11060            "expert_hidden_dim": "moe_intermediate_size",
11061            "routed_scale": "routed_scaling_factor",
11062            "num_shared_experts": "n_shared_experts",
11063            "num_expert_groups": "n_group",
11064            "num_expert_groups_per_tok": "topk_group",
11065        }
11066        moe = config["moe"]
11067        for key, new_key in moe_config_map.items():
11068            if key in moe:
11069                config[new_key] = moe[key]
11070        # provide missing values
11071        config["topk_method"] = None
11072        config["norm_topk_prob"] = True
11073        config["scoring_func"] = "softmax"
11074
11075    def set_vocab(self):
11076        self._set_vocab_mistral()
11077
11078    def set_gguf_parameters(self):
11079        super().set_gguf_parameters()
11080        MistralModel.set_mistral_config(self.gguf_writer, self.hparams)
11081        yarn_params = self.hparams["yarn"]
11082        self.gguf_writer.add_attn_temperature_length(yarn_params["original_max_position_embeddings"])
11083
11084        # [TAG_DEEPSEEK2_YARN_LOG_MUL_FIX]
11085        # note: for legacy reasons, this is not consistent with the other usages of self.gguf_writer.add_rope_scaling_yarn_log_mul
11086        # ref https://github.com/ggml-org/llama.cpp/pull/17945
11087        self.gguf_writer.add_rope_scaling_yarn_log_mul(0.1) # mscale_all_dim * 0.1
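        # (MistralModel.set_mistral_config above stores the raw mscale_all_dim value of 1.0,
        #  whereas this override stores mscale_all_dim * 0.1 to stay compatible with the
        #  existing DEEPSEEK2 convention; see the PR referenced above)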
11088
11089    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None):
11090        if name.startswith("vision_") or name.startswith("patch_merger.") or "mm_projector" in name:
11091            return
11092
11093        # rename certain tensors so that we can reuse DeepseekV2Model modify_tensors logic
11094        if name.endswith(".qscale_act"):
11095            name = name.replace(".qscale_act", ".input_scale")
11096        if name.endswith(".qscale_weight"):
11097            name = name.replace(".qscale_weight", ".weight_scale")
11098        if ".wkv_b." in name:
11099            name = name.replace(".wkv_b.", ".kv_b_proj.")
11100        if ".experts." in name:
11101            name = name.replace(".experts.", ".mlp.experts.")
11102            name = name.replace(".w1.", ".gate_proj.")
11103            name = name.replace(".w2.", ".down_proj.")
11104            name = name.replace(".w3.", ".up_proj.")
11105            name = "model." + name
11106
11107        yield from super().modify_tensors(data_torch, name, bid)
11108
11109
11110class PixtralModel(LlavaVisionModel):
11111    model_name = "Pixtral"
11112    hf_arch = ""
11113    is_mistral_format = True
11114
11115    def set_gguf_parameters(self):
11116        super().set_gguf_parameters()
11117        self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.PIXTRAL)
11118
11119        self.gguf_writer.add_vision_attention_layernorm_eps(
11120            self.find_hparam(["norm_eps"])
11121        )
11122        self.gguf_writer.add_rope_freq_base(self.find_vparam(["rope_theta"]))
11123
11124        self.gguf_writer.add_vision_use_silu(True)
11125
11126        # spatial_merge_size
11127        if self.find_vparam(["mm_projector_id"]) == "patch_merge":
11128            self.gguf_writer.add_vision_spatial_merge_size(
11129                self.find_vparam(["spatial_merge_size"])
11130            )
11131
11132    def map_tensor_name(self, name: str, try_suffixes: Sequence[str] = (".weight", ".bias")) -> str:
11133        if name == "vision_language_adapter.w_in.weight":
11134            return "mm.1.weight"
11135        elif name == "vision_language_adapter.w_out.weight":
11136            return "mm.2.weight"
11137        return super().map_tensor_name(name, try_suffixes)
11138
11139
11140@ModelBase.register("LightOnOCRForConditionalGeneration")
11141class LightOnOCRVisionModel(LlavaVisionModel):
11142    is_mistral_format = False
11143    use_break_tok = False
11144
11145    def set_gguf_parameters(self):
11146        super().set_gguf_parameters()
11147        self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.LIGHTONOCR)
11148
11149    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None):
11150        name = name.replace("model.vision_encoder.", "vision_tower.")
11151        name = name.replace("model.vision_projection.", "multi_modal_projector.")
11152        yield from super().modify_tensors(data_torch, name, bid)
11153
11154
11155@ModelBase.register("KimiVLForConditionalGeneration")
11156class KimiVLModel(MmprojModel):
11157    def __init__(self, *args, **kwargs):
11158        super().__init__(*args, **kwargs)
11159        assert self.hparams_vision is not None
11160        self.hparams_vision["image_size"] = 64 * 14 # for compatibility
11161
11162    def set_gguf_parameters(self):
11163        super().set_gguf_parameters()
11164        self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.KIMIVL)
11165        self.gguf_writer.add_vision_use_gelu(True)
11166        self.gguf_writer.add_vision_projector_scale_factor(2)
11167        # eps is the same as pytorch's default value
11168        assert self.hparams_vision is not None
11169        self.gguf_writer.add_vision_attention_layernorm_eps(self.hparams_vision.get("layer_norm_eps", 1e-5))
11170
11171    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
11172        is_vision_tensor = "vision_tower" in name or "multi_modal_projector" in name
11173
11174        if is_vision_tensor:
11175            if "pos_emb.weight" in name:
11176                data_torch = data_torch.view(data_torch.shape[0] * data_torch.shape[1], data_torch.shape[2])
11177
11178            if "wqkv" in name:
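                # the fused tensor stacks Q, K and V along the output dimension;
                # chunk(3) splits weights along dim 0 and 1-D biases along their last dim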
11179                split_dim = 0 if "weight" in name else -1
11180                wq, wk, wv = data_torch.chunk(3, dim=split_dim)
11181                yield from super().modify_tensors(wq, name.replace("wqkv", "wq"), bid)
11182                yield from super().modify_tensors(wk, name.replace("wqkv", "wk"), bid)
11183                yield from super().modify_tensors(wv, name.replace("wqkv", "wv"), bid)
11184            else:
11185                yield from super().modify_tensors(data_torch, name, bid)
11186
11187
11188@ModelBase.register("KimiK25ForConditionalGeneration")
11189class KimiK25Model(MmprojModel):
11190    """Kimi-K2.5 with MoonViT3d vision encoder"""
11191
11192    def __init__(self, *args, **kwargs):
11193        super().__init__(*args, **kwargs)
11194
11195        assert self.hparams_vision is not None, "Kimi-K2.5 requires vision_config in model config"
11196
11197        self.merge_kernel_size = tuple(self.hparams_vision.get("merge_kernel_size", [2, 2]))
11198        self.patch_size = self.hparams_vision.get("patch_size", 14)
11199
11200        # Set image_size for compatibility with base class
11201        # Use position embedding dimensions as image_size reference
11202        pos_emb_h = self.hparams_vision.get("init_pos_emb_height", 64)
11203        self.hparams_vision["image_size"] = pos_emb_h * self.patch_size
11204
11205    def set_gguf_parameters(self):
11206        # Base class MmprojModel.set_gguf_parameters() already writes:
11207        # - vision_block_count, vision_head_count, vision_embedding_length
11208        # - vision_feed_forward_length, vision_patch_size, image_mean, image_std
11209        # via find_vparam() which handles the vt_* prefixed keys in Kimi-K2.5's config
11210        super().set_gguf_parameters()
11211        assert self.hparams_vision is not None
11212
11213        self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.KIMIK25)
11214
11215        # Position embedding parameters (for interpolation)
11216        self.gguf_writer.add_uint32("vision.pos_emb_height", self.hparams_vision.get("init_pos_emb_height", 64))
11217        self.gguf_writer.add_uint32("vision.pos_emb_width", self.hparams_vision.get("init_pos_emb_width", 64))
11218        self.gguf_writer.add_uint32("vision.pos_emb_time", self.hparams_vision.get("init_pos_emb_time", 4))
11219
11220        # Projector parameters
11221        self.gguf_writer.add_vision_use_gelu(self.hparams_vision.get("projector_hidden_act", "gelu") == "gelu")
11222        self.gguf_writer.add_vision_attention_layernorm_eps(self.hparams_vision.get("projector_ln_eps", 1e-5))
11223        self.gguf_writer.add_vision_projector_scale_factor(self.merge_kernel_size[0])
11224
11225        # Image size limits
11226        # Note: in_patch_limit is for images, in_patch_limit_each_frame is for video (not supported yet)
11227        in_patch_limit = self.preprocessor_config.get("in_patch_limit", 16384)
11228        min_patches = 8  # reasonable minimum
11229        pixels_per_patch = self.patch_size ** 2
11230        self.gguf_writer.add_vision_min_pixels(min_patches * pixels_per_patch)
11231        self.gguf_writer.add_vision_max_pixels(in_patch_limit * pixels_per_patch)
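        # with the defaults above (patch_size 14 -> 196 pixels per patch) this corresponds to
        # min_pixels = 8 * 196 = 1568 and max_pixels = 16384 * 196 = 3211264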
11232
11233    @staticmethod
11234    def permute(weights: Tensor, n_head: int) -> Tensor:
11235        out_dim, in_dim = weights.shape
11236        head_dim = out_dim // n_head
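        # within each head, the row at interleaved index 4*i + 2*a + b (a, b in {0, 1})
        # moves to index a * (head_dim // 2) + 2*i + b, so rotation pairs stay adjacent while
        # the two pairs of every 4-wide block are split across the two halves of the head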
11237        w = weights.reshape(n_head, head_dim // 4, 2, 2, in_dim)
11238        w = w.permute(0, 2, 1, 3, 4)
11239        return w.reshape(out_dim, in_dim)
11240
11241    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
11242        # Only process vision and projector tensors
11243        is_vision = any(x in name for x in ["vision_tower", "mm_projector"])
11244
11245        if not is_vision:
11246            return
11247
11248        assert self.hparams_vision is not None
11249        n_head = self.hparams_vision.get("num_attention_heads", 16)
11250
11251        # Permute Q/K weights/biases from interleaved to split RoPE format
11252        # This allows using build_rope_2d at runtime without post-permutation.
11253        if "wqkv" in name:
11254            out_dim = data_torch.shape[0]
11255            qkv_dim = out_dim // 3
11256            head_dim = qkv_dim // n_head
11257
11258            if "weight" in name:
11259                wq, wk, wv = data_torch[:qkv_dim, :], data_torch[qkv_dim:2 * qkv_dim, :], data_torch[2 * qkv_dim:, :]
11260                wq = self.permute(wq, n_head)
11261                wk = self.permute(wk, n_head)
11262                data_torch = torch.cat([wq, wk, wv], dim=0)
11263            elif "bias" in name:
11264                bq, bk, bv = data_torch[:qkv_dim], data_torch[qkv_dim:2 * qkv_dim], data_torch[2 * qkv_dim:]
11265                bq = bq.reshape(n_head, head_dim // 4, 2, 2).permute(0, 2, 1, 3).reshape(-1)
11266                bk = bk.reshape(n_head, head_dim // 4, 2, 2).permute(0, 2, 1, 3).reshape(-1)
11267                data_torch = torch.cat([bq, bk, bv], dim=0)
11268
11269        # Temporal embeddings: (T, 1, C) → (T, C)
11270        if "pos_emb.time_weight" in name:
11271            T, _, C = data_torch.shape
11272            data_torch = data_torch.reshape(T, C)
11273
11274        # PatchMergerMLP tensor name mapping
11275        # proj.0.weight → proj.linear_1.weight
11276        # proj.2.weight → proj.linear_2.weight
11277        if "mm_projector.proj.0." in name:
11278            name = name.replace(".proj.0.", ".proj.linear_1.")
11279        elif "mm_projector.proj.2." in name:
11280            name = name.replace(".proj.2.", ".proj.linear_2.")
11281
11282        yield from super().modify_tensors(data_torch, name, bid)
11283
11284
11285@ModelBase.register("CogVLMForCausalLM")
11286class CogVLMVisionModel(MmprojModel):
11287
11288    def set_gguf_parameters(self):
11289        super().set_gguf_parameters()
11290        self.gguf_writer.add_vision_attention_layernorm_eps(self.hparams.get("layer_norm_eps", 1e-6))
11291        self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.COGVLM)
11292
11293    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
11294        if not name.startswith("model.vision."):
11295            return
11296
11297        yield from super().modify_tensors(data_torch, name, bid)
11298
11299
11300@ModelBase.register("CogVLMForCausalLM")
11301class CogVLMModel(LlamaModel):
11302    model_arch = gguf.MODEL_ARCH.COGVLM
11303
11304    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
11305        # block vision tensors
11306        if name.startswith("model.vision."):
11307            return
11308
11309        yield from ModelBase.modify_tensors(self, data_torch, name, bid)
11310
11311
11312@ModelBase.register("JanusForConditionalGeneration")
11313class JanusProModel(LlamaModel):
11314    model_arch = gguf.MODEL_ARCH.LLAMA  # reuse Llama arch
11315
11316    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
11317        # Skip vision, aligner, and generation tensors
11318        skip_prefixes = (
11319            'model.vision_model.',
11320            'model.aligner.',
11321            'model.vqmodel.',
11322            'model.generation_embeddings.',
11323            'model.generation_aligner.',
11324            'model.generation_head.',
11325        )
11326        if name.startswith(skip_prefixes):
11327            return
11328
11329        if name.startswith('model.language_model.'):
11330            name = name.replace('model.language_model.', 'model.')
11331        elif name.startswith('language_model.'):
11332            name = name.replace('language_model.', '')
11333
11334        yield from super().modify_tensors(data_torch, name, bid)
11335
11336
11337@ModelBase.register("JanusForConditionalGeneration")
11338class JanusProVisionModel(MmprojModel):
11339    def __init__(self, *args, **kwargs):
11340        super().__init__(*args, **kwargs)
11341        assert self.hparams_vision is not None
11342        if "intermediate_size" not in self.hparams_vision:
11343            mlp_ratio = self.hparams_vision.get("mlp_ratio")
11344            hidden_size = self.hparams_vision.get("hidden_size")
11345            if mlp_ratio is not None and hidden_size is not None:
11346                self.hparams_vision["intermediate_size"] = int(round(hidden_size * mlp_ratio))
11347
11348    def set_gguf_parameters(self):
11349        super().set_gguf_parameters()
11350        assert self.hparams_vision is not None
11351
11352        self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.JANUS_PRO)
11353
11354        self.gguf_writer.add_vision_attention_layernorm_eps(self.hparams_vision.get("layer_norm_eps", 1e-6))
11355
11356        hidden_act = str(self.hparams_vision.get("hidden_act", "")).lower()
11357        if hidden_act == "gelu":
11358            self.gguf_writer.add_vision_use_gelu(True)
11359        elif hidden_act == "silu":
11360            self.gguf_writer.add_vision_use_silu(True)
11361
11362    def _map_aligner_tensor(self, data_torch: Tensor, name: str) -> Iterable[tuple[str, Tensor]]:
11363        """Map aligner tensors to projector format"""
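        # e.g. "aligner.fc1.weight" maps to projector index 0 and
        # "aligner.hidden_layers.0.weight" to projector index 1, and so on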
11364        suffix = ".bias" if name.endswith(".bias") else ".weight"
11365
11366        if name.startswith("model.aligner."):
11367            local_name = name[len("model.aligner."):]
11368        elif name.startswith("aligner."):
11369            local_name = name[len("aligner."):]
11370        else:
11371            raise ValueError(f"Unsupported Janus aligner prefix: {name}")
11372
11373        if local_name.startswith("fc1."):
11374            mm_index = 0
11375        elif local_name.startswith("hidden_layers."):
11376            parts = local_name.split(".", 2)
11377            if len(parts) < 3:
11378                raise ValueError(f"Unexpected Janus aligner tensor name: {name}")
11379            mm_index = int(parts[1]) + 1
11380        else:
11381            raise ValueError(f"Unsupported Janus aligner tensor: {name}")
11382
11383        tensor_name = self.format_tensor_name(gguf.MODEL_TENSOR.V_MMPROJ, mm_index, suffix=suffix)
11384        return [(tensor_name, data_torch)]
11385
11386    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
11387        # Skip language model tensors as they will be handled by `JanusProModel`
11388        if name.startswith(('model.language_model.', 'language_model.')):
11389            return
11390
11391        # Skip generation-related components
11392        skip_generation_prefixes = (
11393            'model.vqmodel.',
11394            'vqmodel.',
11395            'model.generation_embeddings.',
11396            'generation_embeddings.',
11397            'model.generation_aligner.',
11398            'generation_aligner.',
11399            'model.generation_head.',
11400            'generation_head.',
11401        )
11402        if name.startswith(skip_generation_prefixes):
11403            return
11404
11405        # Handle aligner tensors
11406        if name.startswith(('model.aligner.', 'aligner.')):
11407            yield from self._map_aligner_tensor(data_torch, name)
11408            return
11409
11410        # Handle vision tensors
11411        if name.startswith(('model.vision_model.', 'vision_model.')):
11412            yield from super().modify_tensors(data_torch, name, bid)
11413            return
11414
11415        return
11416
11417
11418@ModelBase.register("YoutuVLForConditionalGeneration")
11419class YoutuVLVisionModel(MmprojModel):
11420    def __init__(self, *args, **kwargs):
11421        super().__init__(*args, **kwargs)
11422        assert self.hparams_vision is not None
11423        self.hparams_vision["image_size"] = self.hparams_vision.get("image_size", 560)
11424
11425    def set_gguf_parameters(self):
11426        super().set_gguf_parameters()
11427
11428        self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.YOUTUVL)
11429        self.gguf_writer.add_vision_attention_layernorm_eps(self.hparams.get("layer_norm_eps", 1e-6))
11430
11431        # Handle activation function
11432        hidden_act = str(self.hparams.get("hidden_act", "gelu_pytorch_tanh")).lower()
11433        if hidden_act in ("gelu", "gelu_pytorch_tanh", "gelu_fast", "gelu_new", "gelu_accurate"):
11434            self.gguf_writer.add_vision_use_gelu(True)
11435        elif hidden_act == "silu":
11436            self.gguf_writer.add_vision_use_silu(True)
11437        else:
11438            raise ValueError(f"Unsupported activation function for YOUTUVL: {hidden_act}")
11439
11440        self.gguf_writer.add_vision_spatial_merge_size(self.hparams.get("spatial_merge_size", 2))
11441
11442        window_size = self.hparams.get("window_size")
11443        if window_size is not None:
11444            self.gguf_writer.add_vision_window_size(window_size)
11445        # fullatt_block_indexes contains explicit layer indices that use full attention
11446        # e.g., [2, 5, 8, 11] means layers 2, 5, 8, 11 use full attention
11447        # All other layers use window attention
11448        fullatt_block_indexes = self.hparams.get("fullatt_block_indexes")
11449        assert fullatt_block_indexes is not None, "fullatt_block_indexes is required for youtuvl"
11450        # Store the explicit layer indices for YoutuVL (irregular pattern approach)
11451        self.gguf_writer.add_vision_wa_layer_indexes(layers=fullatt_block_indexes)
11452
11453    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
11454        # Skip language model tensors
11455        skip_prefixes = ('lm_head.', 'model.layers.', 'model.embed_tokens.', 'model.norm.')
11456        if name.startswith(skip_prefixes):
11457            return
11458
11459        # Try to map the tensor using TensorNameMap (handles vision encoder and projector)
11460        try:
11461            yield from super().modify_tensors(data_torch, name, bid)
11462        except ValueError:
11463            # If mapping fails, log warning and skip
11464            logger.warning(f"Cannot map tensor: {name}")
11465            return
11466
11467
11468@ModelBase.register("SolarOpenForCausalLM")
11469class SolarOpenModel(Glm4MoeModel):
11470    model_arch = gguf.MODEL_ARCH.GLM4_MOE
11471
11472    def set_vocab(self):
11473        from transformers import AutoTokenizer
11474        tokenizer = AutoTokenizer.from_pretrained(self.dir_model)
11475        special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True)
11476        tokens, toktypes, tokpre = self.get_vocab_base()
11477        self.gguf_writer.add_tokenizer_model("gpt2")
11478        self.gguf_writer.add_tokenizer_pre(tokpre)
11479        self.gguf_writer.add_token_list(tokens)
11480        self.gguf_writer.add_token_types(toktypes)
11481        special_vocab._set_special_token("eos", tokenizer.get_added_vocab()["<|endoftext|>"])
11482        special_vocab._set_special_token("eot", tokenizer.get_added_vocab()["<|endoftext|>"])
11483        special_vocab._set_special_token("unk", tokenizer.get_added_vocab()["<unk>"])
11484        special_vocab._set_special_token("bos", tokenizer.get_added_vocab()["<|startoftext|>"])
11485        special_vocab.add_to_gguf(self.gguf_writer)
11486
11487
11488###### CONVERSION LOGIC ######
11489
11490
11491# tree of lazy tensors
11492class LazyTorchTensor(gguf.LazyBase):
11493    _tensor_type = torch.Tensor
11494    # to keep the type-checker happy
11495    dtype: torch.dtype
11496    shape: torch.Size
11497
11498    # only used when converting a torch.Tensor to a np.ndarray
11499    _dtype_map: dict[torch.dtype, type] = {
11500        torch.float16: np.float16,
11501        torch.float32: np.float32,
11502        torch.uint8: np.uint8,
11503    }
11504
11505    # only used when byteswapping data; only the correct element size is needed
11506    _dtype_byteswap_map: dict[torch.dtype, type] = {
11507        torch.float64: np.float64,
11508        torch.float32: np.float32,
11509        torch.bfloat16: np.float16,
11510        torch.float16: np.float16,
11511        torch.int64: np.int64,
11512        torch.uint64: np.uint64,
11513        torch.int32: np.int32,
11514        torch.uint32: np.uint32,
11515        torch.int16: np.int16,
11516        torch.uint16: np.uint16,
11517        torch.int8: np.int8,
11518        torch.uint8: np.uint8,
11519        torch.bool: np.uint8,
11520        torch.float8_e4m3fn: np.uint8,
11521        torch.float8_e5m2: np.uint8,
11522    }
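    # (e.g. torch.bfloat16 has no numpy counterpart, but np.float16 has the same 2-byte
    #  element width, which is all the byteswapping below cares about)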
11523
11524    # used for safetensors slices
11525    # ref: https://github.com/huggingface/safetensors/blob/079781fd0dc455ba0fe851e2b4507c33d0c0d407/bindings/python/src/lib.rs#L1046
11526    # TODO: uncomment U64, U32, and U16, ref: https://github.com/pytorch/pytorch/issues/58734
11527    _dtype_str_map: dict[str, torch.dtype] = {
11528        "F64": torch.float64,
11529        "F32": torch.float32,
11530        "BF16": torch.bfloat16,
11531        "F16": torch.float16,
11532        # "U64": torch.uint64,
11533        "I64": torch.int64,
11534        # "U32": torch.uint32,
11535        "I32": torch.int32,
11536        # "U16": torch.uint16,
11537        "I16": torch.int16,
11538        "U8": torch.uint8,
11539        "I8": torch.int8,
11540        "BOOL": torch.bool,
11541        "F8_E4M3": torch.float8_e4m3fn,
11542        "F8_E5M2": torch.float8_e5m2,
11543    }
11544
11545    def numpy(self) -> gguf.LazyNumpyTensor:
11546        dtype = self._dtype_map[self.dtype]
11547        return gguf.LazyNumpyTensor(
11548            meta=gguf.LazyNumpyTensor.meta_with_dtype_and_shape(dtype, self.shape),
11549            args=(self,),
11550            func=(lambda s: s.numpy())
11551        )
11552
11553    @classmethod
11554    def meta_with_dtype_and_shape(cls, dtype: torch.dtype, shape: tuple[int, ...]) -> Tensor:
11555        return torch.empty(size=shape, dtype=dtype, device="meta")
11556
11557    @classmethod
11558    def from_safetensors_slice(cls, st_slice: Any) -> Tensor:
11559        dtype = cls._dtype_str_map[st_slice.get_dtype()]
11560        shape: tuple[int, ...] = tuple(st_slice.get_shape())
11561        lazy = cls(meta=cls.meta_with_dtype_and_shape(dtype, shape), args=(st_slice,), func=lambda s: s[...] if len(s.get_shape()) == 0 else s[:])
11562        return cast(torch.Tensor, lazy)
11563
11564    @classmethod
11565    def from_local_tensor(cls, t: gguf.utility.LocalTensor) -> Tensor:
11566        def load_tensor(tensor: gguf.utility.LocalTensor) -> Tensor:
11567            def byteswap_tensor(tensor: np.ndarray, dtype: type) -> np.ndarray:
11568                if sys.byteorder == 'big':
11569                    # switch data back to big endian
11570                    tensor = tensor.view(dtype).byteswap(inplace=False)
11571                return tensor
11572            dtype = cls._dtype_str_map[tensor.dtype]
11573            numpy_dtype = cls._dtype_byteswap_map[dtype]
11574            return torch.from_numpy(byteswap_tensor(tensor.mmap_bytes(), numpy_dtype)).view(dtype).reshape(tensor.shape)
11575        dtype = cls._dtype_str_map[t.dtype]
11576        shape = t.shape
11577        lazy = cls(meta=cls.meta_with_dtype_and_shape(dtype, shape), args=(t,), func=lambda r: load_tensor(r))
11578        return cast(torch.Tensor, lazy)
11579
11580    @classmethod
11581    def from_remote_tensor(cls, remote_tensor: gguf.utility.RemoteTensor):
11582        def byteswap_tensor(tensor: np.ndarray, dtype: type) -> np.ndarray:
11583            if sys.byteorder == 'big':
11584                # switch data back to big endian
11585                tensor = tensor.view(dtype).byteswap(inplace=False)
11586            return tensor
11587        dtype = cls._dtype_str_map[remote_tensor.dtype]
11588        numpy_dtype = cls._dtype_byteswap_map[dtype]
11589        shape = remote_tensor.shape
11590        meta = cls.meta_with_dtype_and_shape(dtype, shape)
11591        lazy = cls(meta=meta, args=(remote_tensor,), func=lambda r: torch.from_numpy(byteswap_tensor(np.frombuffer(r.data(), dtype=numpy_dtype), numpy_dtype)).view(dtype).reshape(shape))
11592        return cast(torch.Tensor, lazy)
11593
11594    @classmethod
11595    def __torch_function__(cls, func, types, args=(), kwargs=None):
11596        del types  # unused
11597
11598        if kwargs is None:
11599            kwargs = {}
11600
11601        if func is torch.Tensor.numpy:
11602            return args[0].numpy()
11603
11604        return cls._wrap_fn(func)(*args, **kwargs)
11605
11606
11607def parse_args() -> argparse.Namespace:
11608    parser = argparse.ArgumentParser(
11609        description="Convert a huggingface model to a GGML compatible file")
11610    parser.add_argument(
11611        "--vocab-only", action="store_true",
11612        help="extract only the vocab",
11613    )
11614    parser.add_argument(
11615        "--outfile", type=Path,
11616        help="path to write to; default: based on input. {ftype} will be replaced by the outtype.",
11617    )
11618    parser.add_argument(
11619        "--outtype", type=str, choices=["f32", "f16", "bf16", "q8_0", "tq1_0", "tq2_0", "auto"], default="auto",
11620        help="output format - use f32 for float32, f16 for float16, bf16 for bfloat16, q8_0 for Q8_0, tq1_0 or tq2_0 for ternary, and auto for the highest-fidelity 16-bit float type",
11621    )
11622    parser.add_argument(
11623        "--bigendian", action="store_true",
11624        help="model is executed on a big-endian machine",
11625    )
11626    parser.add_argument(
11627        "model", type=str,
11628        help="directory containing model file or huggingface repository ID (if --remote)",
11629        nargs="?",
11630    )
11631    parser.add_argument(
11632        "--use-temp-file", action="store_true",
11633        help="use the tempfile library while processing (helpful if the process runs out of memory and gets killed)",
11634    )
11635    parser.add_argument(
11636        "--no-lazy", action="store_true",
11637        help="use more RAM by computing all outputs before writing (use in case lazy evaluation is broken)",
11638    )
11639    parser.add_argument(
11640        "--model-name", type=str, default=None,
11641        help="name of the model",
11642    )
11643    parser.add_argument(
11644        "--verbose", action="store_true",
11645        help="increase output verbosity",
11646    )
11647    parser.add_argument(
11648        "--split-max-tensors", type=int, default=0,
11649        help="max tensors in each split",
11650    )
11651    parser.add_argument(
11652        "--split-max-size", type=str, default="0",
11653        help="max size per split N(K|M|G)",
11654    )
11655    parser.add_argument(
11656        "--dry-run", action="store_true",
11657        help="only print out a split plan and exit, without writing any new files",
11658    )
11659    parser.add_argument(
11660        "--no-tensor-first-split", action="store_true",
11661        help="do not add tensors to the first split (disabled by default)"
11662    )
11663    parser.add_argument(
11664        "--metadata", type=Path,
11665        help="Specify the path for an authorship metadata override file"
11666    )
11667    parser.add_argument(
11668        "--print-supported-models", action="store_true",
11669        help="Print the supported models"
11670    )
11671    parser.add_argument(
11672        "--remote", action="store_true",
11673        help="(Experimental) Read safetensors files remotely without downloading them to disk. Config and tokenizer files will still be downloaded. To use this feature, specify a Hugging Face model repo name instead of a local directory, for example: 'HuggingFaceTB/SmolLM2-1.7B-Instruct'. Note: to access a gated repo, set the HF_TOKEN environment variable to your Hugging Face token.",
11674    )
11675    parser.add_argument(
11676        "--mmproj", action="store_true",
11677        help="(Experimental) Export multimodal projector (mmproj) for vision models. This will only work on some vision models. A prefix 'mmproj-' will be added to the output file name.",
11678    )
11679    parser.add_argument(
11680        "--mistral-format", action="store_true",
11681        help="Whether the model is stored following the Mistral format.",
11682    )
11683    parser.add_argument(
11684        "--disable-mistral-community-chat-template", action="store_true",
11685        help=(
11686            "Whether to disable usage of Mistral community chat templates. If set, the official Mistral `mistral-common` library is used for tokenization and detokenization of Mistral models. "
11687            "Using `mistral-common` ensures correctness and zero-day tokenization support for models converted from the Mistral format, but requires manually setting up a tokenization server."
11688        )
11689    )
11690
11691    parser.add_argument(
11692        "--sentence-transformers-dense-modules", action="store_true",
11693        help=("Whether to include sentence-transformers dense modules. "
11694              "This can be used for sentence-transformers models such as google/embeddinggemma-300m. "
11695              "By default these modules are not included.")
11696    )
11697
11698    args = parser.parse_args()
11699    if not args.print_supported_models and args.model is None:
11700        parser.error("the following arguments are required: model")
11701    return args
11702
11703
11704def split_str_to_n_bytes(split_str: str) -> int:
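    # e.g. "300M" -> 300_000_000 bytes and "2G" -> 2_000_000_000 bytes (decimal multipliers, not KiB/MiB/GiB)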
11705    if split_str.endswith("K"):
11706        n = int(split_str[:-1]) * 1000
11707    elif split_str.endswith("M"):
11708        n = int(split_str[:-1]) * 1000 * 1000
11709    elif split_str.endswith("G"):
11710        n = int(split_str[:-1]) * 1000 * 1000 * 1000
11711    elif split_str.isnumeric():
11712        n = int(split_str)
11713    else:
11714        raise ValueError(f"Invalid split size: {split_str}, must be a number, optionally followed by K, M, or G")
11715
11716    if n < 0:
11717        raise ValueError(f"Invalid split size: {split_str}, must not be negative")
11718
11719    return n
11720
11721
11722def get_model_architecture(hparams: dict[str, Any], model_type: ModelType) -> str:
11723    # TODO @ngxson : this won't work correctly if the model has both audio & vision encoders
11724    # maybe we should fallback to text model's arch in that case, since not many models have both
11725    text_config = hparams.get("text_config", {})
11726    vision_config = hparams.get("vision_config", {})
11727    arch = None
11728    if (arches := hparams.get("architectures")) is not None and len(arches) > 0:
11729        arch = arches[0]
11730    elif "ssm_cfg" in hparams:
11731        # For non-hf Mamba and Mamba2 models
11732        arch = hparams["ssm_cfg"].get("layer", "Mamba") + "ForCausalLM"
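        # e.g. {"ssm_cfg": {"layer": "Mamba2"}} resolves to the "Mamba2ForCausalLM" architecture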
11733
11734    # if "architectures" is found in the sub-config, use that instead
11735    if model_type == ModelType.TEXT and text_config.get("architectures") is not None:
11736        arch = text_config["architectures"][0]
11737    elif model_type == ModelType.MMPROJ and vision_config.get("architectures") is not None:
11738        arch = vision_config["architectures"][0]
11739    if arch is None:
11740        raise ValueError("Failed to detect model architecture")
11741    return arch
11742
11743
11744def main() -> None:
11745    args = parse_args()
11746
11747    if args.print_supported_models:
11748        logger.error("Supported models:")
11749        ModelBase.print_registered_models()
11750        sys.exit(0)
11751
11752    if args.verbose:
11753        logging.basicConfig(level=logging.DEBUG)
11754    else:
11755        logging.basicConfig(level=logging.INFO)
11756
11757    if args.remote:
11758        hf_repo_id = args.model
11759        from huggingface_hub import snapshot_download
11760        allowed_patterns = ["LICENSE", "*.json", "*.md", "*.txt", "tokenizer.model"]
11761        if args.sentence_transformers_dense_modules:
11762            # include sentence-transformers dense modules safetensors files
11763            allowed_patterns.append("*.safetensors")
11764        local_dir = snapshot_download(
11765            repo_id=hf_repo_id,
11766            allow_patterns=allowed_patterns)
11767        dir_model = Path(local_dir)
11768        logger.info(f"Downloaded config and tokenizer to {local_dir}")
11769    else:
11770        hf_repo_id = None
11771        dir_model = Path(args.model)
11772
11773    if not dir_model.is_dir():
11774        logger.error(f'Error: {dir_model} is not a directory')
11775        sys.exit(1)
11776
11777    ftype_map: dict[str, gguf.LlamaFileType] = {
11778        "f32": gguf.LlamaFileType.ALL_F32,
11779        "f16": gguf.LlamaFileType.MOSTLY_F16,
11780        "bf16": gguf.LlamaFileType.MOSTLY_BF16,
11781        "q8_0": gguf.LlamaFileType.MOSTLY_Q8_0,
11782        "tq1_0": gguf.LlamaFileType.MOSTLY_TQ1_0,
11783        "tq2_0": gguf.LlamaFileType.MOSTLY_TQ2_0,
11784        "auto": gguf.LlamaFileType.GUESSED,
11785    }
11786
11787    is_split = args.split_max_tensors > 0 or args.split_max_size != "0"
11788    if args.use_temp_file and is_split:
11789        logger.error("Error: Cannot use temp file when splitting")
11790        sys.exit(1)
11791
11792    if args.outfile is not None:
11793        fname_out = args.outfile
11794    elif hf_repo_id:
11795        # if remote, use the model ID as the output file name
11796        fname_out = Path("./" + hf_repo_id.replace("/", "-") + "-{ftype}.gguf")
11797    else:
11798        fname_out = dir_model
11799
11800    logger.info(f"Loading model: {dir_model.name}")
11801
11802    is_mistral_format = args.mistral_format
11803    if is_mistral_format and not _mistral_common_installed:
11804        raise ImportError(_mistral_import_error_msg)
11805    disable_mistral_community_chat_template = args.disable_mistral_community_chat_template
11806
11807    with torch.inference_mode():
11808        output_type = ftype_map[args.outtype]
11809        model_type = ModelType.MMPROJ if args.mmproj else ModelType.TEXT
11810        hparams = ModelBase.load_hparams(dir_model, is_mistral_format)
11811        if not is_mistral_format:
11812            model_architecture = get_model_architecture(hparams, model_type)
11813            logger.info(f"Model architecture: {model_architecture}")
11814            try:
11815                model_class = ModelBase.from_model_architecture(model_architecture, model_type=model_type)
11816            except NotImplementedError:
11817                logger.error(f"Model {model_architecture} is not supported")
11818                sys.exit(1)
11819        elif args.mmproj:
11820            assert hparams.get("vision_encoder") is not None, "This model does not support multimodal"
11821            model_class = PixtralModel
11822        elif "moe" in hparams:
11823            model_class = MistralMoeModel
11824        else:
11825            model_class = MistralModel
11826
11827        model_instance = model_class(dir_model, output_type, fname_out,
11828                                     is_big_endian=args.bigendian, use_temp_file=args.use_temp_file,
11829                                     eager=args.no_lazy,
11830                                     metadata_override=args.metadata, model_name=args.model_name,
11831                                     split_max_tensors=args.split_max_tensors,
11832                                     split_max_size=split_str_to_n_bytes(args.split_max_size), dry_run=args.dry_run,
11833                                     small_first_shard=args.no_tensor_first_split,
11834                                     remote_hf_model_id=hf_repo_id, disable_mistral_community_chat_template=disable_mistral_community_chat_template,
11835                                     sentence_transformers_dense_modules=args.sentence_transformers_dense_modules
11836                                     )
11837
11838        if args.vocab_only:
11839            logger.info("Exporting model vocab...")
11840            model_instance.write_vocab()
11841            logger.info(f"Model vocab successfully exported to {model_instance.fname_out}")
11842        else:
11843            logger.info("Exporting model...")
11844            model_instance.write()
11845            out_path = f"{model_instance.fname_out.parent}{os.sep}" if is_split else model_instance.fname_out
11846            logger.info(f"Model successfully exported to {out_path}")
11847
11848
11849if __name__ == '__main__':
11850    main()